diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6dd5e1e1..29be9eda 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,8 +16,11 @@ endif()
 
 if(DEFINED ENV{D_PKG_SERVER})
     set(GE_PB_PKG $ENV{D_PKG_SERVER})
-    message("Download packages from PKG server")
-endif()
+    message("Download packages from DPKG server")
+elseif(DEFINED ENV{MSLIBS_SERVER})
+    set(GE_PB_PKG "http://$ENV{MSLIBS_SERVER}:8081")
+    message("Download packages from MSPKG server")
+endif ()
 
 set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
 set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
@@ -37,7 +40,7 @@ set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR}
 option(ENABLE_OPEN_SRC "Enable graphengine compile in opensource." FALSE)
 
 if (ENABLE_OPEN_SRC)
-    set(HI_PYTHON python3.7)
+    set(HI_PYTHON python3)
 
     include(cmake/external_libs/protobuf_shared.cmake)
     include(cmake/external_libs/protobuf_static.cmake)
@@ -49,10 +52,6 @@ if (ENABLE_OPEN_SRC)
     include(cmake/FindModule.cmake)
     include(cmake/intf_pub_linux.cmake)
 
-    # for CPU/GPU mode, find c_sec and slog from local prebuild
-    #if(NOT ENABLE_D AND NOT GE_ONLY)
-    #    set(GE_PREBUILD_PATH ${GE_CODE_DIR}/third_party/prebuild/${CMAKE_HOST_SYSTEM_PROCESSOR})
-    #    find_module(slog libslog.so ${GE_PREBUILD_PATH})
     # if D_LINK_PATH is set in environment variables, search libraries in given path
     if(DEFINED ENV{D_LINK_PATH})
         # D_LINK_PATH is set
@@ -69,9 +68,9 @@ if (ENABLE_OPEN_SRC)
         endif()
         set(GE_LIB_PATH ${GE_LIB_PATH}/${GE_SYS_ARCH})
         set(STATIC_ACL_LIB ${GE_LIB_PATH})
-        find_module(slog libslog.so ${GE_LIB_PATH})
+        find_module(slog libalog.so ${GE_LIB_PATH})
         find_module(static_mmpa libmmpa.a ${GE_LIB_PATH})
-        find_module(msprof libmsprof.so ${GE_LIB_PATH})
+        find_module(msprofiler_ext libmsprofiler.a ${GE_LIB_PATH})
         find_module(hccl libhccl.so ${GE_LIB_PATH})
         find_module(adump_server libadump_server.a ${GE_LIB_PATH})
         find_module(runtime libruntime.so ${GE_LIB_PATH})
@@ -80,20 +79,21 @@ if (ENABLE_OPEN_SRC)
         find_module(error_manager liberror_manager.so ${GE_LIB_PATH})
         find_module(ascend_hal_stub libascend_hal.so ${GE_LIB_PATH})
         find_module(error_manager_static liberror_manager.a ${GE_LIB_PATH})
-        find_module(msprofiler libmsprofiler.a ${GE_LIB_PATH})
+        find_module(msprofiler_fwk_ext libmsprofiler_fwk.a ${GE_LIB_PATH})
         #find_module(ascendcl_static libascendcl.a ${GE_LIB_PATH})
+    elseif(ENABLE_GE_COV OR ENABLE_GE_UT)
+	add_subdirectory(tests)
     else()
         find_module(slog libslog.so ${ASCEND_ATC_DIR} ${ASCEND_DRIVER_COMMON_DIR})
         find_module(static_mmpa libmmpa.a ${ASCEND_ATC_DIR} ${ASCEND_RUNTIME_DIR})
         find_module(error_manager liberror_manager.so ${ASCEND_ATC_DIR} ${ASCEND_RUNTIME_DIR})
         if(PLATFORM STREQUAL "train")
-            find_module(msprof libmsprof.so ${ASCEND_DRIVER_COMMON_DIR})
             find_module(hccl libhccl.so ${ASCEND_RUNTIME_DIR})
             find_module(adump_server libadump_server.a ${ASCEND_RUNTIME_DIR})
             find_module(runtime libruntime.so ${ASCEND_RUNTIME_DIR})
             find_module(resource libresource.so ${ASCEND_RUNTIME_DIR})
             find_module(error_manager liberror_manager.so ${ASCEND_RUNTIME_DIR})
-            find_module(msprofiler libmsprofiler.a ${ASCEND_RUNTIME_DIR})
+            find_module(msprofiler_fwk_ext libmsprofiler_fwk.a ${ASCEND_RUNTIME_DIR})
             find_module(ascend_hal_stub libascend_hal.so ${ASCEND_DRIVER_DIR}/driver)
             if(PRODUCT STREQUAL "flr3")
                 message(FATAL_ERROR "This platform is not supported in train mode, build terminated")
@@ -105,21 +105,18 @@ if (ENABLE_OPEN_SRC)
             find_module(resource libresource.so ${ASCEND_ATC_DIR})
             find_module(error_manager liberror_manager.so ${ASCEND_ATC_DIR})
             find_module(error_manager_static liberror_manager.a ${ASCEND_ACL_DIR})
-            find_module(msprofiler libmsprofiler.a ${ASCEND_ACL_DIR})
-	        #find_module(ascendcl_static libascendcl.a ${ASCEND_ACL_DIR})
+            find_module(msprofiler_ext libmsprofiler.a ${ASCEND_ACL_DIR})
+            #find_module(ascendcl_static libascendcl.a ${ASCEND_ACL_DIR})
             if(PRODUCT STREQUAL "flr3")
-                find_module(msprof libmsprof.so ${ASCEND_DRIVER_SHARE_DIR})
             elseif(PRODUCT STREQUAL "flr1")
                 find_module(ascend_hal_stub libascend_hal.so ${ASCEND_DRIVER_DIR}/driver)
-                find_module(msprof libmsprof.so ${ASCEND_DRIVER_COMMON_DIR})
             elseif(PRODUCT STREQUAL "flr2")
                 # flr2 ascend_hal_stub limsprof ?
             else()
                 find_module(ascend_hal_stub libascend_hal.so ${ASCEND_DRIVER_DIR})
-                find_module(msprof libmsprof.so ${ASCEND_DRIVER_DIR})
             endif()
         elseif(PLATFORM STREQUAL "all")
-            find_module(msprof libmsprof.so ${ASCEND_DRIVER_COMMON_DIR})
+            find_module(msprofiler_ext libmsprofiler.a ${ASCEND_ACL_DIR})
             find_module(hccl libhccl.so ${ASCEND_RUNTIME_DIR})
             find_module(adump_server libadump_server.a ${ASCEND_ACL_DIR})
             find_module(runtime libruntime.so ${ASCEND_ACL_DIR})
@@ -127,17 +124,12 @@ if (ENABLE_OPEN_SRC)
             find_module(resource libresource.so ${ASCEND_ATC_DIR})
             find_module(error_manager liberror_manager.so ${ASCEND_ATC_DIR})
             find_module(error_manager_static liberror_manager.a ${ASCEND_ACL_DIR})
-            find_module(msprofiler libmsprofiler.a ${ASCEND_ACL_DIR})
+            find_module(msprofiler_fwk_ext libmsprofiler_fwk.a ${ASCEND_RUNTIME_DIR})
             find_module(ascend_hal_stub libascend_hal.so ${ASCEND_DRIVER_DIR}/driver)
             #find_module(ascendcl_static libascendcl.a ${ASCEND_ACL_DIR})
         else()
-	    message(STATUS "PLATFORM param is invalid, should be train or inference, you choose nothing!")
+            message(STATUS "PLATFORM param is invalid, should be train or inference, you choose nothing!")
         endif()
-
-	if (ENABLE_GE_COV OR ENABLE_GE_UT)
-            add_subdirectory(tests)
-        endif()
-
     endif()
 
     set(METADEF_DIR ${CMAKE_CURRENT_LIST_DIR}/metadef)
@@ -158,7 +150,7 @@ elseif (ENABLE_D OR ENABLE_ACL)
     include(cmake/intf_pub_linux.cmake)
 
     # common libraries
-    find_module(slog libslog.so ${ASCEND_MS_DRIVER_PATH})
+    find_module(slog libalog.so ${ASCEND_MS_RUNTIME_PATH} ${ATLAS_MS_RUNTIME_PATH})
     find_module(error_manager liberror_manager.so ${ASCEND_MS_RUNTIME_PATH} ${ATLAS_MS_RUNTIME_PATH})
     find_module(static_mmpa libmmpa.a ${ASCEND_MS_RUNTIME_PATH} ${ATLAS_MS_RUNTIME_PATH})
 
@@ -178,7 +170,7 @@ elseif(ENABLE_MS_TESTCASES)
     include(cmake/intf_pub_linux.cmake)
 
     # common libraries
-    find_module(slog libslog.so ${ASCEND_MS_DRIVER_PATH})
+    find_module(slog libalog.so ${ASCEND_MS_RUNTIME_PATH} ${ATLAS_MS_RUNTIME_PATH})
     find_module(error_manager liberror_manager.so ${ASCEND_MS_RUNTIME_PATH} ${ATLAS_MS_RUNTIME_PATH})
     find_module(static_mmpa libmmpa.a ${ASCEND_MS_RUNTIME_PATH} ${ATLAS_MS_RUNTIME_PATH})
 
diff --git a/Third_Party_Open_Source_Software_Notice b/Third_Party_Open_Source_Software_Notice
index 0d79cfa5..ba8da1fb 100644
--- a/Third_Party_Open_Source_Software_Notice
+++ b/Third_Party_Open_Source_Software_Notice
@@ -458,3 +458,76 @@ Copyright (c) Facebook Inc. and Microsoft Corporation.
 
 License: MIT License
 Please see above.
+
+
+
+Software: caffe 1.0
+
+License: BSD 2-Clause License
+
+Open Source Software Licensed Under the BSD 2-Clause License
+
+GraphEngine uses source code files from caffe so as to support model format conversion from caffe model to GraphEngine model.
+Please see below for the full list of source code files from caffe that are used by GraphEngine.
+The below software in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications"). All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+----------------------------------------------------------------------------------------
+1. caffe.proto  master
+All contributions by the University of California:
+Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+All rights reserved.
+
+
+Terms of the BSD 2-Clause License:
+--------------------------------------------------------------------
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+
+Software: tensorflow 1.15.0
+
+License: Apache-2.0 License
+
+Open Source Software Licensed Under the Apache-2.0 License
+
+
+GraphEngine uses source code files from tensorflow so as to support model format conversion from tensorflow model to GraphEngine model.
+Please see below for the full list of source code files from tensorflow that are used by GraphEngine.
+The below software in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications"). All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+----------------------------------------------------------------------------------------
+1. attr_value.proto  master
+Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+2. function.proto  master
+Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+3. graph.proto  master
+Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+4. node_def.proto  master
+Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+5. op_def.proto  master
+Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+6. resource_handle.proto  master
+Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+7. tensor.proto  master
+Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+8. tensor_shape.proto  master
+Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+9. types.proto  master
+Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+10. versions.proto  master
+Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Terms of the Apache-2.0 License:
+Please see above.
diff --git a/build.sh b/build.sh
index 3c9a537e..5222ab5c 100644
--- a/build.sh
+++ b/build.sh
@@ -23,7 +23,7 @@ export BUILD_PATH="${BASEPATH}/build/"
 usage()
 {
   echo "Usage:"
-  echo "sh build.sh [-j[n]] [-h] [-v] [-s] [-t] [-u] [-c] [-S on|off]"
+  echo "sh build.sh [-j[n]] [-h] [-v] [-s] [-t] [-u] [-c] [-S on|off] [-M]"
   echo ""
   echo "Options:"
   echo "    -h Print usage"
@@ -35,6 +35,7 @@ usage()
   echo "    -p Build inference or train"
   echo "    -v Display build command"
   echo "    -S Enable enable download cmake compile dependency from gitee , default off"
+  echo "    -M build MindSpore mode"
   echo "to be continued ..."
 }
 
@@ -58,30 +59,27 @@ checkopts()
   ENABLE_GE_UT="off"
   ENABLE_GE_ST="off"
   ENABLE_GE_COV="off"
-  GE_ONLY="on"
   PLATFORM=""
   PRODUCT="normal"
   ENABLE_GITEE="off"
+  MINDSPORE_MODE="off"
   # Process the options
-  while getopts 'ustchj:p:g:vS:' opt
+  while getopts 'ustchj:p:g:vS:M' opt
   do
     OPTARG=$(echo ${OPTARG} | tr '[A-Z]' '[a-z]')
     case "${opt}" in
       u)
         # ENABLE_GE_UT_ONLY_COMPILE="on"
         ENABLE_GE_UT="on"
-        GE_ONLY="off"
         ;;
       s)
         ENABLE_GE_ST="on"
         ;;
       t)
 	      ENABLE_GE_UT="on"
-	      GE_ONLY="off"
 	      ;;
       c)
         ENABLE_GE_COV="on"
-        GE_ONLY="off"
         ;;
       h)
         usage
@@ -104,6 +102,9 @@ checkopts()
         ENABLE_GITEE="$OPTARG"
         echo "enable download from gitee"
         ;;
+      M)
+        MINDSPORE_MODE="on"
+        ;;
       *)
         echo "Undefined option: ${opt}"
         usage
@@ -132,7 +133,8 @@ build_graphengine()
   echo "create build directory and build GraphEngine";
   mk_dir "${BUILD_PATH}"
   cd "${BUILD_PATH}"
-  CMAKE_ARGS="-DBUILD_PATH=$BUILD_PATH -DGE_ONLY=$GE_ONLY"
+
+  CMAKE_ARGS="-DBUILD_PATH=$BUILD_PATH"
 
   if [[ "X$ENABLE_GE_COV" = "Xon" ]]; then
     CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_GE_COV=ON"
@@ -150,7 +152,13 @@ build_graphengine()
   if [[ "X$ENABLE_GITEE" = "Xon" ]]; then
     CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_GITEE=ON"
   fi
-  CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_OPEN_SRC=True -DCMAKE_INSTALL_PREFIX=${OUTPUT_PATH} -DPLATFORM=${PLATFORM} -DPRODUCT=${PRODUCT}"
+
+  if [[ "X$MINDSPORE_MODE" = "Xoff" ]]; then
+    CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_OPEN_SRC=True -DCMAKE_INSTALL_PREFIX=${OUTPUT_PATH} -DPLATFORM=${PLATFORM} -DPRODUCT=${PRODUCT}"
+  else
+    CMAKE_ARGS="${CMAKE_ARGS} -DENABLE_D=ON -DCMAKE_INSTALL_PREFIX=${OUTPUT_PATH}"
+  fi
+
   echo "${CMAKE_ARGS}"
   cmake ${CMAKE_ARGS} ..
   if [ $? -ne 0 ]
@@ -162,13 +170,16 @@ build_graphengine()
   TARGET=${COMMON_TARGET}
   if [ "x${PLATFORM}" = "xtrain" ]
   then
-    TARGET="ge_runner ge_local_engine ge_local_opskernel_builder host_cpu_engine host_cpu_opskernel_builder ${TARGET}"
+    TARGET="ge_runner ge_local_engine ge_local_opskernel_builder host_cpu_engine host_cpu_opskernel_builder fwk_atc.bin ${TARGET}"
   elif [ "x${PLATFORM}" = "xinference" ]
   then
-    TARGET="ge_compiler atc_ge_local_engine atc_ge_local_opskernel_builder atc_host_cpu_engine atc_host_cpu_opskernel_builder atc opensrc_ascendcl ${TARGET}"
+    TARGET="ge_compiler atc_ge_local_engine atc_ge_local_opskernel_builder atc_host_cpu_engine atc_host_cpu_opskernel_builder atc_atc.bin opensrc_ascendcl ${TARGET}"
   elif [ "X$ENABLE_GE_UT" = "Xon" ]
   then
     TARGET="ut_libgraph ut_libge_multiparts_utest ut_libge_others_utest ut_libge_kernel_utest ut_libge_distinct_load_utest"
+  elif [ "X$MINDSPORE_MODE" = "Xon" ]
+  then
+    TARGET="ge_common graph"
   elif [ "x${PLATFORM}" = "xall" ]
   then
     # build all the target
@@ -224,12 +235,14 @@ if [[ "X$ENABLE_GE_UT" = "Xon" || "X$ENABLE_GE_COV" = "Xon" ]]; then
 #     fi
 
 #     if [[ "X$ENABLE_GE_COV" = "Xon" ]]; then
-#         echo "Generating coverage statistics, please wait..."
-#         cd ${BASEPATH}
-#         rm -rf ${BASEPATH}/cov
-#         mkdir ${BASEPATH}/cov
-#         gcovr -r ./ --exclude 'third_party' --exclude 'build' --exclude 'tests' --exclude 'prebuild' --exclude 'inc' --print-summary --html --html-details -d -o cov/index.html
-#     fi
+         echo "Generating coverage statistics, please wait..."
+         cd ${BASEPATH}
+         rm -rf ${BASEPATH}/cov
+         mkdir ${BASEPATH}/cov
+         lcov -c -d build/tests/ut/ge -d build/tests/ut/common/graph/ -o cov/tmp.info
+	 lcov --remove cov/tmp.info '*/output/*' '*/build/opensrc/*' '*/build/proto/*' '*/third_party/*' '*/tests/*' '/usr/local/*' -o cov/coverage.info
+	 cd ${BASEPATH}/cov
+	 genhtml coverage.info
 fi
 
 # generate output package in tar form, including ut/st libraries/executables
@@ -242,6 +255,7 @@ generate_package()
   FWK_PATH="fwkacllib/lib64"
   ATC_PATH="atc/lib64"
   ATC_BIN_PATH="atc/bin"
+  FWK_BIN_PATH="fwkacllib/bin"
   NNENGINE_PATH="plugin/nnengine/ge_config"
   OPSKERNEL_PATH="plugin/opskernel"
 
@@ -254,6 +268,7 @@ generate_package()
   rm -rf ${OUTPUT_PATH:?}/${ACL_PATH}/
   rm -rf ${OUTPUT_PATH:?}/${ATC_PATH}/
   rm -rf ${OUTPUT_PATH:?}/${ATC_BIN_PATH}/
+  rm -rf ${OUTPUT_PATH:?}/${FWK_BIN_PATH}/
 
   mk_dir "${OUTPUT_PATH}/${FWK_PATH}/${NNENGINE_PATH}"
   mk_dir "${OUTPUT_PATH}/${FWK_PATH}/${OPSKERNEL_PATH}"
@@ -261,6 +276,7 @@ generate_package()
   mk_dir "${OUTPUT_PATH}/${ATC_PATH}/${OPSKERNEL_PATH}"
   mk_dir "${OUTPUT_PATH}/${ACL_PATH}"
   mk_dir "${OUTPUT_PATH}/${ATC_BIN_PATH}"
+  mk_dir "${OUTPUT_PATH}/${FWK_BIN_PATH}"
  
   cd "${OUTPUT_PATH}"
 
@@ -299,7 +315,8 @@ generate_package()
     find ${OUTPUT_PATH}/${GRAPHENGINE_LIB_PATH} -maxdepth 1 -name "$lib" -exec cp -f {} ${OUTPUT_PATH}/${ATC_PATH} \;
   done
 
-  find ./bin -name atc -exec cp {} "${OUTPUT_PATH}/${ATC_BIN_PATH}" \;
+  find ./lib/atclib -name atc.bin -exec cp {} "${OUTPUT_PATH}/${ATC_BIN_PATH}" \;
+  find ./lib/fwkacl -name atc.bin -exec cp {} "${OUTPUT_PATH}/${FWK_BIN_PATH}" \;
   find ${OUTPUT_PATH}/${GRAPHENGINE_LIB_PATH} -maxdepth 1 -name "libascendcl.so" -exec cp -f {} ${OUTPUT_PATH}/${ACL_PATH} \;
   
   if [ "x${PLATFORM}" = "xtrain" ]
@@ -314,7 +331,12 @@ generate_package()
   fi
 }
 
-if [[ "X$ENABLE_GE_UT" = "Xoff" ]]; then
+if [[ "X$ENABLE_GE_UT" = "Xoff" && "X$MINDSPORE_MODE" = "Xoff" ]]; then
   generate_package
+elif [ "X$MINDSPORE_MODE" = "Xon" ]
+then
+  cd "${OUTPUT_PATH}"
+  find ./ -name graphengine_lib.tar -exec rm {} \;
+  tar -cf graphengine_lib.tar lib
 fi
-echo "---------------- GraphEngine package archive generated ----------------"
+echo "---------------- GraphEngine package archive generated ----------------"
\ No newline at end of file
diff --git a/classify_rule.txt b/classify_rule.txt
new file mode 100644
index 00000000..5c88f67e
--- /dev/null
+++ b/classify_rule.txt
@@ -0,0 +1,5 @@
+[graphengine]
+ge
+inc
+metadef
+parser
diff --git a/cmake/FindModule.cmake b/cmake/FindModule.cmake
index eab39b10..14737c71 100644
--- a/cmake/FindModule.cmake
+++ b/cmake/FindModule.cmake
@@ -21,7 +21,7 @@ function(find_module module name)
     if ("${${module}_LIBRARY_DIR}" STREQUAL "${module}_LIBRARY_DIR-NOTFOUND")
       message(FATAL_ERROR "${name} not found in ${path}")
     endif()
-    
+
     add_library(${module} SHARED IMPORTED)
     set_target_properties(${module} PROPERTIES
       IMPORTED_LOCATION ${${module}_LIBRARY_DIR}
diff --git a/cmake/external_libs/gflags.cmake b/cmake/external_libs/gflags.cmake
index f3f0f0ef..50cfb2bc 100755
--- a/cmake/external_libs/gflags.cmake
+++ b/cmake/external_libs/gflags.cmake
@@ -23,6 +23,7 @@ ExternalProject_Add(gflags_build
                     URL ${REQ_URL}
                     #URL /home/txd/workspace/linux_cmake/pkg/protobuf-3.8.0.tar.gz
                     #SOURCE_DIR ${GE_CODE_DIR}/../../third_party/gflags/src/gflags-2.2.2 
+                    TLS_VERIFY OFF
                     CONFIGURE_COMMAND ${CMAKE_COMMAND} -DCMAKE_CXX_FLAGS=${gflags_CXXFLAGS} -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}/gflags <SOURCE_DIR>
                     BUILD_COMMAND $(MAKE)
                     INSTALL_COMMAND $(MAKE) install
diff --git a/cmake/external_libs/gtest.cmake b/cmake/external_libs/gtest.cmake
index 96ea84b4..303ce464 100755
--- a/cmake/external_libs/gtest.cmake
+++ b/cmake/external_libs/gtest.cmake
@@ -10,7 +10,10 @@ if ((${CMAKE_INSTALL_PREFIX} STREQUAL /usr/local) OR
     message(STATUS "No install prefix selected, default to ${CMAKE_INSTALL_PREFIX}.")
 endif()
 
-if (ENABLE_GITEE)
+if (GE_PB_PKG)
+    set(REQ_URL "${GE_PB_PKG}/libs/ge_gtest/release-1.8.0.tar.gz")
+    set(MD5 "")
+elseif (ENABLE_GITEE)
     set(REQ_URL "https://gitee.com/mirrors/googletest/repository/archive/release-1.8.0.tar.gz")
     set(MD5 "")
 else()
@@ -22,8 +25,9 @@ set (gtest_CXXFLAGS "-D_GLIBCXX_USE_CXX11_ABI=0 -D_FORTIFY_SOURCE=2 -O2 -fstack-
 set (gtest_CFLAGS "-D_GLIBCXX_USE_CXX11_ABI=0 -D_FORTIFY_SOURCE=2 -O2 -fstack-protector-all -Wl,-z,relro,-z,now,-z,noexecstack")
 ExternalProject_Add(gtest_build
                     URL ${REQ_URL}
+                    TLS_VERIFY OFF
                     CONFIGURE_COMMAND ${CMAKE_COMMAND} -DCMAKE_CXX_FLAGS=${gtest_CXXFLAGS} -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}/gtest <SOURCE_DIR>
-		    -DBUILD_TESTING=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_MACOSX_RPATH=TRUE -Dgtest_disable_pthreads=ON
+                -DBUILD_TESTING=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_MACOSX_RPATH=TRUE -Dgtest_disable_pthreads=ON
                     BUILD_COMMAND $(MAKE)
                     INSTALL_COMMAND $(MAKE) install
                     EXCLUDE_FROM_ALL TRUE 
diff --git a/cmake/external_libs/json.cmake b/cmake/external_libs/json.cmake
index ce473d4b..6e476d0d 100755
--- a/cmake/external_libs/json.cmake
+++ b/cmake/external_libs/json.cmake
@@ -18,6 +18,7 @@ ExternalProject_Add(json_build
                     URL ${REQ_URL}
                     #URL /home/txd/workspace/cloud_code/pkg/include.zip
                     SOURCE_DIR  ${JSON_SRC_DIR}
+                    TLS_VERIFY OFF
                     CONFIGURE_COMMAND ""
                     BUILD_COMMAND ""
                     INSTALL_COMMAND ""
diff --git a/cmake/external_libs/onnx.cmake b/cmake/external_libs/onnx.cmake
index 9dadb544..1ee80d2d 100755
--- a/cmake/external_libs/onnx.cmake
+++ b/cmake/external_libs/onnx.cmake
@@ -6,7 +6,10 @@ set(ONNX_PROTO_DIR ${CMAKE_BINARY_DIR}/onnx)
 set(ONNX_PROTO_FILE ${ONNX_PROTO_DIR}/onnx.proto)
 file(MAKE_DIRECTORY ${ONNX_PROTO_DIR})
 
-if (ENABLE_GITEE)
+if (GE_PB_PKG)
+    set(REQ_URL "${GE_PB_PKG}/libs/onnx/onnx-1.6.0.tar.gz")
+    set(MD5 "512f2779d6215d4a36f366b6b9acdf1e")
+elseif (ENABLE_GITEE)
     set(REQ_URL "https://gitee.com/mirrors/ONNX/repository/archive/v1.6.0.tar.gz")
     set(MD5 "1bdbcecdd68ea8392630467646776e02")
 else()
@@ -19,6 +22,7 @@ ExternalProject_Add(onnx
                     #URL /home/txd/workspace/cloud_code/pkg/onnx-1.6.0.tar.gz
                     #URL_HASH SHA256=3b88c3fe521151651a0403c4d131cb2e0311bd28b753ef692020a432a81ce345
                     #SOURCE_DIR ${ONNX_SRC_DIR}
+                    TLS_VERIFY OFF
                     CONFIGURE_COMMAND ""
                     BUILD_COMMAND ""
                     #INSTALL_COMMAND "" 
diff --git a/cmake/external_libs/protobuf_shared.cmake b/cmake/external_libs/protobuf_shared.cmake
index c9c6b7d9..6334c8a3 100755
--- a/cmake/external_libs/protobuf_shared.cmake
+++ b/cmake/external_libs/protobuf_shared.cmake
@@ -26,6 +26,7 @@ set(protobuf_CXXFLAGS "-Wno-maybe-uninitialized -Wno-unused-parameter -fPIC -fst
 set(protobuf_LDFLAGS "-Wl,-z,relro,-z,now,-z,noexecstack")
 ExternalProject_Add(protobuf_build
                     URL ${REQ_URL}
+                    TLS_VERIFY OFF
                     CONFIGURE_COMMAND ${CMAKE_COMMAND}
                     -Dprotobuf_WITH_ZLIB=OFF
                     -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
diff --git a/cmake/external_libs/protobuf_static.cmake b/cmake/external_libs/protobuf_static.cmake
index 6f3e1f53..e4bbb9a0 100755
--- a/cmake/external_libs/protobuf_static.cmake
+++ b/cmake/external_libs/protobuf_static.cmake
@@ -27,6 +27,7 @@ ExternalProject_Add(protobuf_static_build
                     URL ${REQ_URL}
                     #URL /home/txd/workspace/linux_cmake/pkg/protobuf-3.8.0.tar.gz
                     #SOURCE_DIR ${METADEF_DIR}/../../third_party/protobuf/src/protobuf-3.8.0
+                    TLS_VERIFY OFF
                     CONFIGURE_COMMAND ${CMAKE_COMMAND}
                     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                     -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/cmake/external_libs/protoc.cmake b/cmake/external_libs/protoc.cmake
index 0d162c0d..58321f04 100755
--- a/cmake/external_libs/protoc.cmake
+++ b/cmake/external_libs/protoc.cmake
@@ -1,115 +1,116 @@
-if (HAVE_PROTOC)
-    return()
-endif()
-
-include(ExternalProject)
-include(GNUInstallDirs)
-#set(CMAKE_INSTALL_PREFIX ${GE_CODE_DIR}/output)
-
-if ((${CMAKE_INSTALL_PREFIX} STREQUAL /usr/local) OR
-    (${CMAKE_INSTALL_PREFIX} STREQUAL "C:/Program Files (x86)/ascend"))
-    set(CMAKE_INSTALL_PREFIX ${GE_CODE_DIR}/output CACHE STRING "path for install()" FORCE)
-    message(STATUS "No install prefix selected, default to ${CMAKE_INSTALL_PREFIX}.")
-endif()
-
-if(GE_PB_PKG)
-    set(REQ_URL "${GE_PB_PKG}/libs/protobuf/v3.8.0.tar.gz")
-else()
-    if (ENABLE_GITEE)
-        set(REQ_URL "https://gitee.com/mirrors/protobuf_source/repository/archive/v3.8.0.tar.gz")
-        set(MD5 "eba86ae9f07ba5cfbaf8af3bc4e84236")
-    else()
-        set(REQ_URL "https://github.com/protocolbuffers/protobuf/archive/v3.8.0.tar.gz")
-        set(MD5 "3d9e32700639618a4d2d342c99d4507a")
-    endif ()
-endif()
-
-set(protobuf_CXXFLAGS "-Wno-maybe-uninitialized -Wno-unused-parameter -fPIC -fstack-protector-all -D_FORTIFY_SOURCE=2 -D_GLIBCXX_USE_CXX11_ABI=0 -O2")
-set(protobuf_LDFLAGS "-Wl,-z,relro,-z,now,-z,noexecstack")
-ExternalProject_Add(protoc_build
-                    URL ${REQ_URL}
-                    #URL /home/txd/workspace/linux_cmake/pkg/protobuf-3.8.0.tar.gz
-                    #SOURCE_DIR ${GE_CODE_DIR}/../third_party/protobuf/src/protobuf-3.8.0
-                    CONFIGURE_COMMAND ${CMAKE_COMMAND} -Dprotobuf_WITH_ZLIB=OFF -Dprotobuf_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_CXX_FLAGS=${protobuf_CXXFLAGS} -DCMAKE_CXX_LDFLAGS=${protobuf_LDFLAGS} -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}/protoc <SOURCE_DIR>/cmake
-                    BUILD_COMMAND $(MAKE)
-                    INSTALL_COMMAND $(MAKE) install
-                    EXCLUDE_FROM_ALL TRUE
-)
-
-set(PROTOC_PKG_DIR ${CMAKE_INSTALL_PREFIX}/protoc)
-
-set(protoc_EXECUTABLE ${PROTOC_PKG_DIR}/${CMAKE_INSTALL_BINDIR}/protoc)
-
-function(protobuf_generate comp c_var h_var)
-    if(NOT ARGN)
-        message(SEND_ERROR "Error: protobuf_generate() called without any proto files")
-        return()
-    endif()
-    set(${c_var})
-    set(${h_var})
-
-    foreach(file ${ARGN})
-        get_filename_component(abs_file ${file} ABSOLUTE)
-        get_filename_component(file_name ${file} NAME_WE)
-        get_filename_component(file_dir ${abs_file} PATH)
-        get_filename_component(parent_subdir ${file_dir} NAME)
-
-        if("${parent_subdir}" STREQUAL "proto")
-            set(proto_output_path ${CMAKE_BINARY_DIR}/proto/${comp}/proto)
-        else()
-            set(proto_output_path ${CMAKE_BINARY_DIR}/proto/${comp}/proto/${parent_subdir})
-        endif()
-        list(APPEND ${c_var} "${proto_output_path}/${file_name}.pb.cc")
-        list(APPEND ${h_var} "${proto_output_path}/${file_name}.pb.h")
-
-        add_custom_command(
-                OUTPUT "${proto_output_path}/${file_name}.pb.cc" "${proto_output_path}/${file_name}.pb.h"
-                WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
-                COMMAND ${CMAKE_COMMAND} -E make_directory "${proto_output_path}"
-                COMMAND ${protoc_EXECUTABLE} -I${file_dir} --cpp_out=${proto_output_path} ${abs_file}
-                DEPENDS protoc_build ${abs_file}
-                COMMENT "Running C++ protocol buffer compiler on ${file}" VERBATIM )
-    endforeach()
-
-    set_source_files_properties(${${c_var}} ${${h_var}} PROPERTIES GENERATED TRUE)
-    set(${c_var} ${${c_var}} PARENT_SCOPE)
-    set(${h_var} ${${h_var}} PARENT_SCOPE)
-
-endfunction()
-
-function(protobuf_generate_py comp py_var)
-    if(NOT ARGN)
-        message(SEND_ERROR "Error: protobuf_generate_py() called without any proto files")
-        return()
-    endif()
-    set(${py_var})
-
-    foreach(file ${ARGN})
-        get_filename_component(abs_file ${file} ABSOLUTE)
-        get_filename_component(file_name ${file} NAME_WE)
-        get_filename_component(file_dir ${abs_file} PATH)
-        get_filename_component(parent_subdir ${file_dir} NAME)
-
-        if("${parent_subdir}" STREQUAL "proto")
-            set(proto_output_path ${CMAKE_BINARY_DIR}/proto/${comp}/proto)
-        else()
-            set(proto_output_path ${CMAKE_BINARY_DIR}/proto/${comp}/proto/${parent_subdir})
-        endif()
-        list(APPEND ${py_var} "${proto_output_path}/${file_name}_pb2.py")
-
-        add_custom_command(
-                OUTPUT "${proto_output_path}/${file_name}_pb2.py"
-                WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
-                COMMAND ${CMAKE_COMMAND} -E make_directory "${proto_output_path}"
-                COMMAND ${protoc_EXECUTABLE} -I${file_dir} --python_out=${proto_output_path} ${abs_file}
-                DEPENDS protoc_build ${abs_file}
-                COMMENT "Running PYTHON protocol buffer compiler on ${file}" VERBATIM )
-    endforeach()
-
-    set_source_files_properties(${${py_var}} PROPERTIES GENERATED TRUE)
-    set(${py_var} ${${py_var}} PARENT_SCOPE)
-
-endfunction()
-
-#set(HAVE_PROTOC TRUE CACHE BOOL "protoc build add")
-set(HAVE_PROTOC TRUE)
+if (HAVE_PROTOC)
+    return()
+endif()
+
+include(ExternalProject)
+include(GNUInstallDirs)
+#set(CMAKE_INSTALL_PREFIX ${GE_CODE_DIR}/output)
+
+if ((${CMAKE_INSTALL_PREFIX} STREQUAL /usr/local) OR
+    (${CMAKE_INSTALL_PREFIX} STREQUAL "C:/Program Files (x86)/ascend"))
+    set(CMAKE_INSTALL_PREFIX ${GE_CODE_DIR}/output CACHE STRING "path for install()" FORCE)
+    message(STATUS "No install prefix selected, default to ${CMAKE_INSTALL_PREFIX}.")
+endif()
+
+if(GE_PB_PKG)
+    set(REQ_URL "${GE_PB_PKG}/libs/protobuf/v3.8.0.tar.gz")
+else()
+    if (ENABLE_GITEE)
+        set(REQ_URL "https://gitee.com/mirrors/protobuf_source/repository/archive/v3.8.0.tar.gz")
+        set(MD5 "eba86ae9f07ba5cfbaf8af3bc4e84236")
+    else()
+        set(REQ_URL "https://github.com/protocolbuffers/protobuf/archive/v3.8.0.tar.gz")
+        set(MD5 "3d9e32700639618a4d2d342c99d4507a")
+    endif ()
+endif()
+
+set(protobuf_CXXFLAGS "-Wno-maybe-uninitialized -Wno-unused-parameter -fPIC -fstack-protector-all -D_FORTIFY_SOURCE=2 -D_GLIBCXX_USE_CXX11_ABI=0 -O2")
+set(protobuf_LDFLAGS "-Wl,-z,relro,-z,now,-z,noexecstack")
+ExternalProject_Add(protoc_build
+                    URL ${REQ_URL}
+                    #URL /home/txd/workspace/linux_cmake/pkg/protobuf-3.8.0.tar.gz
+                    #SOURCE_DIR ${GE_CODE_DIR}/../third_party/protobuf/src/protobuf-3.8.0
+                    TLS_VERIFY OFF
+                    CONFIGURE_COMMAND ${CMAKE_COMMAND} -Dprotobuf_WITH_ZLIB=OFF -Dprotobuf_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_CXX_FLAGS=${protobuf_CXXFLAGS} -DCMAKE_CXX_LDFLAGS=${protobuf_LDFLAGS} -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}/protoc <SOURCE_DIR>/cmake
+                    BUILD_COMMAND $(MAKE)
+                    INSTALL_COMMAND $(MAKE) install
+                    EXCLUDE_FROM_ALL TRUE
+)
+
+set(PROTOC_PKG_DIR ${CMAKE_INSTALL_PREFIX}/protoc)
+
+set(protoc_EXECUTABLE ${PROTOC_PKG_DIR}/${CMAKE_INSTALL_BINDIR}/protoc)
+
+function(protobuf_generate comp c_var h_var)
+    if(NOT ARGN)
+        message(SEND_ERROR "Error: protobuf_generate() called without any proto files")
+        return()
+    endif()
+    set(${c_var})
+    set(${h_var})
+
+    foreach(file ${ARGN})
+        get_filename_component(abs_file ${file} ABSOLUTE)
+        get_filename_component(file_name ${file} NAME_WE)
+        get_filename_component(file_dir ${abs_file} PATH)
+        get_filename_component(parent_subdir ${file_dir} NAME)
+
+        if("${parent_subdir}" STREQUAL "proto")
+            set(proto_output_path ${CMAKE_BINARY_DIR}/proto/${comp}/proto)
+        else()
+            set(proto_output_path ${CMAKE_BINARY_DIR}/proto/${comp}/proto/${parent_subdir})
+        endif()
+        list(APPEND ${c_var} "${proto_output_path}/${file_name}.pb.cc")
+        list(APPEND ${h_var} "${proto_output_path}/${file_name}.pb.h")
+
+        add_custom_command(
+                OUTPUT "${proto_output_path}/${file_name}.pb.cc" "${proto_output_path}/${file_name}.pb.h"
+                WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+                COMMAND ${CMAKE_COMMAND} -E make_directory "${proto_output_path}"
+                COMMAND ${protoc_EXECUTABLE} -I${file_dir} --cpp_out=${proto_output_path} ${abs_file}
+                DEPENDS protoc_build ${abs_file}
+                COMMENT "Running C++ protocol buffer compiler on ${file}" VERBATIM )
+    endforeach()
+
+    set_source_files_properties(${${c_var}} ${${h_var}} PROPERTIES GENERATED TRUE)
+    set(${c_var} ${${c_var}} PARENT_SCOPE)
+    set(${h_var} ${${h_var}} PARENT_SCOPE)
+
+endfunction()
+
+function(protobuf_generate_py comp py_var)
+    if(NOT ARGN)
+        message(SEND_ERROR "Error: protobuf_generate_py() called without any proto files")
+        return()
+    endif()
+    set(${py_var})
+
+    foreach(file ${ARGN})
+        get_filename_component(abs_file ${file} ABSOLUTE)
+        get_filename_component(file_name ${file} NAME_WE)
+        get_filename_component(file_dir ${abs_file} PATH)
+        get_filename_component(parent_subdir ${file_dir} NAME)
+
+        if("${parent_subdir}" STREQUAL "proto")
+            set(proto_output_path ${CMAKE_BINARY_DIR}/proto/${comp}/proto)
+        else()
+            set(proto_output_path ${CMAKE_BINARY_DIR}/proto/${comp}/proto/${parent_subdir})
+        endif()
+        list(APPEND ${py_var} "${proto_output_path}/${file_name}_pb2.py")
+
+        add_custom_command(
+                OUTPUT "${proto_output_path}/${file_name}_pb2.py"
+                WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+                COMMAND ${CMAKE_COMMAND} -E make_directory "${proto_output_path}"
+                COMMAND ${protoc_EXECUTABLE} -I${file_dir} --python_out=${proto_output_path} ${abs_file}
+                DEPENDS protoc_build ${abs_file}
+                COMMENT "Running PYTHON protocol buffer compiler on ${file}" VERBATIM )
+    endforeach()
+
+    set_source_files_properties(${${py_var}} PROPERTIES GENERATED TRUE)
+    set(${py_var} ${${py_var}} PARENT_SCOPE)
+
+endfunction()
+
+#set(HAVE_PROTOC TRUE CACHE BOOL "protoc build add")
+set(HAVE_PROTOC TRUE)
diff --git a/cmake/external_libs/securec.cmake b/cmake/external_libs/securec.cmake
index 0bd62ab2..0f8b6d3a 100755
--- a/cmake/external_libs/securec.cmake
+++ b/cmake/external_libs/securec.cmake
@@ -10,11 +10,20 @@ if ((${CMAKE_INSTALL_PREFIX} STREQUAL /usr/local) OR
     message(STATUS "No install prefix selected, default to ${CMAKE_INSTALL_PREFIX}.")
 endif()
 
+if (GE_PB_PKG)
+    set(REQ_URL "${GE_PB_PKG}/libs/securec/v1.1.10.tar.gz")
+    set(MD5 "")
+else()
+    set(REQ_URL "https://gitee.com/openeuler/libboundscheck/repository/archive/v1.1.10.tar.gz")
+    set(MD5 "")
+endif ()
+
 ExternalProject_Add(c_sec_build
-                    URL https://gitee.com/openeuler/libboundscheck/repository/archive/v1.1.10.tar.gz
-                    #URL /home/txd/workspace/linux_cmake/pkg/protobuf-3.8.0.tar.gz
+                    URL ${REQ_URL}
+                    #URL https://gitee.com/openeuler/libboundscheck/repository/archive/v1.1.10.tar.gz
                     #SOURCE_DIR ${GE_CODE_DIR}/../libc_sec
                     PATCH_COMMAND patch -p1 < ${GE_CODE_DIR}/metadef/third_party/patch/securec/0001-add-securec-cmake-script.patch
+                    TLS_VERIFY OFF
                     CONFIGURE_COMMAND ${CMAKE_COMMAND}
                     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                     -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/cmake/intf_pub_linux.cmake b/cmake/intf_pub_linux.cmake
index 40c6bca9..61237d11 100755
--- a/cmake/intf_pub_linux.cmake
+++ b/cmake/intf_pub_linux.cmake
@@ -16,6 +16,7 @@ target_compile_definitions(intf_pub INTERFACE
     $<$<CONFIG:Debug>:CFG_BUILD_DEBUG>   
     WIN64=1
     LINUX=0
+    LOG_CPP
 )
 target_link_options(intf_pub INTERFACE
     -Wl,-z,relro
diff --git a/ge/CMakeLists.txt b/ge/CMakeLists.txt
index 88a5c52f..a8eabf05 100755
--- a/ge/CMakeLists.txt
+++ b/ge/CMakeLists.txt
@@ -1,7 +1,6 @@
 if (NOT ENABLE_D AND NOT ENABLE_ACL AND NOT ENABLE_MS_TESTCASES)
     add_subdirectory(common)
     add_subdirectory(plugin/engine)
-    add_subdirectory(graph/build/memory)
     add_subdirectory(ge_local_engine)
     add_subdirectory(host_cpu_engine)
     add_subdirectory(executor)
@@ -33,6 +32,51 @@ protobuf_generate(ge PROTO_SRCS PROTO_HDRS ${PROTO_LIST})
 protobuf_generate(ge PROTO_CLIENT_SRCS PROTO_CLIENT_HDRS ${PROTO_CLIENT_LIST})
 protobuf_generate(ge PROTO_HEADER_SRCS PROTO_HEADER_HDRS ${PROTO_HEADER_LIST})
 
+if (NOT ENABLE_D AND NOT ENABLE_ACL AND NOT ENABLE_MS_TESTCASES)
+############ libge_proto_common.a ############
+add_library(ge_proto_common STATIC
+    ${PROTO_HEADER_HDRS}
+    ${PROTO_SRCS}
+)
+
+target_compile_definitions(ge_proto_common PRIVATE
+    PROTOBUF_INLINE_NOT_IN_HEADERS=0
+    google=ascend_private
+)
+
+target_compile_options(ge_proto_common PRIVATE
+    -O2
+    -fno-common
+)
+
+target_link_libraries(ge_proto_common PRIVATE
+    $<BUILD_INTERFACE:intf_pub>
+    ascend_protobuf
+)
+
+############ libge_proto_client.a ############
+add_library(ge_proto_client STATIC
+    ${PROTO_HEADER_HDRS}
+    ${PROTO_CLIENT_SRCS}
+)
+
+target_compile_definitions(ge_proto_client PRIVATE
+    PROTOBUF_INLINE_NOT_IN_HEADERS=0
+    google=ascend_private
+)
+
+target_compile_options(ge_proto_client PRIVATE
+    -O2
+    -fno-common
+)
+
+target_link_libraries(ge_proto_client PRIVATE
+    $<BUILD_INTERFACE:intf_pub>
+    ascend_protobuf
+)
+endif ()
+
+##################################################################
 set(TRAIN_SRC_LIST
     "common/formats/format_transfers/datatype_transfer.cc"
     "common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc"
@@ -60,6 +104,8 @@ set(TRAIN_SRC_LIST
     "common/dump/dump_manager.cc"
     "common/dump/dump_properties.cc"
     "common/dump/dump_op.cc"
+    "common/profiling/ge_profiling.cc"
+    "common/profiling/ge_runner_profiling.cc"
     "engine_manager/dnnengine_manager.cc"
     "ge_local_engine/engine/host_cpu_engine.cc"
     "generator/ge_generator.cc"
@@ -123,6 +169,7 @@ set(TRAIN_SRC_LIST
     "graph/manager/graph_var_manager.cc"
     "graph/manager/host_mem_manager.cc"
     "graph/manager/rdma_pool_allocator.cc"
+    "graph/manager/host_mem_allocator.cc"
     "graph/manager/memory_api.cc"
     "graph/manager/model_manager/event_manager.cc"
     "graph/manager/trans_var_data_utils.cc"
@@ -142,6 +189,7 @@ set(TRAIN_SRC_LIST
     "graph/passes/atomic_addr_clean_pass.cc"
     "graph/passes/mark_same_addr_pass.cc"
     "graph/passes/mark_graph_unknown_status_pass.cc"
+    "graph/passes/dynamic_single_op_reset_shape_pass.cc"
     "graph/passes/mark_agnostic_pass.cc"
     "graph/partition/dynamic_shape_partition.cc"
     "graph/partition/stage_partition.cc"
@@ -154,13 +202,17 @@ set(TRAIN_SRC_LIST
     "graph/passes/compile_nodes_pass.cc"
     "graph/passes/constant_folding_pass.cc"
     "graph/passes/constant_fuse_same_pass.cc"
+    "graph/passes/fuse_data_nodes_with_common_input_pass.cc"
+    "graph/passes/remove_same_const_pass.cc"
+    "graph/passes/useless_control_out_remove_pass.cc"
     "graph/passes/control_trigger_pass.cc"
     "graph/passes/dimension_adjust_pass.cc"
     "graph/passes/dimension_compute_pass.cc"
     "graph/passes/dropout_pass.cc"
     "graph/passes/hccl_group_pass.cc"
     "graph/passes/enter_pass.cc"
-    "graph/passes/assign_pass.cc"
+    "graph/passes/assign_remove_pass.cc"
+    "graph/passes/inplace_support_check_pass.cc"
     "graph/passes/flow_ctrl_pass.cc"
     "graph/passes/global_step_insert_pass.cc"
     "host_kernels/transpose_kernel.cc"
@@ -201,6 +253,7 @@ set(TRAIN_SRC_LIST
     "host_kernels/sub_kernel.cc"
     "host_kernels/transdata_kernel.cc"
     "host_kernels/unpack_kernel.cc"
+    "host_kernels/reformat_kernel.cc"
     "graph/passes/folding_pass.cc"
     "graph/passes/get_original_format_pass.cc"
     "graph/passes/guarantee_const_pass.cc"
@@ -331,10 +384,16 @@ set(TRAIN_SRC_LIST
     "hybrid/hybrid_davinci_model.cc"
     "executor/ge_executor.cc"
     "client/ge_api.cc"
-    "client/ge_prof.cc"
     "analyzer/analyzer.cc"
     "ir_build/ge_ir_build.cc"
     "ir_build/atc_ir_common.cc"
+    "graph/build/memory/memory_assigner.cc"
+    "graph/build/memory/graph_mem_assigner.cc"
+    "graph/build/memory/binary_block_mem_assigner.cc"
+    "graph/build/memory/block_mem_assigner.cc"
+    "graph/build/memory/hybrid_mem_assigner.cc"
+    "graph/build/memory/max_block_mem_assigner.cc"
+    "graph/build/memory/var_mem_assign_util.cc"
 )
 
 set(INFER_SRC_LIST
@@ -396,6 +455,7 @@ set(INFER_SRC_LIST
     "graph/manager/graph_var_manager.cc"
     "graph/manager/host_mem_manager.cc"
     "graph/manager/rdma_pool_allocator.cc"
+    "graph/manager/host_mem_allocator.cc"
     "graph/manager/graph_mem_allocator.cc"
     "graph/manager/graph_caching_allocator.cc"
     "model/ge_model.cc"
@@ -425,6 +485,7 @@ set(INFER_SRC_LIST
     "graph/passes/net_output_pass.cc"
     "graph/passes/replace_transshape_pass.cc"
     "graph/passes/constant_fuse_same_pass.cc"
+    "graph/passes/fuse_data_nodes_with_common_input_pass.cc"
     "graph/passes/print_op_pass.cc"
     "graph/passes/no_use_reshape_remove_pass.cc"
     "graph/passes/iterator_op_pass.cc"
@@ -432,6 +493,7 @@ set(INFER_SRC_LIST
     "graph/passes/atomic_addr_clean_pass.cc"
     "graph/passes/mark_same_addr_pass.cc"
     "graph/passes/mark_graph_unknown_status_pass.cc"
+    "graph/passes/dynamic_single_op_reset_shape_pass.cc"
     "graph/passes/mark_agnostic_pass.cc"
     "graph/common/omg_util.cc"
     "graph/common/bcast.cc"
@@ -487,6 +549,7 @@ set(INFER_SRC_LIST
     "host_kernels/slice_d_kernel.cc"
     "host_kernels/dynamic_stitch_kernel.cc"
     "host_kernels/identity_kernel.cc"
+    "host_kernels/reformat_kernel.cc"
     "graph/passes/stop_gradient_pass.cc"
     "graph/passes/prevent_gradient_pass.cc"
     "graph/passes/identity_pass.cc"
@@ -514,9 +577,12 @@ set(INFER_SRC_LIST
     "graph/passes/cond_remove_pass.cc"
     "graph/passes/for_pass.cc"
     "graph/passes/enter_pass.cc"
-    "graph/passes/assign_pass.cc"
+    "graph/passes/assign_remove_pass.cc"
+    "graph/passes/inplace_support_check_pass.cc"
     "graph/passes/addn_pass.cc"
     "graph/passes/common_subexpression_elimination_pass.cc"
+    "graph/passes/remove_same_const_pass.cc"
+    "graph/passes/useless_control_out_remove_pass.cc"
     "graph/passes/transop_symmetry_elimination_pass.cc"
     "graph/passes/save_pass.cc"
     "graph/passes/switch_dead_branch_elimination.cc"
@@ -598,11 +664,34 @@ set(INFER_SRC_LIST
     "graph/label/while_label_maker.cc"
     "graph/label/partitioned_call_label_maker.cc"
     "analyzer/analyzer.cc"
+    "graph/build/memory/memory_assigner.cc"
+    "graph/build/memory/graph_mem_assigner.cc"
+    "graph/build/memory/binary_block_mem_assigner.cc"
+    "graph/build/memory/block_mem_assigner.cc"
+    "graph/build/memory/hybrid_mem_assigner.cc"
+    "graph/build/memory/max_block_mem_assigner.cc"
+    "graph/build/memory/var_mem_assign_util.cc"
 )
 
 if (NOT ENABLE_D AND NOT ENABLE_ACL AND NOT ENABLE_MS_TESTCASES)
+message("CMAKE_CXX_COMPILER_VERSION = ${CMAKE_CXX_COMPILER_VERSION}")
 ############ libge_runner.so ############
-add_library(ge_runner SHARED ${TRAIN_SRC_LIST} ${PROTO_SRCS} ${PROTO_CLIENT_SRCS})
+add_library(ge_runner SHARED
+    ${TRAIN_SRC_LIST}
+    $<TARGET_OBJECTS:$<IF:$<TARGET_EXISTS:msprofiler_fwk>,msprofiler_fwk,msprofiler_fwk_object>>
+)
+
+add_library(msprofiler_fwk_object OBJECT IMPORTED GLOBAL)
+
+if (msprofiler_fwk_ext_LIBRARY_DIR)
+    file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/msprofiler_fwk_object)
+    execute_process(
+        COMMAND ar x ${msprofiler_fwk_ext_LIBRARY_DIR}
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/msprofiler_fwk_object
+    )
+    file(GLOB MSPROFILER_FWK_OBJECT_LIST ${CMAKE_CURRENT_BINARY_DIR}/msprofiler_fwk_object/*.o)
+    set_property(TARGET msprofiler_fwk_object PROPERTY IMPORTED_OBJECTS ${MSPROFILER_FWK_OBJECT_LIST})
+endif()
 
 target_compile_definitions(ge_runner PRIVATE
     PROTOBUF_INLINE_NOT_IN_HEADERS=0
@@ -615,9 +704,12 @@ target_compile_definitions(ge_runner PRIVATE
 
 target_compile_options(ge_runner PRIVATE
     -O2
+    -fno-common
+    $<$<STREQUAL:${CMAKE_CXX_COMPILER_VERSION},7.3.0>:-Werror=unused-variable>
+    $<$<STREQUAL:${CMAKE_CXX_COMPILER_VERSION},7.3.0>:-Werror=unused-const-variable>
 )
 
-target_include_directories(ge_runner PRIVATE
+target_include_directories(ge_runner SYSTEM PRIVATE
     ${GE_CODE_DIR}/ge
     ${GE_CODE_DIR}/ge/analyzer
     ${GE_CODE_DIR}/inc
@@ -643,12 +735,12 @@ target_include_directories(ge_runner PRIVATE
     ${GE_CODE_DIR}/third_party/fwkacllib/inc/toolchain
 )
 
-target_link_libraries(ge_runner
+target_link_libraries(ge_runner PRIVATE
     $<BUILD_INTERFACE:intf_pub>
-    ge_memory
     adump_server
-    msprofiler
     static_mmpa
+    ge_proto_common
+    ge_proto_client
     -Wl,--no-as-needed
     graph
     ge_common
@@ -656,9 +748,7 @@ target_link_libraries(ge_runner
     register
     c_sec
     slog
-    msprof
     runtime
-    resource
     error_manager
     ascend_hal_stub
     -Wl,--as-needed
@@ -668,7 +758,9 @@ target_link_libraries(ge_runner
 )
 
 ############ libge_compiler.so ############
-add_library(ge_compiler SHARED ${INFER_SRC_LIST} ${PROTO_SRCS})
+add_library(ge_compiler SHARED
+    ${INFER_SRC_LIST}
+)
 
 target_compile_definitions(ge_compiler PRIVATE
     PROTOBUF_INLINE_NOT_IN_HEADERS=0
@@ -681,9 +773,12 @@ target_compile_definitions(ge_compiler PRIVATE
 
 target_compile_options(ge_compiler PRIVATE
     -O2
+    -fno-common
+    $<$<STREQUAL:${CMAKE_CXX_COMPILER_VERSION},7.3.0>:-Werror=unused-variable>
+    $<$<STREQUAL:${CMAKE_CXX_COMPILER_VERSION},7.3.0>:-Werror=unused-const-variable>
 )
 
-target_include_directories(ge_compiler PRIVATE
+target_include_directories(ge_compiler SYSTEM PRIVATE
     ${GE_CODE_DIR}/ge
     ${GE_CODE_DIR}/ge/analyzer
     ${GE_CODE_DIR}/inc
@@ -709,10 +804,10 @@ target_include_directories(ge_compiler PRIVATE
     ${GE_CODE_DIR}/third_party/fwkacllib/inc/toolchain
 )
 
-target_link_libraries(ge_compiler
+target_link_libraries(ge_compiler PRIVATE
     $<BUILD_INTERFACE:intf_pub>
-    ge_memory
     static_mmpa
+    ge_proto_common
     -Wl,--no-as-needed
     graph
     ge_common
@@ -722,7 +817,6 @@ target_link_libraries(ge_compiler
     error_manager
     slog
     runtime_compile
-    resource
     -Wl,--as-needed
     json
     -lrt
@@ -739,7 +833,7 @@ file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ascendcl_object)
 if(EXISTS ${STATIC_ACL_LIB}/libascendcl.a)
     execute_process(
         COMMAND ar x ${STATIC_ACL_LIB}/libascendcl.a
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ascendcl_object    
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ascendcl_object
     )
     file(GLOB OBJECT_LIST ${CMAKE_CURRENT_BINARY_DIR}/ascendcl_object/*.o)
 else()
@@ -748,8 +842,21 @@ endif()
 
 add_library(opensrc_ascendcl SHARED
     ${OBJECT_LIST}
+    $<TARGET_OBJECTS:$<IF:$<TARGET_EXISTS:msprofiler>,msprofiler,msprofiler_object>>
 )
 
+add_library(msprofiler_object OBJECT IMPORTED GLOBAL)
+
+if (msprofiler_ext_LIBRARY_DIR)
+    file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/msprofiler_object)
+    execute_process(
+        COMMAND ar x ${msprofiler_ext_LIBRARY_DIR}
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/msprofiler_object
+    )
+    file(GLOB MSPROFILER_OBJECT_LIST ${CMAKE_CURRENT_BINARY_DIR}/msprofiler_object/*.o)
+    set_property(TARGET msprofiler_object PROPERTY IMPORTED_OBJECTS ${MSPROFILER_OBJECT_LIST})
+endif()
+
 target_compile_definitions(opensrc_ascendcl PRIVATE
     google=ascend_private
 )
@@ -775,13 +882,11 @@ target_link_libraries(opensrc_ascendcl PRIVATE
                      register_static
                      error_manager_static
                      adump_server
-                     msprofiler
                      -Wl,--no-whole-archive
                      -Wl,--no-as-needed
                      c_sec
                      runtime
                      slog
-                     msprof
                      ascend_hal_stub
                      -Wl,--as-needed
                      -lrt
@@ -797,12 +902,10 @@ set_target_properties(opensrc_ascendcl PROPERTIES
 add_custom_command(
     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/stub_ge_ir_build.cc
            ${CMAKE_CURRENT_BINARY_DIR}/stub_ge_api.cc
-           ${CMAKE_CURRENT_BINARY_DIR}/stub_ge_prof.cc
     COMMAND echo "Generating stub files."
             && ${HI_PYTHON} ${CMAKE_CURRENT_LIST_DIR}/stub/gen_stubapi.py ${GE_CODE_DIR}/inc/external ${CMAKE_CURRENT_BINARY_DIR}
             && mv ge_ir_build.cc stub_ge_ir_build.cc
             && mv ge_api.cc stub_ge_api.cc
-            && mv ge_prof.cc stub_ge_prof.cc
             &&  echo "Generating stub files end."
     #WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
     #DEPENDS stub/gen_stubapi.py ${TOP_DIR}/inc/external ${CMAKE_CURRENT_BINARY_DIR}
@@ -811,7 +914,6 @@ add_custom_command(
 add_custom_target(ge_stub
     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/stub_ge_ir_build.cc
             ${CMAKE_CURRENT_BINARY_DIR}/stub_ge_api.cc
-            ${CMAKE_CURRENT_BINARY_DIR}/stub_ge_prof.cc
 )
 
 ##################################################################
@@ -853,7 +955,6 @@ target_include_directories(atc_stub_ge_compiler PRIVATE
 ############ stub/libge_runner.so ############
 add_library(fwk_stub_ge_runner SHARED
     stub_ge_api.cc
-    stub_ge_prof.cc
     stub_ge_ir_build.cc
 )
 
diff --git a/ge/analyzer/analyzer.cc b/ge/analyzer/analyzer.cc
index 29181384..c63a6008 100755
--- a/ge/analyzer/analyzer.cc
+++ b/ge/analyzer/analyzer.cc
@@ -217,10 +217,15 @@ ge::Status Analyzer::SaveAnalyzerDataToFile(uint64_t session_id, uint64_t graph_
 
   json jsn;
   GraphInfoToJson(jsn, *graph_info);
-  json_file_ << jsn.dump(kJsonDumpLevel) << std::endl;
+  bool ret_failed = false;
+  try {
+    json_file_ << jsn.dump(kJsonDumpLevel) << std::endl;
+  } catch (nlohmann::detail::type_error &e) {
+    GELOGE(FAILED, "analyzer file [%s] failed because [%s]", json_file_name_.c_str(), e.what());
+    ret_failed = true;
+  }
   json_file_.close();
-
-  return SUCCESS;
+  return ret_failed ? FAILED : SUCCESS;
 }
 
 ge::Status Analyzer::DoAnalyze(DataInfo &data_info) {
diff --git a/ge/client/ge_api.cc b/ge/client/ge_api.cc
index 9ecc3016..d65d7667 100644
--- a/ge/client/ge_api.cc
+++ b/ge/client/ge_api.cc
@@ -32,6 +32,7 @@
 #include "graph/common/ge_call_wrapper.h"
 #include "register/op_registry.h"
 #include "common/ge/tbe_plugin_manager.h"
+#include "toolchain/plog.h"
 
 using domi::OpRegistry;
 using std::map;
@@ -129,12 +130,15 @@ Status GEInitializeImpl(const std::map<string, string> &options) {
 
 // Initialize GE, prepare for execution, call GELib::Initialize
 Status GEInitialize(const std::map<string, string> &options) {
+  if (DlogReportInitialize() != SUCCESS) {
+    GELOGW("Dlog report device log initialize failed.");
+  }
   return GEInitializeImpl(options);
 }
 
 Status GEInitialize(const std::map<AscendString, AscendString> &options) {
   std::map<std::string, std::string> str_options;
-  for (auto & option : options) {
+  for (auto &option : options) {
     if (option.first.GetString() == nullptr || option.second.GetString() == nullptr) {
       GELOGE(FAILED, "GEInitialize options is nullptr.");
       return FAILED;
@@ -143,6 +147,9 @@ Status GEInitialize(const std::map<AscendString, AscendString> &options) {
     std::string val = option.second.GetString();
     str_options[key] = val;
   }
+  if (DlogReportInitialize() != SUCCESS) {
+    GELOGW("Dlog report device log initialize failed.");
+  }
   return GEInitializeImpl(str_options);
 }
 
@@ -187,6 +194,10 @@ Status GEFinalize() {
   // to avoid memory fragment, use malloc_trim to back free stack to system
   malloc_trim(0);
 
+  if (DlogReportFinalize() != SUCCESS) {
+    GELOGW("Dlog report device log finalize failed.");
+  }
+
   GELOGT(TRACE_STOP, "GEFinalize finished");
   return ret;
 }
diff --git a/ge/client/ge_prof.cc b/ge/client/ge_prof.cc
deleted file mode 100644
index ede38430..00000000
--- a/ge/client/ge_prof.cc
+++ /dev/null
@@ -1,369 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "ge/ge_prof.h"
-#include "ge/ge_api.h"
-#include "init/gelib.h"
-#include "common/debug/log.h"
-#include "framework/common/debug/ge_log.h"
-#include "common/profiling/profiling_manager.h"
-#include "graph/load/graph_loader.h"
-#include "toolchain/prof_acl_api.h"
-
-using std::map;
-using std::string;
-using std::vector;
-
-namespace {
-const uint32_t kMaxDeviceNum = 64;
-const uint32_t kDeviceListIndex = 3;
-const std::string kProfilingInit = "prof_init";
-const std::string kProfilingFinalize = "prof_finalize";
-const std::string kProfilingStart = "prof_start";
-const std::string kProfilingStop = "prof_stop";
-const std::string kDeviceNums = "devNums";
-const std::string kDeviceIdList = "devIdList";
-const std::string kAicoreMetrics = "aicoreMetrics";
-
-const std::map<ge::ProfilingAicoreMetrics, std::string> kProfAicoreMetricsToString = {
-    {ge::kAicoreArithmaticThroughput, "AICORE_ARITHMATIC_THROUGHPUT"},
-    {ge::kAicorePipeline, "AICORE_PIPELINE"},
-    {ge::kAicoreSynchronization, "AICORE_SYNCHRONIZATION"},
-    {ge::kAicoreMemory, "AICORE_MEMORY"},
-    {ge::kAicoreInternalMemory, "AICORE_INTERNAL_MEMORY"},
-    {ge::kAicoreStall, "AICORE_STALL"}};
-}  // namespace
-
-static bool g_graph_prof_init_ = false;
-static std::mutex g_prof_mutex_;
-
-namespace ge {
-struct aclgrphProfConfig {
-  ProfConfig config;
-};
-
-Status aclgrphProfInit(const char *profiler_path, uint32_t length) {
-  GELOGT(TRACE_INIT, "Graph prof init start");
-
-  std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
-  if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
-    GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized.");
-    return FAILED;
-  }
-
-  std::lock_guard<std::mutex> lock(g_prof_mutex_);
-  if (g_graph_prof_init_) {
-    GELOGW("Multi graph profiling initializations.");
-    return GE_PROF_MULTI_INIT;
-  }
-
-  Status ret = CheckPath(profiler_path, length);
-  if (ret != SUCCESS) {
-    GELOGE(ret, "Profiling config path is invalid.");
-    return ret;
-  }
-  // if command mode is set, just return
-  if (ProfilingManager::Instance().ProfilingOn()) {
-    GELOGW("Graph prof init failed, cause profiling command pattern is running.");
-    return GE_PROF_MODE_CONFLICT;
-  }
-
-  ret = ProfInit(profiler_path);
-  if (ret != SUCCESS) {
-    GELOGE(ret, "ProfInit init fail");
-    return ret;
-  }
-
-  GraphLoader graph_loader;
-  Command command;
-  command.cmd_params.clear();
-  command.cmd_type = kProfilingInit;
-  command.module_index = PROF_MODEL_LOAD;
-  ret = graph_loader.CommandHandle(command);
-  if (ret != SUCCESS) {
-    GELOGE(ret, "Handle profiling command %s failed, config = %s", kProfilingInit.c_str(), profiler_path);
-    return ret;
-  }
-  if (!g_graph_prof_init_) {
-    g_graph_prof_init_ = true;
-    GELOGI("Profiling init successfully.");
-  }
-
-  GELOGI("Successfully execute GraphProfInit.");
-  return SUCCESS;
-}
-
-Status aclgrphProfFinalize() {
-  std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
-  if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
-    GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized.");
-    return FAILED;
-  }
-  std::lock_guard<std::mutex> lock(g_prof_mutex_);
-  // if command mode is set, just return
-  if (ProfilingManager::Instance().ProfilingOn()) {
-    GELOGW("Graph prof finalize failed, cause profiling command pattern is running.");
-    return GE_PROF_MODE_CONFLICT;
-  }
-
-  if (!g_graph_prof_init_) {
-    GELOGE(GE_PROF_NOT_INIT, "Graph not profiling initialize.");
-    return GE_PROF_NOT_INIT;
-  }
-  GraphLoader graph_loader;
-  Command command;
-  command.cmd_params.clear();
-  command.cmd_type = kProfilingFinalize;
-  Status ret = graph_loader.CommandHandle(command);
-  if (ret != SUCCESS) {
-    GELOGE(ret, "Handle profiling command %s failed.", kProfilingFinalize.c_str());
-    return ret;
-  }
-
-  ret = ProfFinalize();
-  if (ret != SUCCESS) {
-    GELOGE(ret, "Finalize profiling failed, result = %d", ret);
-  }
-
-  if (ret == SUCCESS) {
-    g_graph_prof_init_ = false;
-    GELOGI("Successfully execute GraphProfFinalize.");
-  }
-  return ret;
-}
-
-bool TransProfConfigToParam(const aclgrphProfConfig *profiler_config, vector<string> &prof_config_params) {
-  prof_config_params.clear();
-  prof_config_params.emplace_back(kDeviceNums);
-  prof_config_params.emplace_back(std::to_string(profiler_config->config.devNums));
-  prof_config_params.emplace_back(kDeviceIdList);
-  std::string devID = "";
-  if (profiler_config->config.devNums == 0) {
-    GELOGW("The device num is invalid.");
-    return false;
-  }
-  for (uint32_t i = 0; i < profiler_config->config.devNums; i++) {
-    devID.append(std::to_string(profiler_config->config.devIdList[i]));
-    if (i != profiler_config->config.devNums - 1) {
-      devID.append(",");
-    }
-  }
-
-  prof_config_params.push_back(devID);
-  prof_config_params.push_back(kAicoreMetrics);
-  auto iter =
-      kProfAicoreMetricsToString.find(static_cast<ProfilingAicoreMetrics>(profiler_config->config.aicoreMetrics));
-  if (iter == kProfAicoreMetricsToString.end()) {
-    GELOGW("The prof aicore metrics is invalid.");
-    return false;
-  }
-  prof_config_params.push_back(iter->second);
-  return true;
-}
-
-bool isProfConfigValid(const uint32_t *deviceid_list, uint32_t device_nums) {
-  if (deviceid_list == nullptr) {
-    GELOGE(PARAM_INVALID, "deviceIdList is nullptr");
-    return false;
-  }
-  if (device_nums == 0 || device_nums > kMaxDeviceNum) {
-    GELOGE(PARAM_INVALID, "The device nums is invalid.");
-    return false;
-  }
-
-  // real device num
-  int32_t dev_count = 0;
-  rtError_t rt_err = rtGetDeviceCount(&dev_count);
-  if (rt_err != RT_ERROR_NONE) {
-    GELOGE(INTERNAL_ERROR, "Get the Device count fail.");
-    return false;
-  }
-
-  if (device_nums > static_cast<uint32_t>(dev_count)) {
-    GELOGE(PARAM_INVALID, "Device num(%u) is not in range 1 ~ %d.", device_nums, dev_count);
-    return false;
-  }
-
-  std::unordered_set<uint32_t> record;
-  for (size_t i = 0; i < device_nums; ++i) {
-    uint32_t dev_id = deviceid_list[i];
-    if (dev_id >= static_cast<uint32_t>(dev_count)) {
-      GELOGE(PARAM_INVALID, "Device id %u is not in range 0 ~ %d(exclude %d)", dev_id, dev_count, dev_count);
-      return false;
-    }
-    if (record.count(dev_id) > 0) {
-      GELOGE(PARAM_INVALID, "Device id %u is duplicatedly set", dev_id);
-      return false;
-    }
-    record.insert(dev_id);
-  }
-  return true;
-}
-
-aclgrphProfConfig *aclgrphProfCreateConfig(uint32_t *deviceid_list, uint32_t device_nums,
-                                           ProfilingAicoreMetrics aicore_metrics, ProfAicoreEvents *aicore_events,
-                                           uint64_t data_type_config) {
-  if (!isProfConfigValid(deviceid_list, device_nums)) {
-    return nullptr;
-  }
-  aclgrphProfConfig *config = new (std::nothrow) aclgrphProfConfig();
-  if (config == nullptr) {
-    GELOGE(INTERNAL_ERROR, "new aclgrphProfConfig fail");
-    return nullptr;
-  }
-  config->config.devNums = device_nums;
-  if (memcpy_s(config->config.devIdList, sizeof(config->config.devIdList), deviceid_list,
-               device_nums * sizeof(uint32_t)) != EOK) {
-    GELOGE(INTERNAL_ERROR, "copy devID failed. size = %u", device_nums);
-    delete config;
-    return nullptr;
-  }
-
-  config->config.aicoreMetrics = static_cast<ProfAicoreMetrics>(aicore_metrics);
-  config->config.dataTypeConfig = data_type_config;
-  GELOGI("Successfully create prof config.");
-  return config;
-}
-
-Status aclgrphProfDestroyConfig(aclgrphProfConfig *profiler_config) {
-  if (profiler_config == nullptr) {
-    GELOGE(PARAM_INVALID, "destroy profilerConfig failed, profilerConfig must not be nullptr");
-    return PARAM_INVALID;
-  }
-
-  delete profiler_config;
-  GELOGI("Successfully destroy prof config.");
-  return SUCCESS;
-}
-
-Status aclgrphProfStart(aclgrphProfConfig *profiler_config) {
-  if (profiler_config == nullptr) {
-    GELOGE(PARAM_INVALID, "aclgrphProfConfig is invalid.");
-    return FAILED;
-  }
-  std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
-  if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
-    GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized.");
-    return FAILED;
-  }
-
-  std::lock_guard<std::mutex> lock(g_prof_mutex_);
-  // if command mode is set, just return
-  if (ProfilingManager::Instance().ProfilingOn()) {
-    GELOGW("Graph prof finalize failed, cause profiling command pattern is running.");
-    return GE_PROF_MODE_CONFLICT;
-  }
-  if (!g_graph_prof_init_) {
-    GELOGE(GE_PROF_NOT_INIT, "Graph not profiling initialize.");
-    return GE_PROF_NOT_INIT;
-  }
-
-  Status ret = ProfStartProfiling(&profiler_config->config);
-  if (ret != SUCCESS) {
-    GELOGE(ret, "Start profiling failed, prof result = %d", ret);
-    return FAILED;
-  }
-
-  std::vector<string> prof_params;
-  if (!TransProfConfigToParam(profiler_config, prof_params)) {
-    GELOGE(PARAM_INVALID, "Transfer profilerConfig to string vector failed");
-    return PARAM_INVALID;
-  }
-
-  GraphLoader graph_loader;
-  Command command;
-  command.cmd_params.clear();
-  command.cmd_type = kProfilingStart;
-  command.cmd_params = prof_params;
-  command.module_index = profiler_config->config.dataTypeConfig;
-  GELOGI("Profiling will start, device nums:%s , deviceID:[%s], data type config: 0x%llx", prof_params[0].c_str(),
-         prof_params[kDeviceListIndex].c_str(), command.module_index);
-  ret = graph_loader.CommandHandle(command);
-  if (ret != SUCCESS) {
-    GELOGE(ret, "Handle profiling command failed");
-    return FAILED;
-  }
-
-  GELOGI("Successfully execute GraphProfStartProfiling.");
-
-  return SUCCESS;
-}
-
-Status aclgrphProfStop(aclgrphProfConfig *profiler_config) {
-  if (profiler_config == nullptr) {
-    GELOGE(PARAM_INVALID, "aclgrphProfConfig is invalid.");
-    return FAILED;
-  }
-  std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
-  if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
-    GELOGE(GE_CLI_GE_NOT_INITIALIZED, "Ge client is not initialized.");
-    return FAILED;
-  }
-
-  std::lock_guard<std::mutex> lock(g_prof_mutex_);
-  // if command mode is set, just return
-  if (ProfilingManager::Instance().ProfilingOn()) {
-    GELOGW("Graph prof finalize failed, cause profiling command pattern is running.");
-    return GE_PROF_MODE_CONFLICT;
-  }
-  if (!g_graph_prof_init_) {
-    GELOGE(GE_PROF_NOT_INIT, "Graph not profiling initialize.");
-    return GE_PROF_NOT_INIT;
-  }
-
-  for (uint32_t i = 0; i < profiler_config->config.devNums; i++) {
-    uint64_t data_type_config;
-    Status status = ProfGetDataTypeConfig(profiler_config->config.devIdList[i], data_type_config);
-    if (status != SUCCESS) {
-      GELOGE(status, "Prof get data type config failed, prof result = %d", status);
-      return status;
-    }
-    if (data_type_config != profiler_config->config.dataTypeConfig) {
-      GELOGE(FAILED, "data type config verify failed");
-      return FAILED;
-    }
-  }
-
-  std::vector<string> prof_params;
-  if (!TransProfConfigToParam(profiler_config, prof_params)) {
-    GELOGE(PARAM_INVALID, "Transfer profilerConfig to string vector failed");
-    return PARAM_INVALID;
-  }
-
-  GraphLoader graph_loader;
-  Command command;
-  command.cmd_params.clear();
-  command.cmd_type = kProfilingStop;
-  command.cmd_params = prof_params;
-  command.module_index = profiler_config->config.dataTypeConfig;
-  GELOGI("Profiling will stop, device nums:%s , deviceID:[%s], data type config: 0x%llx", prof_params[0].c_str(),
-         prof_params[kDeviceListIndex].c_str(), command.module_index);
-  Status ret = graph_loader.CommandHandle(command);
-  if (ret != SUCCESS) {
-    GELOGE(ret, "Handle profiling command failed");
-    return FAILED;
-  }
-
-  ret = ProfStopProfiling(&profiler_config->config);
-  if (ret != SUCCESS) {
-    GELOGE(ret, "Stop profiling failed, prof result = %d", ret);
-    return ret;
-  }
-
-  GELOGI("Successfully execute GraphProfStopProfiling.");
-  return SUCCESS;
-}
-}  // namespace ge
diff --git a/ge/client/module.mk b/ge/client/module.mk
index 6ac69d31..e9d35418 100644
--- a/ge/client/module.mk
+++ b/ge/client/module.mk
@@ -4,7 +4,6 @@ LOCAL_PATH := $(call my-dir)
 COMMON_LOCAL_SRC_FILES := \
     proto/ge_api.proto \
     ge_api.cc \
-    ge_prof.cc \
 
 
 COMMON_LOCAL_C_INCLUDES := \
@@ -69,9 +68,9 @@ LOCAL_SHARED_LIBRARIES := \
     libgraph \
     libregister \
     libge_compiler \
-    libge_common \
-    libmsprof
+    libge_common
 
+LOCAL_STATIC_LIBRARIES += libmsprofiler_fwk \
 
 
 LOCAL_LDFLAGS := -lrt -ldl
@@ -104,8 +103,10 @@ LOCAL_SHARED_LIBRARIES := \
     libregister \
     libruntime \
     libge_compiler \
-    libge_common \
-    libmsprof
+    libge_common
+
+
+LOCAL_STATIC_LIBRARIES += libmsprofiler_fwk \
 
 
 LOCAL_LDFLAGS := -lrt -ldl
diff --git a/ge/client/proto/ge_ir.proto b/ge/client/proto/ge_ir.proto
index e7bfe0cb..12989a54 100644
--- a/ge/client/proto/ge_ir.proto
+++ b/ge/client/proto/ge_ir.proto
@@ -30,6 +30,7 @@ enum DataType
     DT_RESOURCE  = 23;         // resource type
     DT_STRING_REF = 24;        // string_ref type
     DT_DUAL      = 25;              /**< dual output type */
+    DT_VARIANT = 26;           // variant type
 }
 
 message AttrDef
diff --git a/ge/common/CMakeLists.txt b/ge/common/CMakeLists.txt
index aa546c0d..0172628c 100755
--- a/ge/common/CMakeLists.txt
+++ b/ge/common/CMakeLists.txt
@@ -24,6 +24,7 @@ set(SRC_LIST
     "helper/om_file_helper.cc"
     "helper/model_helper.cc"
     "../model/ge_model.cc"
+    "../model/ge_root_model.cc"
     "auth/file_saver.cc"
     "fp16_t.cc"
     "math/fp16_math.cc"
@@ -79,6 +80,7 @@ target_compile_options(ge_common PRIVATE
     -O2
     -Werror
     -Wno-deprecated-declarations
+    -fno-common
 )
 
 target_include_directories(ge_common PRIVATE
@@ -129,10 +131,11 @@ target_compile_definitions(ge_common_static PRIVATE
     google=ascend_private
     $<IF:$<STREQUAL:${TARGET_SYSTEM_NAME},Windows>,OS_TYPE=WIN,OS_TYPE=0>
     $<$<STREQUAL:${TARGET_SYSTEM_NAME},Windows>:SECUREC_USING_STD_SECURE_LIB=0 NOMINMAX>
+    LOG_CPP
 )
 
 target_compile_options(ge_common_static PRIVATE
-    $<$<OR:$<STREQUAL:${TARGET_SYSTEM_NAME},Linux>,$<STREQUAL:${TARGET_SYSTEM_NAME},Android>>:-fvisibility=hidden -O2 -Werror -Wno-deprecated-declarations>
+    $<$<OR:$<STREQUAL:${TARGET_SYSTEM_NAME},Linux>,$<STREQUAL:${TARGET_SYSTEM_NAME},Android>>:-fvisibility=hidden -O2 -Werror -Wno-deprecated-declarations -fno-common>
     $<$<AND:$<STREQUAL:${TARGET_SYSTEM_NAME},Windows>,$<STREQUAL:${CMAKE_CONFIGURATION_TYPES},Debug>>:/MTd>
     $<$<AND:$<STREQUAL:${TARGET_SYSTEM_NAME},Windows>,$<STREQUAL:${CMAKE_CONFIGURATION_TYPES},Release>>:/MT>
 )
@@ -177,12 +180,15 @@ target_compile_definitions(ge_common PRIVATE
     FMK_SUPPORT_DUMP
     OS_CENTOS
     google=ascend_private
+    LOG_CPP
 )
 
 target_compile_options(ge_common PRIVATE
     -fvisibility=hidden
     -O2
     -Werror
+    -Wno-deprecated-declarations
+    -fno-common
 )
 
 target_include_directories(ge_common PRIVATE
diff --git a/ge/common/auth/file_saver.cc b/ge/common/auth/file_saver.cc
index 7b41397a..e708653a 100755
--- a/ge/common/auth/file_saver.cc
+++ b/ge/common/auth/file_saver.cc
@@ -54,8 +54,8 @@ Status FileSaver::OpenFile(int32_t &fd, const std::string &file_path) {
 Status FileSaver::WriteData(const void *data, uint32_t size, int32_t fd) {
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(size == 0 || data == nullptr, return PARAM_INVALID);
   mmSsize_t write_count;
-  uint32_t size_2g = ((uint32_t) 0x1 << 31);
-  uint32_t size_1g = ((uint32_t) 0x1 << 30);
+  uint32_t size_2g = 2147483648;  // 0x1 << 31
+  uint32_t size_1g = 1073741824;  // 0x1 << 30
   // Write data
   if (size > size_2g) {
     auto seek = reinterpret_cast<uint8_t *>(const_cast<void *>(data));
@@ -258,6 +258,65 @@ FileSaver::SaveToFile(const string &file_path, ModelFileHeader &file_header, Mod
   return SUCCESS;
 }
 
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status
+FileSaver::SaveToFile(const string &file_path, ModelFileHeader &file_header,
+                      vector<ModelPartitionTable *> &model_partition_tables,
+                      const vector<vector<ModelPartition>> &all_partition_datas) {
+  file_header.is_encrypt = ModelEncryptType::UNENCRYPTED;
+
+  const Status ret = SaveWithFileHeader(file_path, file_header, model_partition_tables, all_partition_datas);
+  GE_CHK_BOOL_RET_STATUS(ret == SUCCESS, FAILED, "save file failed, file_path:%s, file header len:%u.",
+                         file_path.c_str(), file_header.length);
+  return SUCCESS;
+}
+
+Status FileSaver::SaveWithFileHeader(const std::string &file_path, const ModelFileHeader &file_header,
+                                     vector<ModelPartitionTable *> &model_partition_tables,
+                                     const vector<vector<ModelPartition>> &all_partition_datas) {
+
+  GE_CHK_BOOL_EXEC(model_partition_tables.size() == all_partition_datas.size(),
+                   return PARAM_INVALID,
+                   "model table size %zu does not match partition size %zu",
+                   model_partition_tables.size(), all_partition_datas.size())
+  for (size_t index = 0; index < model_partition_tables.size(); ++index) {
+    auto &cur_partiton_data = all_partition_datas[index];
+    auto &cur_model_partition_table = *model_partition_tables[index];
+    GE_CHK_BOOL_RET_STATUS(!cur_partiton_data.empty() && cur_model_partition_table.num != 0
+                           && cur_model_partition_table.num == cur_partiton_data.size(), FAILED,
+                           "Invalid param:partition data size is (%u), model_partition_table.num is (%zu).",
+                           cur_model_partition_table.num, cur_partiton_data.size());
+  }
+
+  // Open file
+  int32_t fd = 0;
+  GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(OpenFile(fd, file_path) != SUCCESS, return FAILED);
+  Status ret = SUCCESS;
+  do {
+    // Write file header
+    GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(
+        WriteData(static_cast<const void *>(&file_header), sizeof(ModelFileHeader), fd) != SUCCESS, ret = FAILED;
+        break);
+    for (size_t index = 0; index < model_partition_tables.size(); ++index) {
+      // Write model partition table
+      auto &cur_tabel = *model_partition_tables[index];
+      uint32_t table_size = static_cast<uint32_t>(SIZE_OF_MODEL_PARTITION_TABLE(cur_tabel));
+      GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(
+          WriteData(static_cast<const void *>(&cur_tabel), table_size, fd) != SUCCESS, ret = FAILED; break);
+      // Write partition data
+      auto &cur_partition_datas = all_partition_datas[index];
+      for (const auto &partition_data : cur_partition_datas) {
+        GELOGI("GC:size[%zu]", partition_data.size);
+        GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(
+            WriteData(static_cast<const void *>(partition_data.data), partition_data.size, fd) != SUCCESS, ret = FAILED;
+            break);
+      }
+    }
+  } while (0);
+  // Close file
+  GE_CHK_BOOL_RET_STATUS(mmClose(fd) == EN_OK, FAILED, "Close file failed.");
+  return ret;
+}
+
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status FileSaver::SaveToFile(const string &file_path, const void *data,
                                                                               int len) {
   if (data == nullptr || len <= 0) {
diff --git a/ge/common/auth/file_saver.h b/ge/common/auth/file_saver.h
index 79e2126e..97fbaae5 100644
--- a/ge/common/auth/file_saver.h
+++ b/ge/common/auth/file_saver.h
@@ -74,6 +74,10 @@ class FileSaver {
                            ModelPartitionTable &model_partition_table,
                            const std::vector<ModelPartition> &partition_datas);
 
+  static Status SaveToFile(const string &file_path, ModelFileHeader &file_header,
+                        vector<ModelPartitionTable *> &model_partition_tables,
+                        const vector<vector<ModelPartition>> &all_partition_datas);
+
   static Status SaveToBuffWithFileHeader(const ModelFileHeader &file_header,
                                             ModelPartitionTable &model_partition_table,
                                             const std::vector<ModelPartition> &partitionDatas,
@@ -108,6 +112,9 @@ class FileSaver {
   static Status SaveWithFileHeader(const std::string &file_path, const ModelFileHeader &file_header,
                                    ModelPartitionTable &model_partition_table,
                                    const std::vector<ModelPartition> &partition_datas);
+  static Status SaveWithFileHeader(const std::string &file_path, const ModelFileHeader &file_header,
+                                       vector<ModelPartitionTable *> &model_partition_tables,
+                                       const vector<vector<ModelPartition>> &all_partition_datas);
 };
 }  // namespace ge
 #endif  // GE_COMMON_AUTH_FILE_SAVER_H_
diff --git a/ge/common/base64.h b/ge/common/base64.h
index fb6c1870..a537e585 100644
--- a/ge/common/base64.h
+++ b/ge/common/base64.h
@@ -25,32 +25,38 @@
 
 namespace ge {
 namespace {
-const char* kBase64Chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-                           "abcdefghijklmnopqrstuvwxyz"
-                           "0123456789+/";
+const char *kBase64Chars =
+  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+  "abcdefghijklmnopqrstuvwxyz"
+  "0123456789+/";
 const char kEqualSymbol = '=';
 const size_t kBase64CharsNum = 64;
 const size_t kThreeByteOneGroup = 3;
 const size_t kFourByteOneGroup = 4;
-}
+const size_t kThreeByteOneGroupIndex0 = 0;
+const size_t kThreeByteOneGroupIndex1 = 1;
+const size_t kThreeByteOneGroupIndex2 = 2;
+const size_t kFourByteOneGroupIndex0 = 0;
+const size_t kFourByteOneGroupIndex1 = 1;
+const size_t kFourByteOneGroupIndex2 = 2;
+const size_t kFourByteOneGroupIndex3 = 3;
+}  // namespace
 
 namespace base64 {
-static inline bool IsBase64Char(const char &c) {
-  return (isalnum(c) || (c == '+') || (c == '/'));
-}
+static inline bool IsBase64Char(const char &c) { return (isalnum(c) || (c == '+') || (c == '/')); }
 
 static std::string EncodeToBase64(const std::string &raw_data) {
   size_t encode_length = raw_data.size() / kThreeByteOneGroup * kFourByteOneGroup;
   encode_length += raw_data.size() % kThreeByteOneGroup == 0 ? 0 : kFourByteOneGroup;
-  size_t raw_data_index = 0 ;
+  size_t raw_data_index = 0;
   size_t encode_data_index = 0;
   std::string encode_data;
   encode_data.resize(encode_length);
 
   for (; raw_data_index + kThreeByteOneGroup <= raw_data.size(); raw_data_index += kThreeByteOneGroup) {
     auto char_1 = static_cast<uint8_t>(raw_data[raw_data_index]);
-    auto char_2 = static_cast<uint8_t>(raw_data[raw_data_index + 1]);
-    auto char_3 = static_cast<uint8_t>(raw_data[raw_data_index + 2]);
+    auto char_2 = static_cast<uint8_t>(raw_data[raw_data_index + kThreeByteOneGroupIndex1]);
+    auto char_3 = static_cast<uint8_t>(raw_data[raw_data_index + kThreeByteOneGroupIndex2]);
     encode_data[encode_data_index++] = kBase64Chars[char_1 >> 2u];
     encode_data[encode_data_index++] = kBase64Chars[((char_1 << 4u) & 0x30) | (char_2 >> 4u)];
     encode_data[encode_data_index++] = kBase64Chars[((char_2 << 2u) & 0x3c) | (char_3 >> 6u)];
@@ -80,8 +86,7 @@ static std::string EncodeToBase64(const std::string &raw_data) {
 #pragma GCC diagnostic ignored "-Wunused-function"
 static Status DecodeFromBase64(const std::string &base64_data, std::string &decode_data) {
   if (base64_data.size() % kFourByteOneGroup != 0) {
-    GELOGE(PARAM_INVALID, "base64 data size must can be divided by 4, but given data size is %zu",
-           base64_data.size());
+    GELOGE(PARAM_INVALID, "base64 data size must can be divided by 4, but given data size is %zu", base64_data.size());
     return PARAM_INVALID;
   }
   decode_data.clear();
@@ -92,10 +97,10 @@ static Status DecodeFromBase64(const std::string &base64_data, std::string &deco
     return static_cast<uint8_t>(std::distance(kBase64Chars, char_pos)) & 0xff;
   };
 
-  for (std::size_t input_data_index = 0; input_data_index < base64_data_len; input_data_index += 4) {
+  for (std::size_t input_data_index = 0; input_data_index < base64_data_len; input_data_index += kFourByteOneGroup) {
     for (size_t i = 0; i < kFourByteOneGroup; ++i) {
       if (base64_data[input_data_index + i] == kEqualSymbol &&
-          input_data_index >= base64_data_len - 4 && i > 1) {
+          input_data_index >= base64_data_len - kFourByteOneGroup && i > 1) {
         byte_4[i] = kBase64CharsNum;
       } else if (IsBase64Char(base64_data[input_data_index + i])) {
         byte_4[i] = FindCharInBase64Chars(base64_data[input_data_index + i]);
@@ -104,19 +109,23 @@ static Status DecodeFromBase64(const std::string &base64_data, std::string &deco
         return PARAM_INVALID;
       }
     }
-    decode_data += static_cast<char>((byte_4[0] << 2u) + ((byte_4[1] & 0x30) >> 4u));
-    if (byte_4[2] >= kBase64CharsNum){
+    decode_data +=
+      static_cast<char>((byte_4[kFourByteOneGroupIndex0] << 2u) + ((byte_4[kFourByteOneGroupIndex1] & 0x30) >> 4u));
+    if (byte_4[kFourByteOneGroupIndex2] >= kBase64CharsNum) {
       break;
-    } else if (byte_4[3] >= kBase64CharsNum) {
-      decode_data += static_cast<char>(((byte_4[1] & 0x0f) << 4u)  + ((byte_4[2] & 0x3c) >> 2u));
+    } else if (byte_4[kFourByteOneGroupIndex3] >= kBase64CharsNum) {
+      decode_data += static_cast<char>(((byte_4[kFourByteOneGroupIndex1] & 0x0f) << 4u) +
+                                       ((byte_4[kFourByteOneGroupIndex2] & 0x3c) >> 2u));
       break;
     }
-    decode_data += static_cast<char>(((byte_4[1] & 0x0f) << 4u)  + ((byte_4[2] & 0x3c) >> 2u));
-    decode_data += static_cast<char>(((byte_4[2] & 0x03) << 6u)  + byte_4[3]);
+    decode_data += static_cast<char>(((byte_4[kFourByteOneGroupIndex1] & 0x0f) << 4u) +
+                                     ((byte_4[kFourByteOneGroupIndex2] & 0x3c) >> 2u));
+    decode_data +=
+      static_cast<char>(((byte_4[kFourByteOneGroupIndex2] & 0x03) << 6u) + byte_4[kFourByteOneGroupIndex3]);
   }
   return SUCCESS;
 }
 #pragma GCC diagnostic pop
-}
+}  // namespace base64
 }  // namespace ge
 #endif  // GE_COMMON_BASE64_H_
\ No newline at end of file
diff --git a/ge/common/debug/memory_dumper.cc b/ge/common/debug/memory_dumper.cc
index 872fe1da..527f0bb2 100644
--- a/ge/common/debug/memory_dumper.cc
+++ b/ge/common/debug/memory_dumper.cc
@@ -139,7 +139,8 @@ int MemoryDumper::OpenFile(const char *filename) {
   GE_IF_BOOL_EXEC(
     -1 != path_split_pos, string prefix_path = std::string(filename).substr(0, path_split_pos);
     string last_path = std::string(filename).substr(path_split_pos, strlen(filename) - 1);
-    GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(prefix_path.length() >= MMPA_MAX_PATH, return kInvalidFd, "Prefix path is too long!");
+    GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(prefix_path.length() >= MMPA_MAX_PATH,
+        return kInvalidFd, "Prefix path is too long!");
     GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(mmRealPath(prefix_path.c_str(), tmp_path, MMPA_MAX_PATH) != EN_OK, return kInvalidFd,
                                    "Dir %s does not exit.", prefix_path.c_str());
     real_path = std::string(tmp_path) + last_path;)
diff --git a/ge/common/dump/dump_op.cc b/ge/common/dump/dump_op.cc
index e92ada05..0b9e9dcc 100755
--- a/ge/common/dump/dump_op.cc
+++ b/ge/common/dump/dump_op.cc
@@ -94,6 +94,9 @@ Status DumpOp::DumpOutput(aicpu::dump::Task &task) {
     for (auto dim : output_descs.at(i).GetShape().GetDims()) {
       output.mutable_shape()->add_dim(dim);
     }
+    for (auto dim : output_descs.at(i).GetOriginShape().GetDims()) {
+      output.mutable_origin_shape()->add_dim(dim);
+    }
     int64_t output_size = 0;
     if (TensorUtils::GetTensorSizeInBytes(output_descs.at(i), output_size) != SUCCESS) {
       GELOGE(PARAM_INVALID, "Get output size filed");
@@ -118,6 +121,9 @@ Status DumpOp::DumpInput(aicpu::dump::Task &task) {
     for (auto dim : input_descs.at(i).GetShape().GetDims()) {
       input.mutable_shape()->add_dim(dim);
     }
+    for (auto dim : input_descs.at(i).GetOriginShape().GetDims()) {
+      input.mutable_origin_shape()->add_dim(dim);
+    }
     int64_t input_size = 0;
     if (TensorUtils::GetTensorSizeInBytes(input_descs.at(i), input_size) != SUCCESS) {
       GELOGE(PARAM_INVALID, "Get output size filed");
@@ -214,8 +220,15 @@ Status DumpOp::LaunchDumpOp() {
   SetOpMappingLoopAddr(global_step_, loop_per_iter_, loop_cond_, op_mapping_info);
   GELOGI("Dump step is %s ,dump path is %s ,in Launch dump op", dump_properties_.GetDumpStep().c_str(),
          dump_path.c_str());
-
+  uint32_t task_id = 0;
+  uint32_t stream_id = 0;
+  rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id);
+  if (rt_ret != RT_ERROR_NONE) {
+    GELOGW("call rtGetTaskIdAndStreamID failed, ret = 0x%X", rt_ret);
+  }
   aicpu::dump::Task task;
+  task.set_task_id(task_id);
+  task.set_stream_id(stream_id);
   task.mutable_op()->set_op_name(op_desc_->GetName());
   task.mutable_op()->set_op_type(op_desc_->GetType());
   if (dump_properties_.GetDumpMode() == kDumpOutput) {
diff --git a/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc b/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc
index ed1c6941..cb528453 100755
--- a/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc
+++ b/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc
@@ -23,12 +23,30 @@
 #include "common/formats/utils/formats_trans_utils.h"
 #include "framework/common/debug/ge_log.h"
 #include "framework/common/debug/log.h"
+#include "framework/common/types.h"
 #include "graph/utils/type_utils.h"
 
 namespace ge {
 namespace formats {
 namespace {
 const int kDimSize4D = 4;
+
+const size_t kSingleDim = 1;
+
+const size_t kNdDimIndexN = 0;
+const size_t kNdDimIndexH = 1;
+const size_t kNdDimIndexW = 2;
+
+const size_t kDimDValueBNdFNz = 2;  // dim d-value between Nd and FractalZz
+
+const size_t kNdDimCountBackwardsW = 1;
+const size_t kNdDimCountBackwardsWH = 2;
+
+const size_t kFNzDimCountBackwardsW0 = 1;
+const size_t kFNzDimCountBackwardsW0H0 = 2;
+const size_t kFNzDimCountBackwardsW0H0H1 = 3;
+const size_t kFNzDimCountBackwardsW0H0H1W1 = 4;
+
 bool IsDataTypeSupport(DataType data_type) { return GetSizeByDataType(data_type) > 0; }
 
 using ShapeVector = std::vector<int64_t>;
@@ -60,14 +78,14 @@ Status TransShapeToFracNz(const ShapeVector &src_shape, DataType data_type, Shap
   auto w0 = GetCubeSizeByDataType(data_type);
   int64_t h0 = kCubeSize;
   switch (src_shape.size()) {
-    case 1:
-      dst_shape.push_back(Ceil(src_shape[0], w0));
-      dst_shape.push_back(1);
+    case kSingleDim:
+      dst_shape.push_back(Ceil(src_shape[kNdDimIndexN], w0));
+      dst_shape.push_back(DIM_DEFAULT_VALUE);
       dst_shape.push_back(h0);
       dst_shape.push_back(w0);
-      hw_shape.push_back(1);
-      hw_shape.push_back(1);
-      hw_shape.push_back(src_shape[0]);
+      hw_shape.push_back(DIM_DEFAULT_VALUE);
+      hw_shape.push_back(DIM_DEFAULT_VALUE);
+      hw_shape.push_back(src_shape[kNdDimIndexN]);
       if (!IsShapeValid(dst_shape)) {
         GELOGE(PARAM_INVALID, "Failed to check dst shape %s", ShapeToString(dst_shape).c_str());
         return PARAM_INVALID;
@@ -76,17 +94,17 @@ Status TransShapeToFracNz(const ShapeVector &src_shape, DataType data_type, Shap
     default:
       auto size = src_shape.size();
       int64_t times = 1;
-      for (size_t i = 0; i != size - 2; i++) {
+      for (size_t i = 0; i != size - kDimDValueBNdFNz; i++) {
         dst_shape.push_back(src_shape[i]);
         times *= src_shape[i];
       }
-      dst_shape.push_back(Ceil(src_shape[size - 1], w0));
-      dst_shape.push_back(Ceil(src_shape[size - 2], h0));
+      dst_shape.push_back(Ceil(src_shape[size - kNdDimCountBackwardsW], w0));
+      dst_shape.push_back(Ceil(src_shape[size - kNdDimCountBackwardsWH], h0));
       dst_shape.push_back(h0);
       dst_shape.push_back(w0);
       hw_shape.push_back(times);
-      hw_shape.push_back(src_shape[size - 2]);
-      hw_shape.push_back(src_shape[size - 1]);
+      hw_shape.push_back(src_shape[size - kNdDimCountBackwardsWH]);
+      hw_shape.push_back(src_shape[size - kNdDimCountBackwardsW]);
       if (!IsShapeValid(dst_shape)) {
         GELOGE(PARAM_INVALID, "Failed to check dst shape %s", ShapeToString(dst_shape).c_str());
         return PARAM_INVALID;
@@ -128,16 +146,16 @@ Status TransFormatFromNdToFracNz(const TransArgs &args, TransResult &result, con
   }
 
   // src&dst_shape can be written as times*H*W & times*W1*H1*H0*W0, respectively. dst_shape_size >= kDimNum4D
-  auto times = hw_shape.at(0);
-  auto h = hw_shape.at(1);
-  auto w = hw_shape.at(2);
+  auto times = hw_shape.at(kNdDimIndexN);
+  auto h = hw_shape.at(kNdDimIndexH);
+  auto w = hw_shape.at(kNdDimIndexW);
   auto hw = h * w;
 
   auto shape_size = args.dst_shape.size();
-  auto w1 = args.dst_shape[shape_size - 4];
-  auto h1 = args.dst_shape[shape_size - 3];
-  auto h0 = args.dst_shape[shape_size - 2];
-  auto w0 = args.dst_shape[shape_size - 1];
+  auto w1 = args.dst_shape[shape_size - kFNzDimCountBackwardsW0H0H1W1];
+  auto h1 = args.dst_shape[shape_size - kFNzDimCountBackwardsW0H0H1];
+  auto h0 = args.dst_shape[shape_size - kFNzDimCountBackwardsW0H0];
+  auto w0 = args.dst_shape[shape_size - kFNzDimCountBackwardsW0];
   auto h1h0 = h1 * h0;
   auto h1h0w0 = h1h0 * w0;
   auto w1h1h0w0 = w1 * h1h0w0;
@@ -198,16 +216,16 @@ Status TransFormatFromFracNzToNd(const TransArgs &args, TransResult &result, con
     return OUT_OF_MEMORY;
   }
 
-  auto times = dst_hw_shape.at(0);
-  auto h = dst_hw_shape.at(1);
-  auto w = dst_hw_shape.at(2);
+  auto times = dst_hw_shape.at(kNdDimIndexN);
+  auto h = dst_hw_shape.at(kNdDimIndexH);
+  auto w = dst_hw_shape.at(kNdDimIndexW);
   auto hw = h * w;
 
   auto shape_size = args.src_shape.size();
-  auto w1 = args.src_shape[shape_size - 4];
-  auto h1 = args.src_shape[shape_size - 3];
-  auto h0 = args.src_shape[shape_size - 2];
-  auto w0 = args.src_shape[shape_size - 1];
+  auto w1 = args.src_shape[shape_size - kFNzDimCountBackwardsW0H0H1W1];
+  auto h1 = args.src_shape[shape_size - kFNzDimCountBackwardsW0H0H1];
+  auto h0 = args.src_shape[shape_size - kFNzDimCountBackwardsW0H0];
+  auto w0 = args.src_shape[shape_size - kFNzDimCountBackwardsW0];
   auto h1h0 = h1 * h0;
   auto h1h0w0 = h1h0 * w0;
   auto w1h1h0w0 = w1 * h1h0w0;
diff --git a/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc b/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc
index d890e681..88603d5c 100755
--- a/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc
+++ b/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc
@@ -23,12 +23,29 @@
 #include "common/formats/utils/formats_trans_utils.h"
 #include "framework/common/debug/ge_log.h"
 #include "framework/common/debug/log.h"
+#include "framework/common/types.h"
 #include "graph/utils/type_utils.h"
 
 namespace ge {
 namespace formats {
 namespace {
 const int kDimSize4D = 4;
+
+const size_t kSingleDim = 1;
+
+const size_t kNdDimIndexN = 0;
+const size_t kNdDimIndexH = 1;
+const size_t kNdDimIndexW = 2;
+
+const size_t kDimDValueBNdFZz = 2;  // dim d-value between Nd and FractalZz
+
+const size_t kNdDimCountBackwardsW = 1;
+const size_t kNdDimCountBackwardsWH = 2;
+
+const size_t kFZzDimCountBackwardsW0 = 1;
+const size_t kFZzDimCountBackwardsW0H0 = 2;
+const size_t kFZzDimCountBackwardsW0H0W1 = 3;
+const size_t kFZzDimCountBackwardsW0H0W1H1 = 4;
 bool IsDataTypeSupport(DataType d_type) { return GetSizeByDataType(d_type) > 0; }
 
 using ShapeVector = std::vector<int64_t>;
@@ -40,8 +57,8 @@ bool CheckShape(Format format, const ShapeVector &shape) {
     case FORMAT_NHWC:
       return CheckShapeValid(shape, kDimSize4D);
     default:
-      std::string error = "Trans format between " +  FmtToStr(TypeUtils::FormatToSerialString(format)) +
-          " and FORMAT_FRACTAL_ZZ is not supported.";
+      std::string error = "Trans format between " + FmtToStr(TypeUtils::FormatToSerialString(format)) +
+                          " and FORMAT_FRACTAL_ZZ is not supported.";
       GE_ERRORLOG_AND_ERRORMSG(PARAM_INVALID, error.c_str());
       return false;
   }
@@ -60,14 +77,14 @@ Status TransShapeToFracZz(const ShapeVector &src_shape, DataType data_type, Shap
   auto w0 = GetCubeSizeByDataType(data_type);
   auto h0 = GetCubeSizeByDataType(data_type);
   switch (src_shape.size()) {
-    case 1:
-      dst_shape.push_back(1);
-      dst_shape.push_back(Ceil(src_shape[0], w0));
+    case kSingleDim:
+      dst_shape.push_back(DIM_DEFAULT_VALUE);
+      dst_shape.push_back(Ceil(src_shape[kNdDimIndexN], w0));
       dst_shape.push_back(h0);
       dst_shape.push_back(w0);
-      hw_shape.push_back(1);
-      hw_shape.push_back(1);
-      hw_shape.push_back(src_shape[0]);
+      hw_shape.push_back(DIM_DEFAULT_VALUE);
+      hw_shape.push_back(DIM_DEFAULT_VALUE);
+      hw_shape.push_back(src_shape[kNdDimIndexN]);
       if (!IsShapeValid(dst_shape)) {
         GELOGE(PARAM_INVALID, "Failed to check dst shape %s", ShapeToString(dst_shape).c_str());
         return PARAM_INVALID;
@@ -76,17 +93,17 @@ Status TransShapeToFracZz(const ShapeVector &src_shape, DataType data_type, Shap
     default:
       auto size = src_shape.size();
       int64_t times = 1;
-      for (size_t i = 0; i != size - 2; i++) {
+      for (size_t i = 0; i != size - kDimDValueBNdFZz; i++) {
         dst_shape.push_back(src_shape[i]);
         times *= src_shape[i];
       }
-      dst_shape.push_back(Ceil(src_shape[size - 2], h0));
-      dst_shape.push_back(Ceil(src_shape[size - 1], w0));
+      dst_shape.push_back(Ceil(src_shape[size - kNdDimCountBackwardsWH], h0));
+      dst_shape.push_back(Ceil(src_shape[size - kNdDimCountBackwardsW], w0));
       dst_shape.push_back(h0);
       dst_shape.push_back(w0);
       hw_shape.push_back(times);
-      hw_shape.push_back(src_shape[size - 2]);
-      hw_shape.push_back(src_shape[size - 1]);
+      hw_shape.push_back(src_shape[size - kNdDimCountBackwardsWH]);
+      hw_shape.push_back(src_shape[size - kNdDimCountBackwardsW]);
       if (!IsShapeValid(dst_shape)) {
         GELOGE(PARAM_INVALID, "Failed to check dst shape %s", ShapeToString(dst_shape).c_str());
         return PARAM_INVALID;
@@ -127,16 +144,16 @@ Status TransFormatFromNdToFracZz(const TransArgs &args, TransResult &result, con
     return OUT_OF_MEMORY;
   }
   // The src&dst_shape can be written as times*H*W & times*H1*W1*H0*W0, respectively. dst_shape_size >= kDimNum4D
-  auto times = hw_shape.at(0);
-  auto h = hw_shape.at(1);
-  auto w = hw_shape.at(2);
+  auto times = hw_shape.at(kNdDimIndexN);
+  auto h = hw_shape.at(kNdDimIndexH);
+  auto w = hw_shape.at(kNdDimIndexW);
   auto hw = h * w;
 
   auto shape_size = args.dst_shape.size();
-  auto h1 = args.dst_shape[shape_size - 4];
-  auto w1 = args.dst_shape[shape_size - 3];
-  auto h0 = args.dst_shape[shape_size - 2];
-  auto w0 = args.dst_shape[shape_size - 1];
+  auto h1 = args.dst_shape[shape_size - kFZzDimCountBackwardsW0H0W1H1];
+  auto w1 = args.dst_shape[shape_size - kFZzDimCountBackwardsW0H0W1];
+  auto h0 = args.dst_shape[shape_size - kFZzDimCountBackwardsW0H0];
+  auto w0 = args.dst_shape[shape_size - kFZzDimCountBackwardsW0];
   auto h0w0 = h0 * w0;
   auto w1h0w0 = w1 * h0w0;
   auto h1w1h0w0 = h1 * w1h0w0;
@@ -155,8 +172,8 @@ Status TransFormatFromNdToFracZz(const TransArgs &args, TransResult &result, con
           auto src_offset = (src_h_head + w1_idx * w0) * size;
           auto dst_offset = (h0_head + w1_idx * h0w0) * size;
           auto protected_size = dst_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                    ? dst_size - dst_offset
-                                    : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                  ? dst_size - dst_offset
+                                  : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
           auto ret = memcpy_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), args.data + src_offset,
                               static_cast<size_t>(size * w0));
           if (ret != EOK) {
@@ -171,8 +188,8 @@ Status TransFormatFromNdToFracZz(const TransArgs &args, TransResult &result, con
           auto src_offset = (src_h_head + src_w_idx) * size;
           auto dst_offset = (w0_head + w0_idx) * size;
           auto protected_size = dst_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                    ? dst_size - dst_offset
-                                    : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                  ? dst_size - dst_offset
+                                  : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
           auto ret = memcpy_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), args.data + src_offset,
                               static_cast<size_t>(size));
           if (ret != EOK) {
@@ -205,16 +222,16 @@ Status TransFormatFromFracZzToNd(const TransArgs &args, TransResult &result, con
   }
 
   // The src&dst_shape can be written as times*H*W & times*H1*W1*H0*W0, respectively. dst_shape_size >= kDimNum4D
-  auto times = dst_hw_shape.at(0);
-  auto h = dst_hw_shape.at(1);
-  auto w = dst_hw_shape.at(2);
+  auto times = dst_hw_shape.at(kNdDimIndexN);
+  auto h = dst_hw_shape.at(kNdDimIndexH);
+  auto w = dst_hw_shape.at(kNdDimIndexW);
   auto hw = h * w;
 
   auto shape_size = args.src_shape.size();
-  auto h1 = args.src_shape[shape_size - 4];
-  auto w1 = args.src_shape[shape_size - 3];
-  auto h0 = args.src_shape[shape_size - 2];
-  auto w0 = args.src_shape[shape_size - 1];
+  auto h1 = args.src_shape[shape_size - kFZzDimCountBackwardsW0H0W1H1];
+  auto w1 = args.src_shape[shape_size - kFZzDimCountBackwardsW0H0W1];
+  auto h0 = args.src_shape[shape_size - kFZzDimCountBackwardsW0H0];
+  auto w0 = args.src_shape[shape_size - kFZzDimCountBackwardsW0];
   auto h0w0 = h0 * w0;
   auto w1h0w0 = w1 * h0w0;
   auto h1w1h0w0 = h1 * w1h0w0;
@@ -233,8 +250,8 @@ Status TransFormatFromFracZzToNd(const TransArgs &args, TransResult &result, con
           auto src_offset = (h0_head + w1_idx * h0w0) * size;
           auto dst_offset = (dst_h_head + w1_idx * w0) * size;
           auto protected_size = dst_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                    ? dst_size - dst_offset
-                                    : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                  ? dst_size - dst_offset
+                                  : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
           auto ret = memcpy_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), args.data + src_offset,
                               static_cast<size_t>(size * w0));
           if (ret != EOK) {
@@ -249,8 +266,8 @@ Status TransFormatFromFracZzToNd(const TransArgs &args, TransResult &result, con
           auto dst_w_idx = w1_head + w0_idx;
           auto dst_offset = (dst_h_head + dst_w_idx) * size;
           auto protected_size = dst_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                    ? dst_size - dst_offset
-                                    : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                  ? dst_size - dst_offset
+                                  : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
           auto ret = memcpy_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), args.data + src_offset,
                               static_cast<size_t>(size));
           if (ret != EOK) {
diff --git a/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc b/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc
index a66aeeb4..49b19f46 100644
--- a/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc
+++ b/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc
@@ -35,7 +35,6 @@
  *      Padding to (N, ceil(Z/16)*16)
  *  Last Step: View the (N, ceil(Z/16)*16) as 4D (N/16, 16, C/16, 16) and transpose to (C/16, N/16, 16, 16)
  */
-
 namespace ge {
 namespace formats {
 namespace {
diff --git a/ge/common/formats/format_transfers/format_transfer_transpose.cc b/ge/common/formats/format_transfers/format_transfer_transpose.cc
index e623d9e7..9be74b1f 100755
--- a/ge/common/formats/format_transfers/format_transfer_transpose.cc
+++ b/ge/common/formats/format_transfers/format_transfer_transpose.cc
@@ -19,6 +19,7 @@
 #include <securec.h>
 #include <memory>
 
+#include "common/formats/utils/formats_definitions.h"
 #include "common/formats/utils/formats_trans_utils.h"
 #include "framework/common/debug/ge_log.h"
 #include "framework/common/debug/log.h"
@@ -29,21 +30,21 @@ namespace formats {
 namespace {
 std::map<Format, std::map<Format, std::vector<int64_t>>> perm_args{
     {FORMAT_NCHW,
-     {{FORMAT_NHWC, std::vector<int64_t>({0, 2, 3, 1})},
-      {FORMAT_HWCN, std::vector<int64_t>({2, 3, 1, 0})},
-      {FORMAT_CHWN, std::vector<int64_t>({1, 2, 3, 0})}}},
+     {{FORMAT_NHWC, std::vector<int64_t>({kNchwN, kNchwH, kNchwW, kNchwC})},
+      {FORMAT_HWCN, std::vector<int64_t>({kNchwH, kNchwW, kNchwC, kNchwN})},
+      {FORMAT_CHWN, std::vector<int64_t>({kNchwC, kNchwH, kNchwW, kNchwN})}}},
     {FORMAT_NHWC,
-     {{FORMAT_NCHW, std::vector<int64_t>({0, 3, 1, 2})},
-      {FORMAT_CHWN, std::vector<int64_t>({3, 1, 2, 0})},
-      {FORMAT_HWCN, std::vector<int64_t>({1, 2, 3, 0})}}},
+     {{FORMAT_NCHW, std::vector<int64_t>({kNhwcN, kNhwcC, kNhwcH, kNhwcW})},
+      {FORMAT_CHWN, std::vector<int64_t>({kNhwcC, kNhwcH, kNhwcW, kNhwcN})},
+      {FORMAT_HWCN, std::vector<int64_t>({kNhwcH, kNhwcW, kNhwcC, kNhwcN})}}},
     {FORMAT_HWCN,
-     {{FORMAT_NCHW, std::vector<int64_t>({3, 2, 0, 1})},
-      {FORMAT_NHWC, std::vector<int64_t>({3, 0, 1, 2})},
-      {FORMAT_CHWN, std::vector<int64_t>({2, 0, 1, 3})}}},
+     {{FORMAT_NCHW, std::vector<int64_t>({kHwcnN, kHwcnC, kHwcnH, kHwcnW})},
+      {FORMAT_NHWC, std::vector<int64_t>({kHwcnN, kHwcnH, kHwcnW, kHwcnC})},
+      {FORMAT_CHWN, std::vector<int64_t>({kHwcnC, kHwcnH, kHwcnW, kHwcnN})}}},
     {FORMAT_CHWN,
-     {{FORMAT_NCHW, std::vector<int64_t>({3, 0, 1, 2})},
-      {FORMAT_NHWC, std::vector<int64_t>({3, 1, 2, 0})},
-      {FORMAT_HWCN, std::vector<int64_t>({1, 2, 0, 3})}}},
+     {{FORMAT_NCHW, std::vector<int64_t>({kChwnN, kChwnC, kChwnH, kChwnW})},
+      {FORMAT_NHWC, std::vector<int64_t>({kChwnN, kChwnH, kChwnW, kChwnC})},
+      {FORMAT_HWCN, std::vector<int64_t>({kChwnH, kChwnW, kChwnC, kChwnN})}}},
 };
 
 bool IsShapeArgValid(const std::vector<int64_t> &src_shape, const std::vector<int64_t> &perm_arg) {
diff --git a/ge/common/formats/utils/formats_definitions.h b/ge/common/formats/utils/formats_definitions.h
index 7f873f1b..25f36d6a 100755
--- a/ge/common/formats/utils/formats_definitions.h
+++ b/ge/common/formats/utils/formats_definitions.h
@@ -23,6 +23,7 @@ static const int kCubeSize = 16;
 static const int kNiSize = 16;
 static const int64_t kShapeItemNumMAX = 1024UL * 1024UL * 1024UL * 1024UL;
 
+
 enum NchwDimIndex {
   kNchwN,
   kNchwC,
@@ -47,6 +48,14 @@ enum HwcnDimIndex {
   kHwcnDimsNum
 };
 
+enum ChwnDimIndex {
+  kChwnC,
+  kChwnH,
+  kChwnW,
+  kChwnN,
+  kChwnDimsNum
+};
+
 enum Nc1hwc0DimIndex {
   kNc1hwc0N,
   kNc1hwc0C1,
diff --git a/ge/common/ge/datatype_util.cc b/ge/common/ge/datatype_util.cc
index 15234768..c051fe1d 100755
--- a/ge/common/ge/datatype_util.cc
+++ b/ge/common/ge/datatype_util.cc
@@ -62,6 +62,7 @@ std::map<ge::DataType, ge::proto::DataType> g_dump_data_type_map = {
     {ge::DT_RESOURCE, ge::proto::DT_RESOURCE},
     {ge::DT_STRING_REF, ge::proto::DT_STRING_REF},
     {ge::DT_STRING, ge::proto::DT_STRING},
+    {ge::DT_VARIANT, ge::proto::DT_VARIANT},
 };
 }  // namespace
 
diff --git a/ge/common/ge/plugin_manager.cc b/ge/common/ge/plugin_manager.cc
index 7bb1310c..75a36d99 100644
--- a/ge/common/ge/plugin_manager.cc
+++ b/ge/common/ge/plugin_manager.cc
@@ -123,7 +123,10 @@ Status PluginManager::LoadSo(const string &path, const vector<string> &func_chec
     if (handle == nullptr) {
       const char *error = mmDlerror();
       GE_IF_BOOL_EXEC(error == nullptr, error = "");
-      GELOGE(GE_PLGMGR_PATH_INVALID, "Failed to dlopen %s!", error);
+      ErrorManager::GetInstance().ATCReportErrMessage("E19012", {"function", "reason"},
+          {"mmDlopen", "shared library path is " + FmtToStr(file_path_dlopen) + ". Errormessage" + FmtToStr(error)});
+      GELOGE(GE_PLGMGR_PATH_INVALID, "Failed to dlopen the shared library path[%s]. Errormessage[%s]!",
+             file_path_dlopen.c_str(), error);
       continue;
     }
 
@@ -132,6 +135,9 @@ Status PluginManager::LoadSo(const string &path, const vector<string> &func_chec
     for (const auto &func_name : func_check_list) {
       auto real_fn = (void (*)())mmDlsym(handle, const_cast<char *>(func_name.c_str()));
       if (real_fn == nullptr) {
+        ErrorManager::GetInstance().ATCReportErrMessage("E19012", {"function", "reason"},
+            {"mmDlsym", FmtToStr(func_name) + " is skipped since function" +
+            FmtToStr(func_name) + " is not existed!"});
         GELOGE(GE_PLGMGR_PATH_INVALID, "%s is skipped since function %s is not existed!", func_name.c_str(),
                func_name.c_str());
         is_valid = false;
diff --git a/ge/common/ge/tbe_plugin_manager.cc b/ge/common/ge/tbe_plugin_manager.cc
index b91f1204..0cc7d553 100755
--- a/ge/common/ge/tbe_plugin_manager.cc
+++ b/ge/common/ge/tbe_plugin_manager.cc
@@ -37,6 +37,8 @@
 #include "graph/utils/type_utils.h"
 
 namespace ge {
+const int kBaseInt = 10;
+
 std::map<string, string> TBEPluginManager::options_ = {};
 
 // Get Singleton Instance
@@ -155,7 +157,7 @@ void TBEPluginManager::GetCustomOpPath(std::string &customop_path) {
   domi::FrameworkType type = domi::TENSORFLOW;
   auto it = options_.find(FRAMEWORK_TYPE);
   if (it != options_.end()) {
-    type = static_cast<domi::FrameworkType>(std::strtol(it->second.c_str(), nullptr, 10));
+    type = static_cast<domi::FrameworkType>(std::strtol(it->second.c_str(), nullptr, kBaseInt));
   }
   fmk_type = ge::TypeUtils::FmkTypeToSerialString(type);
   GELOGI("Framework type is %s.", fmk_type.c_str());
@@ -179,12 +181,19 @@ void TBEPluginManager::GetCustomOpPath(std::string &customop_path) {
 void TBEPluginManager::LoadCustomOpLib() {
   LoadPluginSo(options_);
 
+  std::string fmk_type = std::to_string(domi::TENSORFLOW);
+  auto it = options_.find(ge::FRAMEWORK_TYPE);
+  if (it != options_.end()) {
+   fmk_type = it->second;
+  }
   std::vector<OpRegistrationData> registration_datas = domi::OpRegistry::Instance()->registrationDatas;
   GELOGI("The size of registration_datas is: %zu", registration_datas.size());
   for (OpRegistrationData reg_data : registration_datas) {
-    GELOGD("Begin to register optype: %s, imply_type: %s", reg_data.GetOmOptype().c_str(),
-           TypeUtils::ImplyTypeToSerialString(reg_data.GetImplyType()).c_str());
-    domi::OpRegistry::Instance()->Register(reg_data);
+    if (std::to_string(reg_data.GetFrameworkType()) == fmk_type) {
+      GELOGD("Begin to register optype: %s, imply_type: %s", reg_data.GetOmOptype().c_str(),
+             TypeUtils::ImplyTypeToSerialString(reg_data.GetImplyType()).c_str());
+      (void)domi::OpRegistry::Instance()->Register(reg_data);
+    }
   }
 }
 
diff --git a/ge/common/ge_common.mk b/ge/common/ge_common.mk
index 3fffd203..e28090ad 100755
--- a/ge/common/ge_common.mk
+++ b/ge/common/ge_common.mk
@@ -7,6 +7,7 @@ GE_COMMON_LOCAL_SRC_FILES := \
     helper/om_file_helper.cc \
     helper/model_helper.cc \
     ../model/ge_model.cc \
+    ../model/ge_root_model.cc \
     auth/file_saver.cc \
     fp16_t.cc \
     math/fp16_math.cc \
diff --git a/ge/common/helper/model_helper.cc b/ge/common/helper/model_helper.cc
index 6f201461..1d5a4a9b 100644
--- a/ge/common/helper/model_helper.cc
+++ b/ge/common/helper/model_helper.cc
@@ -32,6 +32,7 @@ using domi::ModelTaskDef;
 
 namespace {
 const int64_t kOriginalOmPartitionNum = 1;
+const uint32_t kStatiOmFileModelNum = 1;
 }
 
 
@@ -39,7 +40,7 @@ namespace ge {
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelHelper::~ModelHelper() { (void)ReleaseLocalModelData(); }
 
 Status ModelHelper::SaveModelPartition(std::shared_ptr<OmFileSaveHelper> &om_file_save_helper, ModelPartitionType type,
-                                       const uint8_t *data, size_t size) {
+                                       const uint8_t *data, size_t size, size_t model_index) {
   if (size < 1 || size > UINT32_MAX) {
     GELOGE(PARAM_INVALID, "Add model partition failed, partition size %zu invalid", size);
     if (size > UINT32_MAX) {
@@ -68,25 +69,58 @@ Status ModelHelper::SaveModelPartition(std::shared_ptr<OmFileSaveHelper> &om_fil
   partition_model.data = const_cast<uint8_t *>(data);
   partition_model.size = static_cast<uint32_t>(size);
   partition_model.type = type;
-  if (om_file_save_helper->AddPartition(partition_model) != SUCCESS) {
+  if (om_file_save_helper->AddPartition(partition_model, model_index) != SUCCESS) {
     GELOGE(PARAM_INVALID, "Add model partition failed, partition size %zu", size);
     return PARAM_INVALID;
   }
   return SUCCESS;
 }
 
-FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmModel(const GeModelPtr &ge_model,
-                                                                                   const SaveParam &save_param,
-                                                                                   const std::string &output_file,
-                                                                                   ModelBufferData& model) {
-  if (output_file.empty()) {
-    GELOGE(FAILED, "GraphBuilder SaveModel received invalid file name prefix");
+Status ModelHelper::SaveSizeToModelDef(const GeModelPtr &ge_model) {
+  vector<int64_t> om_info;
+  ModelPtr model_tmp = ge::MakeShared<ge::Model>(ge_model->GetName(), ge_model->GetPlatformVersion());
+  if (model_tmp == nullptr) {
+    GELOGE(FAILED, "Create Model %s Ptr failed", ge_model->GetName().c_str());
     return FAILED;
   }
+  model_tmp->SetGraph(ge_model->GetGraph());
+  model_tmp->SetVersion(ge_model->GetVersion());
+  model_tmp->SetAttr(ge_model->MutableAttrMap());
+  ge::Buffer model_buffer;
+  (void)model_tmp->Save(model_buffer);
+  GELOGD("SaveSizeToModelDef modeldef_size is %zu", model_buffer.GetSize());
+  om_info.push_back(model_buffer.GetSize());
 
-  GE_IF_BOOL_EXEC(ge_model == nullptr, GELOGE(FAILED, "Ge_model is nullptr"); return FAILED);
-  std::shared_ptr<OmFileSaveHelper> om_file_save_helper = ge::MakeShared<OmFileSaveHelper>();
-  GE_CHECK_NOTNULL(om_file_save_helper);
+  auto ge_model_weight = ge_model->GetWeight();
+  GELOGD("SaveSizeToModelDef weight_data_size is %zu, %p", ge_model_weight.GetSize(), ge_model_weight.GetData());
+  om_info.push_back(ge_model_weight.GetSize());
+
+  TBEKernelStore tbe_kernel_store = ge_model->GetTBEKernelStore();
+  GELOGD("SaveSizeToModelDef tbe_kernels_size is %zu", tbe_kernel_store.DataSize());
+  om_info.push_back(tbe_kernel_store.DataSize());
+
+  CustAICPUKernelStore cust_aicpu_kernel_store = ge_model->GetCustAICPUKernelStore();
+  GELOGD("SaveSizeToModelDef cust aicpu kernels size is %zu", cust_aicpu_kernel_store.DataSize());
+  om_info.push_back(cust_aicpu_kernel_store.DataSize());
+
+  std::shared_ptr<ModelTaskDef> model_task_def = ge_model->GetModelTaskDefPtr();
+  if (model_task_def == nullptr) {
+    GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Create model task def ptr failed");
+    return ACL_ERROR_GE_MEMORY_ALLOCATION;
+  }
+  size_t partition_task_size = model_task_def->ByteSizeLong();
+  GELOGD("SaveSizeToModelDef task_info_size is %zu", partition_task_size);
+  om_info.push_back(partition_task_size);
+
+  GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(*(ge_model.get()), "om_info_list", om_info),
+                   GELOGE(FAILED, "SetListInt of om_info_list failed.");
+                   return FAILED);
+
+  return SUCCESS;
+}
+
+Status ModelHelper::SaveModelDef(std::shared_ptr<OmFileSaveHelper> &om_file_save_helper,
+                                 const GeModelPtr &ge_model, ge::Buffer &model_buffer, size_t model_index) {
   ModelPtr model_tmp = ge::MakeShared<ge::Model>(ge_model->GetName(), ge_model->GetPlatformVersion());
   if (model_tmp == nullptr) {
     GELOGE(FAILED, "Create Model %s Ptr failed", ge_model->GetName().c_str());
@@ -95,17 +129,26 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmMod
   model_tmp->SetGraph(ge_model->GetGraph());
   model_tmp->SetVersion(ge_model->GetVersion());
   model_tmp->SetAttr(ge_model->MutableAttrMap());
+  Status ret = SaveSizeToModelDef(ge_model);
+  if (ret != SUCCESS) {
+    GELOGE(ret, "SaveSizeToModelDef failed");
+    return ret;
+  }
 
-  ge::Buffer model_buffer;
   (void)model_tmp->Save(model_buffer);
   GELOGD("MODEL_DEF size is %zu", model_buffer.GetSize());
   if (model_buffer.GetSize() > 0) {
     if (SaveModelPartition(om_file_save_helper, ModelPartitionType::MODEL_DEF, model_buffer.GetData(),
-                           model_buffer.GetSize()) != SUCCESS) {
+                           model_buffer.GetSize(), model_index) != SUCCESS) {
       GELOGE(PARAM_INVALID, "Add model graph partition failed");
       return PARAM_INVALID;
     }
   }
+  return SUCCESS;
+}
+
+Status ModelHelper::SaveModelWeights(std::shared_ptr<OmFileSaveHelper> &om_file_save_helper,
+                                     const GeModelPtr &ge_model, size_t model_index) {
   auto ge_model_weight = ge_model->GetWeight();
   GELOGD("WEIGHTS_DATA size is %zu, %p", ge_model_weight.GetSize(), ge_model_weight.GetData());
   // weight is not necessary
@@ -113,31 +156,43 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmMod
     GE_CHK_STATUS_RET(SaveModelPartition(om_file_save_helper,
                                          ModelPartitionType::WEIGHTS_DATA,
                                          ge_model_weight.GetData(),
-                                         ge_model_weight.GetSize()), "Add weight partition failed");
+                                         ge_model_weight.GetSize(), model_index), "Add weight partition failed");
   }
+  return SUCCESS;
+}
 
+Status ModelHelper::SaveModelTbeKernel(std::shared_ptr<OmFileSaveHelper> &om_file_save_helper,
+                                       const GeModelPtr &ge_model, size_t model_index) {
   TBEKernelStore tbe_kernel_store = ge_model->GetTBEKernelStore();
   GELOGD("TBE_KERNELS size is %zu", tbe_kernel_store.DataSize());
   if (tbe_kernel_store.DataSize() > 0) {
-    GE_CHK_STATUS_RET(SaveModelPartition(om_file_save_helper,
-                                         ModelPartitionType::TBE_KERNELS,
-                                         tbe_kernel_store.Data(),
-                                         tbe_kernel_store.DataSize()), "Add tbe kernel partition failed");
+    GE_CHK_STATUS_RET(
+        SaveModelPartition(om_file_save_helper, ModelPartitionType::TBE_KERNELS,
+                           ge_model->GetTBEKernelStore().Data(), ge_model->GetTBEKernelStore().DataSize(),
+                           model_index), "Add tbe kernel partition failed");
   }
-
   // no need to check value, DATA->NetOutput
   (void)tbe_kernel_store.Load(tbe_kernel_store.Data(), tbe_kernel_store.DataSize());
 
+  return SUCCESS;
+}
+
+Status ModelHelper::SaveModelCustAICPU(std::shared_ptr<OmFileSaveHelper> &om_file_save_helper,
+                                       const GeModelPtr &ge_model, size_t model_index) {
   CustAICPUKernelStore cust_aicpu_kernel_store = ge_model->GetCustAICPUKernelStore();
   GELOGD("cust aicpu kernels size is %zu", cust_aicpu_kernel_store.DataSize());
   if (cust_aicpu_kernel_store.DataSize() > 0) {
     GE_CHK_STATUS_RET(SaveModelPartition(om_file_save_helper,
                                          ModelPartitionType::CUST_AICPU_KERNELS,
-                                         cust_aicpu_kernel_store.Data(),
-                                         cust_aicpu_kernel_store.DataSize()),
+                                         ge_model->GetCustAICPUKernelStore().Data(),
+                                         cust_aicpu_kernel_store.DataSize(), model_index),
                       "Add cust aicpu kernel partition failed");
   }
+  return SUCCESS;
+}
 
+Status ModelHelper::SaveModelTaskDef(std::shared_ptr<OmFileSaveHelper> &om_file_save_helper,
+                                     const GeModelPtr &ge_model, ge::Buffer &task_buffer, size_t model_index) {
   std::shared_ptr<ModelTaskDef> model_task_def = ge_model->GetModelTaskDefPtr();
   if (model_task_def == nullptr) {
     GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Create model task def ptr failed");
@@ -146,9 +201,9 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmMod
   size_t partition_task_size = model_task_def->ByteSizeLong();
   GE_IF_BOOL_EXEC(partition_task_size == 0 || partition_task_size > INT_MAX,
                   GELOGE(FAILED, "Model_def's byte size (%zu) is invalid!", partition_task_size);
-                  return FAILED);
+                      return FAILED);
 
-  ge::Buffer task_buffer(partition_task_size);
+  task_buffer = ge::Buffer(partition_task_size);
   if (task_buffer.GetSize() == 0) {
     GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Alloc model task def buffer failed");
     return ACL_ERROR_GE_MEMORY_ALLOCATION;
@@ -159,21 +214,28 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmMod
   GELOGD("TASK_INFO size is %zu", partition_task_size);
 
   if (SaveModelPartition(om_file_save_helper, ModelPartitionType::TASK_INFO, task_buffer.GetData(),
-                         partition_task_size) != SUCCESS) {
+                         partition_task_size, model_index) != SUCCESS) {
     GELOGE(PARAM_INVALID, "Add model task def partition failed");
     return PARAM_INVALID;
   }
+  return SUCCESS;
+}
+
+Status ModelHelper::SaveModelHeader(std::shared_ptr<OmFileSaveHelper> &om_file_save_helper,
+                                    const GeModelPtr &ge_model, size_t model_num) {
   // Save target/version to model_header
   ModelFileHeader &model_header = om_file_save_helper->GetModelFileHeader();
   model_header.platform_type = ge_model->GetPlatformType();
   model_header.om_ir_version = ge_model->GetVersion();
+  model_header.model_num = model_num;
   std::string platform_version = ge_model->GetPlatformVersion();
 
   errno_t err;
   err = memcpy_s(model_header.platform_version, PLATFORM_VERSION_LEN, platform_version.c_str(),
                  platform_version.size() + 1);
   if (err != EOK) {
-    GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "ModelHelper SaveModel failed while allocating memory for platform_version.");
+    GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION,
+           "ModelHelper SaveModel failed while allocating memory for platform_version.");
     return ACL_ERROR_GE_MEMORY_ALLOCATION;
   }
   string version = reinterpret_cast<char *>(model_header.platform_version);
@@ -188,8 +250,142 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmMod
   }
   string model_name = reinterpret_cast<char *>(model_header.name);
   GELOGD("Model name save:%s", model_name.c_str());
+  return SUCCESS;
+}
+
+Status ModelHelper::SaveAllModelPartiton(std::shared_ptr<OmFileSaveHelper>& om_file_save_helper,
+                                         const GeModelPtr &ge_model, ge::Buffer &model_buffer,
+                                         ge::Buffer &task_buffer, size_t model_index) {
+  if (SaveModelDef(om_file_save_helper, ge_model, model_buffer, model_index) != SUCCESS) {
+    GELOGE(FAILED, "save model def failed");
+    return FAILED;
+  }
+
+  if (SaveModelWeights(om_file_save_helper, ge_model, model_index) != SUCCESS) {
+    GELOGE(FAILED, "save model weights failed");
+    return FAILED;
+  }
+
+  if (SaveModelTbeKernel(om_file_save_helper, ge_model, model_index) != SUCCESS) {
+    GELOGE(FAILED, "save model tbe kernel failed");
+    return FAILED;
+  }
 
-  Status ret = om_file_save_helper->SaveModel(save_param, output_file.c_str(), model, is_offline_);
+  if (SaveModelCustAICPU(om_file_save_helper, ge_model, model_index) != SUCCESS) {
+    GELOGE(FAILED, "save model cust ai cpu failed");
+    return FAILED;
+  }
+
+
+  if (SaveModelTaskDef(om_file_save_helper, ge_model, task_buffer, model_index) != SUCCESS) {
+    GELOGE(FAILED, "save task def failed");
+    return FAILED;
+  }
+  return SUCCESS;
+}
+
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmModel(const GeModelPtr &ge_model,
+                                                                                   const SaveParam &save_param,
+                                                                                   const std::string &output_file,
+                                                                                   ModelBufferData& model) {
+  if (output_file.empty()) {
+    GELOGE(FAILED, "GraphBuilder SaveModel received invalid file name prefix");
+    return FAILED;
+  }
+
+  GE_IF_BOOL_EXEC(ge_model == nullptr, GELOGE(FAILED, "Ge_model is nullptr"); return FAILED);
+  std::shared_ptr<OmFileSaveHelper> om_file_save_helper = ge::MakeShared<OmFileSaveHelper>();
+  GE_CHECK_NOTNULL(om_file_save_helper);
+  ge::Buffer model_buffer;
+  ge::Buffer task_buffer;
+
+  auto ret = SaveAllModelPartiton(om_file_save_helper, ge_model, model_buffer, task_buffer);
+  if (ret != SUCCESS) {
+    GELOGE(ret, "save all model partition failed");
+    return ret;
+  }
+
+  ret = SaveModelHeader(om_file_save_helper, ge_model);
+  if (ret != SUCCESS) {
+    GELOGE(ret, "save model header failed");
+    return ret;
+  }
+
+  ret = om_file_save_helper->SaveModel(save_param, output_file.c_str(), model, is_offline_);
+  if (ret != SUCCESS) {
+    GELOGE(FAILED, "OmFileSaveHelper SaveModel return fail.");
+    return ret;
+  }
+  return SUCCESS;
+}
+
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::SaveToOmRootModel(
+    const GeRootModelPtr &ge_root_model,
+    const SaveParam &save_param,
+    const std::string &output_file,
+    ModelBufferData& model,
+    bool is_unknown_shape) {
+
+  GE_CHECK_NOTNULL(ge_root_model);
+  GE_IF_BOOL_EXEC(ge_root_model == nullptr, GELOGE(FAILED, "Ge_root_model is nullptr"); return FAILED);
+
+  auto &name_to_ge_model = ge_root_model->GetSubgraphInstanceNameToModel();
+  GE_IF_BOOL_EXEC(name_to_ge_model.empty(), GELOGE(FAILED, "Ge_root_model has no sub model"); return FAILED);
+  GE_IF_BOOL_EXEC(output_file.empty(),
+                  GELOGE(FAILED, "GraphBuilder SaveModel received invalid file name prefix");
+                  return FAILED);
+
+  if (!is_unknown_shape) {
+    auto &model_root = name_to_ge_model.begin()->second;
+    return SaveToOmModel(model_root, save_param, output_file, model);
+  }
+
+  std::shared_ptr<OmFileSaveHelper> om_file_save_helper = ge::MakeShared<OmFileSaveHelper>();
+  GE_CHECK_NOTNULL(om_file_save_helper);
+
+  auto &first_ge_model = name_to_ge_model.at(ge_root_model->GetRootGraph()->GetName());
+
+  // ge root model must be the first to be loaded
+  vector<string> model_names{ge_root_model->GetRootGraph()->GetName()};
+  for (auto &item : name_to_ge_model) {
+    if (item.first != model_names.front()) {
+      model_names.emplace_back(item.first);
+    }
+  }
+
+  vector<ge::Buffer> model_buffers(model_names.size());
+  vector<ge::Buffer> task_buffers(model_names.size());
+
+  size_t cur_index = 0;
+
+  if (model_names.size() > 1) {
+    GELOGD("only save first model MODEL_DEF");
+    if (SaveModelDef(om_file_save_helper, first_ge_model, model_buffers[cur_index], cur_index) != SUCCESS) {
+      GELOGE(FAILED, "save model def failed");
+      return FAILED;
+    }
+    ++cur_index;
+  }
+
+  for (; cur_index < model_names.size(); ++cur_index) {
+    auto model_name = model_names[cur_index];
+    GELOGD("cur model %s index is %zu", model_name.c_str(), cur_index);
+    const GeModelPtr &ge_model = name_to_ge_model.at(model_name);
+    auto ret = SaveAllModelPartiton(om_file_save_helper, ge_model, model_buffers[cur_index],
+                                    task_buffers[cur_index], cur_index);
+    if (ret != SUCCESS) {
+      GELOGE(INTERNAL_ERROR, "Save model %s failed", model_name.c_str());
+      return INTERNAL_ERROR;
+    }
+  }
+
+  auto ret = SaveModelHeader(om_file_save_helper, first_ge_model, model_names.size());
+  if (ret != SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "Save model %s header failed", first_ge_model->GetName().c_str());
+    return INTERNAL_ERROR;
+  }
+
+  ret = om_file_save_helper->SaveRootModel(save_param, output_file.c_str(), model, is_offline_);
   if (ret != SUCCESS) {
     GELOGE(FAILED, "OmFileSaveHelper SaveModel return fail.");
     return FAILED;
@@ -288,7 +484,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::LoadModel(c
   }
 
   file_header_ = reinterpret_cast<ModelFileHeader *>(model_data.model_data);
-
   OmFileLoadHelper om_load_helper;
   status = om_load_helper.Init(model_addr_tmp_, model_len_tmp_);
   if (status != SUCCESS) {
@@ -310,7 +505,61 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::LoadModel(c
     GELOGE(status, "GenerateGeModel failed");
     return status;
   }
+  GELOGD("in ModelHelper::LoadModel, is_assign_model_ is setted to true!");
+  is_assign_model_ = true;
+  return SUCCESS;
+}
+
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::LoadRootModel(const ge::ModelData &model_data) {
+  if (model_data.model_data == nullptr || model_data.model_len == 0) {
+    GELOGE(GE_EXEC_MODEL_DATA_SIZE_INVALID, "Model_data is nullptr, or model_data_size is 0");
+    return GE_EXEC_MODEL_DATA_SIZE_INVALID;
+  }
+
+  if (is_assign_model_) {
+    GELOGE(GE_EXEC_LOAD_MODEL_REPEATED, "Model helper has already loaded!");
+    return GE_EXEC_LOAD_MODEL_REPEATED;
+  }
+
+  if (ReleaseLocalModelData() != SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "ReleaseLocalModelData failed.");
+    return INTERNAL_ERROR;
+  }
+
+  Status status = ge::DavinciModelParser::ParseModelContent(model_data, model_addr_tmp_, model_len_tmp_);
+  if (status != SUCCESS) {
+    GELOGE(status, "Parse model content failed!");
+    return status;
+  }
+
+  file_header_ = reinterpret_cast<ModelFileHeader *>(model_data.model_data);
+
+  //model verison 1.0 file header does not have model_num member
+  is_unknown_shape_model_ = file_header_->version >= ge::MODEL_VERSION &&
+                            file_header_->model_num > kStatiOmFileModelNum;
+  GELOGD("cur om model is ge root model or no %d, model version %zu", is_unknown_shape_model_, file_header_->version);
+
+  OmFileLoadHelper om_load_helper;
+  if (is_unknown_shape_model_) {
+    auto model_num = file_header_->model_num;
+    status = om_load_helper.Init(model_addr_tmp_, model_len_tmp_, model_num);
+  } else {
+    status = om_load_helper.Init(model_addr_tmp_, model_len_tmp_);
+  }
+  if (status != SUCCESS) {
+    GELOGE(status, "Om_load_helper init failed");
+    model_addr_tmp_ = nullptr;
+    return status;
+  }
+  // Encrypt model need to del temp model/no encrypt model don't need to del model
+  model_addr_tmp_ = nullptr;
 
+  status = GenerateGeRootModel(om_load_helper);
+  if (status != SUCCESS) {
+    GELOGE(status, "GenerateGeRootModel failed");
+    return status;
+  }
+  GELOGD("in ModelHelper::LoadRootModel, is_assign_model_ is setted to true!");
   is_assign_model_ = true;
   return SUCCESS;
 }
@@ -341,6 +590,61 @@ Status ModelHelper::GenerateGeModel(OmFileLoadHelper &om_load_helper) {
   return SUCCESS;
 }
 
+Status ModelHelper::GenerateGeRootModel(OmFileLoadHelper &om_load_helper) {
+  GELOGD("Begin to generate ge root model");
+  root_model_ = ge::MakeShared<ge::GeRootModel>();
+  GE_CHECK_NOTNULL(root_model_);
+  if (!is_unknown_shape_model_) {
+    if (GenerateGeModel(om_load_helper) != SUCCESS) {
+      GELOGE(FAILED, "GenerateGeModel failed");
+      return FAILED;
+    }
+    GE_CHECK_NOTNULL(model_);
+    root_model_->SetRootGraph(GraphUtils::GetComputeGraph(model_->GetGraph()));
+    return SUCCESS;
+  }
+
+  bool is_first_model = true;
+  for (size_t mode_index = 0;  mode_index < file_header_->model_num; ++mode_index) {
+    GeModelPtr cur_model = ge::MakeShared<ge::GeModel>();
+    Status ret = LoadModelData(om_load_helper, cur_model, mode_index);
+    if (ret != SUCCESS) {
+      return GE_EXEC_LOAD_MODEL_PARTITION_FAILED;
+    }
+
+    if (is_first_model) {
+      is_first_model = false;
+      root_model_->SetRootGraph(GraphUtils::GetComputeGraph(cur_model->GetGraph()));
+      root_model_->SetModelId(cur_model->GetModelId());
+      model_ = cur_model;
+      continue;
+    }
+
+    ret = LoadWeights(om_load_helper, cur_model, mode_index);
+    if (ret != SUCCESS) {
+      return GE_EXEC_LOAD_WEIGHT_PARTITION_FAILED;
+    }
+
+    ret = LoadTBEKernelStore(om_load_helper, cur_model, mode_index);
+    if (ret != SUCCESS) {
+      return GE_EXEC_LOAD_KERNEL_PARTITION_FAILED;
+    }
+
+    ret = LoadCustAICPUKernelStore(om_load_helper, cur_model, mode_index);
+    if (ret != SUCCESS) {
+      return GE_EXEC_LOAD_KERNEL_PARTITION_FAILED;
+    }
+
+    ret = LoadTask(om_load_helper, cur_model, mode_index);
+    if (ret != SUCCESS) {
+      return GE_EXEC_LOAD_TASK_PARTITION_FAILED;
+    }
+    root_model_->SetSubgraphInstanceNameToModel(cur_model->GetName(), cur_model);
+  }
+
+  return SUCCESS;
+}
+
 Status ModelHelper::LoadModelData(OmFileLoadHelper &om_load_helper) {
   ModelPartition partition_model_def;
   // no need to check value, DATA->NetOutput
@@ -353,19 +657,35 @@ Status ModelHelper::LoadModelData(OmFileLoadHelper &om_load_helper) {
     return INTERNAL_ERROR;
   }
 
-  SetModelToGeModel(model);
-
+  SetModelToGeModel(model_, model);
   return SUCCESS;
 }
 
-void ModelHelper::SetModelToGeModel(ge::Model &model) {
-  model_->SetGraph(model.GetGraph());
-  model_->SetName(model.GetName());
-  model_->SetVersion(model.GetVersion());
-  model_->SetPlatformVersion(model.GetPlatformVersion());
-  model_->SetAttr(model.MutableAttrMap());
+void ModelHelper::SetModelToGeModel(GeModelPtr &ge_model, Model &model) {
+  ge_model->SetGraph(model.GetGraph());
+  ge_model->SetName(model.GetName());
+  ge_model->SetVersion(model.GetVersion());
+  ge_model->SetPlatformVersion(model.GetPlatformVersion());
+  ge_model->SetAttr(model.MutableAttrMap());
 }
 
+Status ModelHelper::LoadModelData(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, size_t mode_index) {
+  ModelPartition partition_model_def;
+  // no need to check value, DATA->NetOutput
+  om_load_helper.GetModelPartition(ModelPartitionType::MODEL_DEF, partition_model_def, mode_index);
+  GELOGD("Model_def partition addr:%p,size:%u", partition_model_def.data, partition_model_def.size);
+
+  ge::Model model;
+  if (ge::Model::Load(partition_model_def.data, partition_model_def.size, model) != SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "Load model failed.");
+    return INTERNAL_ERROR;
+  }
+
+  SetModelToGeModel(cur_model, model);
+  return SUCCESS;
+}
+
+
 Status ModelHelper::LoadWeights(OmFileLoadHelper &om_load_helper) {
   ModelPartition partition;
   if (om_load_helper.GetModelPartition(ModelPartitionType::WEIGHTS_DATA, partition) != SUCCESS) {
@@ -379,6 +699,19 @@ Status ModelHelper::LoadWeights(OmFileLoadHelper &om_load_helper) {
   return SUCCESS;
 }
 
+Status ModelHelper::LoadWeights(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, size_t mode_index) {
+  ModelPartition partition;
+  if (om_load_helper.GetModelPartition(ModelPartitionType::WEIGHTS_DATA, partition, mode_index) != SUCCESS) {
+    GELOGE(FAILED, "Get weight model partition failed.");
+    return FAILED;
+  }
+  ge::Buffer weight = ge::Buffer::CopyFrom(partition.data, partition.size);
+  cur_model->SetWeight(weight);
+
+  GELOGD("GetWeight size:%u", partition.size);
+  return SUCCESS;
+}
+
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::LoadTask(OmFileLoadHelper &om_load_helper) {
   ModelPartition task_partition;
   if (om_load_helper.GetModelPartition(ModelPartitionType::TASK_INFO, task_partition) != SUCCESS) {
@@ -398,6 +731,27 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::LoadTask(Om
   return SUCCESS;
 }
 
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelHelper::LoadTask(OmFileLoadHelper &om_load_helper,
+                                                                              GeModelPtr &cur_model,
+                                                                              size_t mode_index) {
+  ModelPartition task_partition;
+  if (om_load_helper.GetModelPartition(ModelPartitionType::TASK_INFO, task_partition, mode_index) != SUCCESS) {
+    GELOGE(FAILED, "Get task model partition failed.");
+    return FAILED;
+  }
+  std::shared_ptr<ModelTaskDef> task = ge::MakeShared<ModelTaskDef>();
+  GE_CHECK_NOTNULL(task);
+  if (task_partition.size != 0) {
+    if (!ReadProtoFromArray(task_partition.data, task_partition.size, task.get())) {
+      GELOGE(INTERNAL_ERROR, "ReadProtoFromArray failed.");
+      return INTERNAL_ERROR;
+    }
+    GELOGD("TASK_INFO op_size:%zu, stream_num:%u", task->op().size(), task->stream_num());
+  }
+  cur_model->SetModelTaskDef(task);
+  return SUCCESS;
+}
+
 Status ModelHelper::LoadTBEKernelStore(OmFileLoadHelper &om_load_helper) {
   // Load tbe kernels
   ModelPartition partition_kernel_def;
@@ -414,6 +768,23 @@ Status ModelHelper::LoadTBEKernelStore(OmFileLoadHelper &om_load_helper) {
   return SUCCESS;
 }
 
+Status ModelHelper::LoadTBEKernelStore(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, size_t mode_index) {
+  // Load tbe kernels
+  ModelPartition partition_kernel_def;
+  TBEKernelStore kernel_store;
+  if (om_load_helper.GetModelPartition(ModelPartitionType::TBE_KERNELS, partition_kernel_def, mode_index) ==
+      SUCCESS) {
+    GELOGD("Kernels partition size:%u", partition_kernel_def.size);
+    if (kernel_store.Load(partition_kernel_def.data, partition_kernel_def.size)) {
+      GELOGD("Load tbe kernels success");
+    } else {
+      GELOGW("Load tbe kernels failed");
+    }
+  }
+  cur_model->SetTBEKernelStore(kernel_store);
+  return SUCCESS;
+}
+
 Status ModelHelper::LoadCustAICPUKernelStore(OmFileLoadHelper &om_load_helper) {
   // Load cust aicpu kernels
   ModelPartition partition_kernel_def;
@@ -421,19 +792,39 @@ Status ModelHelper::LoadCustAICPUKernelStore(OmFileLoadHelper &om_load_helper) {
   if (om_load_helper.GetModelPartition(ModelPartitionType::CUST_AICPU_KERNELS, partition_kernel_def) == SUCCESS) {
     GELOGD("Kernels partition size:%u", partition_kernel_def.size);
     if (kernel_store.Load(partition_kernel_def.data, partition_kernel_def.size)) {
-      GELOGI("Load cust aicpu kernels success");
+      GELOGD("Load cust aicpu kernels success");
+    } else {
+      GELOGW("Load cust aicpu kernels failed");
     }
   }
   model_->SetCustAICPUKernelStore(kernel_store);
   return SUCCESS;
 }
 
+Status ModelHelper::LoadCustAICPUKernelStore(OmFileLoadHelper &om_load_helper,
+                                             GeModelPtr &cur_model, size_t mode_index) {
+  // Load cust aicpu kernels
+  ModelPartition partition_kernel_def;
+  CustAICPUKernelStore kernel_store;
+  if (om_load_helper.GetModelPartition(ModelPartitionType::CUST_AICPU_KERNELS, partition_kernel_def, mode_index)
+      == SUCCESS) {
+    GELOGD("Kernels partition size:%u", partition_kernel_def.size);
+    if (kernel_store.Load(partition_kernel_def.data, partition_kernel_def.size)) {
+      GELOGD("Load cust aicpu kernels success");
+    } else {
+      GELOGW("Load cust aicpu kernels failed");
+    }
+  }
+  cur_model->SetCustAICPUKernelStore(kernel_store);
+  return SUCCESS;
+}
+
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY GeModelPtr ModelHelper::GetGeModel() {
   if (model_ != nullptr) {
     return model_;
   }
 
-  GELOGI("Model has not been loaded!");
+  GELOGD("Model has not been loaded!");
   std::shared_ptr<ge::GeModel> out_model = ge::MakeShared<ge::GeModel>();
   if (out_model == nullptr) {
     return nullptr;
@@ -441,6 +832,20 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY GeModelPtr ModelHelper::GetGeMo
   return out_model;
 }
 
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY GeRootModelPtr ModelHelper::GetGeRootModel() {
+  if (root_model_ != nullptr) {
+    return root_model_;
+  }
+
+  GELOGD("Model has not been loaded!");
+  std::shared_ptr<ge::GeRootModel> out_model = ge::MakeShared<ge::GeRootModel>();
+  if (out_model == nullptr) {
+    return nullptr;
+  }
+  return out_model;
+}
+
+
 Status ModelHelper::ReleaseLocalModelData() noexcept {
   Status result = SUCCESS;
   if (model_addr_tmp_ != nullptr) {
diff --git a/ge/common/helper/om_file_helper.cc b/ge/common/helper/om_file_helper.cc
index ce88cd08..d1c52b13 100644
--- a/ge/common/helper/om_file_helper.cc
+++ b/ge/common/helper/om_file_helper.cc
@@ -52,6 +52,17 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileLoadHelper::Init(u
   return SUCCESS;
 }
 
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileLoadHelper::Init(uint8_t *model_data,
+                                                                               uint32_t model_data_size,
+                                                                               uint32_t model_num) {
+  Status status = LoadModelPartitionTable(model_data, model_data_size, model_num);
+  if (status != SUCCESS) {
+    return status;
+  }
+  is_inited_ = true;
+  return SUCCESS;
+}
+
 // Use both
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileLoadHelper::GetModelPartition(ModelPartitionType type,
                                                                                             ModelPartition &partition) {
@@ -79,6 +90,37 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileLoadHelper::GetMod
   return SUCCESS;
 }
 
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileLoadHelper::GetModelPartition(ModelPartitionType type,
+                                                                                            ModelPartition &partition,
+                                                                                            size_t model_index) {
+  if (!is_inited_) {
+    GELOGE(PARAM_INVALID, "OmFileLoadHelper has not been initialized!");
+    return PARAM_INVALID;
+  }
+  if (model_index >= model_contexts_.size()) {
+    GELOGE(PARAM_INVALID, "cur index : %zu, model_contexts size:%zu", model_index, model_contexts_.size());
+    return PARAM_INVALID;
+  }
+  auto &cur_ctx = model_contexts_[model_index];
+  bool found = false;
+  for (ModelPartition &part : cur_ctx.partition_datas_) {
+    if (part.type == type) {
+      partition = part;
+      found = true;
+      break;
+    }
+  }
+
+  if (!found) {
+    if (type != ModelPartitionType::TBE_KERNELS && type != ModelPartitionType::WEIGHTS_DATA &&
+        type != ModelPartitionType::CUST_AICPU_KERNELS) {
+      GELOGE(FAILED, "GetModelPartition:type:%d is not in partition_datas!", static_cast<int>(type));
+      return FAILED;
+    }
+  }
+  return SUCCESS;
+}
+
 Status OmFileLoadHelper::CheckModelValid(const ge::ModelData &model) const {
   // Parameter validity check
   if (model.model_data == nullptr) {
@@ -138,7 +180,8 @@ Status OmFileLoadHelper::LoadModelPartitionTable(uint8_t *model_data, const uint
     context_.partition_datas_.push_back(partition);
 
     if (partition.size > model_data_size || mem_offset > model_data_size - partition.size) {
-      GELOGE(ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID, "The partition size %zu is greater than the model data size %u.",
+      GELOGE(ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID,
+             "The partition size %zu is greater than the model data size %u.",
              partition.size + mem_offset, model_data_size);
       return ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID;
     }
@@ -148,6 +191,61 @@ Status OmFileLoadHelper::LoadModelPartitionTable(uint8_t *model_data, const uint
   return SUCCESS;
 }
 
+Status OmFileLoadHelper::LoadModelPartitionTable(uint8_t *model_data, uint32_t model_data_size, uint32_t model_num) {
+  if (model_data == nullptr) {
+    GELOGE(PARAM_INVALID, "Param model_data must not be null!");
+    return PARAM_INVALID;
+  }
+
+  uint32_t cur_offset = 0;
+  for (uint32_t index = 0; index < model_num; ++index) {
+    // Init partition table
+    auto partition_table = reinterpret_cast<ModelPartitionTable *>(model_data + cur_offset);
+    size_t partition_table_size = SIZE_OF_MODEL_PARTITION_TABLE(*partition_table);
+    cur_offset += partition_table_size;
+    GELOGD("Cur model index %zu: ModelPartitionTable num :%u, "
+           "ModelFileHeader length :%zu, ModelPartitionTable length :%zu",
+           index, partition_table->num, sizeof(ModelFileHeader), partition_table_size);
+    if (model_data_size <= cur_offset) {
+      GELOGE(GE_EXEC_MODEL_DATA_SIZE_INVALID, "invalid model data, partition_table->num:%u, model data size %u",
+             partition_table->num, model_data_size);
+      return GE_EXEC_MODEL_DATA_SIZE_INVALID;
+    }
+
+    for (uint32_t i = 0; i < partition_table->num; i++) {
+      ModelPartition partition;
+      partition.size = partition_table->partition[i].mem_size;
+      partition.data = model_data + cur_offset;
+      partition.type = partition_table->partition[i].type;
+      if (index >= model_contexts_.size()) {
+        if (index != model_contexts_.size()) {
+          GELOGE(FAILED, "cur index is %zu make model_contexts_ overflow", index);
+          return FAILED;
+        }
+
+        OmFileContext tmp_ctx;
+        tmp_ctx.partition_datas_.push_back(partition);
+        model_contexts_.push_back(tmp_ctx);
+      } else {
+        model_contexts_[index].partition_datas_.push_back(partition);
+      }
+
+      if (partition.size > model_data_size || cur_offset > model_data_size - partition.size) {
+        GELOGE(GE_EXEC_MODEL_DATA_SIZE_INVALID, "The partition size %zu is greater than the model data size %u.",
+               partition.size + cur_offset, model_data_size);
+        return GE_EXEC_MODEL_DATA_SIZE_INVALID;
+      }
+      cur_offset += partition.size;
+      GELOGD("Partition, type:%d, size:%u, model_index:%zu", static_cast<int>(partition.type), partition.size, index);
+    }
+  }
+  if (cur_offset != model_data_size) {
+    GELOGE(FAILED, "do not get the complete model, read end offset:%zu, all size:%zu", cur_offset, model_data_size);
+    return FAILED;
+  }
+  return SUCCESS;
+}
+
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY const std::vector<ModelPartition>
   &OmFileSaveHelper::GetModelPartitions() const {
   return context_.partition_datas_;
@@ -172,6 +270,28 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelPartitionTable *OmFileSave
   return partition_table;
 }
 
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelPartitionTable *OmFileSaveHelper::GetPartitionTable(
+    size_t cur_ctx_index) {
+  auto &cur_ctx = model_contexts_[cur_ctx_index];
+  auto partition_size = static_cast<uint32_t>(cur_ctx.partition_datas_.size());
+  // Build ModelPartitionTable, flex array
+  cur_ctx.partition_table_.clear();
+  cur_ctx.partition_table_.resize(sizeof(ModelPartitionTable) + sizeof(ModelPartitionMemInfo) * partition_size, 0);
+
+  auto partition_table = reinterpret_cast<ModelPartitionTable *>(cur_ctx.partition_table_.data());
+  partition_table->num = partition_size;
+
+  uint32_t mem_offset = 0;
+  for (uint32_t i = 0; i < partition_size; i++) {
+    ModelPartition partition = cur_ctx.partition_datas_[i];
+    partition_table->partition[i] = {partition.type, mem_offset, partition.size};
+    mem_offset += partition.size;
+    GELOGD("Partition, type:%d, size:%u", static_cast<int>(partition.type), partition.size);
+  }
+  return partition_table;
+}
+
+
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileSaveHelper::AddPartition(ModelPartition &partition) {
   if (ge::CheckUint32AddOverflow(context_.model_data_len_, partition.size) != SUCCESS) {
     GELOGE(FAILED, "UINT32 %u and %u addition can result in overflow!", context_.model_data_len_, partition.size);
@@ -182,6 +302,27 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status OmFileSaveHelper::AddPar
   return SUCCESS;
 }
 
+Status OmFileSaveHelper::AddPartition(ModelPartition &partition, size_t cur_index) {
+  if (ge::CheckUint32AddOverflow(context_.model_data_len_, partition.size) != SUCCESS) {
+    GELOGE(FAILED, "UINT32 %u and %u addition can result in overflow!", context_.model_data_len_, partition.size);
+    return FAILED;
+  }
+  if (cur_index >= model_contexts_.size()) {
+    if (cur_index != model_contexts_.size()) {
+      GELOGE(FAILED, "cur index is %zu make model_contexts_ overflow", cur_index);
+      return FAILED;
+    }
+    OmFileContext tmp_ctx;
+    tmp_ctx.model_data_len_ += partition.size;
+    tmp_ctx.partition_datas_.push_back(partition);
+    model_contexts_.push_back(tmp_ctx);
+  } else {
+    model_contexts_[cur_index].model_data_len_ += partition.size;
+    model_contexts_[cur_index].partition_datas_.push_back(partition);
+  }
+  return SUCCESS;
+}
+
 Status OmFileSaveHelper::SaveModel(const SaveParam &save_param, const char *output_file, ModelBufferData &model,
                                    bool is_offline) {
   (void)save_param.cert_file;
@@ -198,6 +339,10 @@ Status OmFileSaveHelper::SaveModel(const SaveParam &save_param, const char *outp
 
 Status OmFileSaveHelper::SaveModelToFile(const char *output_file, ModelBufferData &model, bool is_offline) {
 #if !defined(NONSUPPORT_SAVE_TO_FILE)
+  if (context_.partition_datas_.empty()) {
+    GE_CHK_BOOL_EXEC(!model_contexts_.empty(), return FAILED, "mode contexts empty");
+    context_ = model_contexts_.front();
+  }
   uint32_t model_data_len = context_.model_data_len_;
   if (model_data_len == 0) {
     GELOGE(domi::PARAM_INVALID, "Model data len error! should not be 0");
@@ -231,4 +376,53 @@ Status OmFileSaveHelper::SaveModelToFile(const char *output_file, ModelBufferDat
   return SUCCESS;
 #endif
 }
+
+Status OmFileSaveHelper::SaveRootModel(const SaveParam &save_param, const char *output_file,
+                                       ModelBufferData &model, bool is_offline) {
+  (void)save_param.cert_file;
+  (void)save_param.ek_file;
+  (void)save_param.encode_mode;
+  (void)save_param.hw_key_file;
+  (void)save_param.pri_key_file;
+
+#if !defined(NONSUPPORT_SAVE_TO_FILE)
+  vector<ModelPartitionTable *> model_partition_tabels;
+  vector<vector<ModelPartition>> all_model_partitions;
+  for (size_t ctx_index = 0; ctx_index < model_contexts_.size(); ++ctx_index) {
+    auto &cur_ctx = model_contexts_[ctx_index];
+    uint32_t cur_model_data_len = cur_ctx.model_data_len_;
+    if (cur_model_data_len == 0) {
+      GELOGE(domi::PARAM_INVALID, "Model data len error! should not be 0");
+      return domi::PARAM_INVALID;
+    }
+
+    auto tmp_table = GetPartitionTable(ctx_index);
+    if (tmp_table == nullptr) {
+      GELOGE(ge::GE_GRAPH_SAVE_FAILED, "SaveModelToFile execute failed: partition_table is NULL.");
+      return ge::GE_GRAPH_SAVE_FAILED;
+    }
+    uint32_t size_of_table = SIZE_OF_MODEL_PARTITION_TABLE(*tmp_table);
+    FMK_UINT32_ADDCHECK(size_of_table, cur_model_data_len)
+    FMK_UINT32_ADDCHECK(size_of_table + cur_model_data_len, model_header_.length)
+    model_header_.length += size_of_table + cur_model_data_len;
+    model_partition_tabels.push_back(tmp_table);
+    all_model_partitions.push_back(cur_ctx.partition_datas_);
+    GELOGD("sizeof(ModelPartitionTable):%u, cur_model_data_len:%u, cur_context_index:%zu",
+           size_of_table, cur_model_data_len, ctx_index);
+  }
+  Status ret;
+  if (is_offline) {
+    ret = FileSaver::SaveToFile(output_file, model_header_, model_partition_tabels, all_model_partitions);
+  } else {
+    GELOGW("do not support save ge root model to buff now");
+    return FAILED;
+  }
+  if (ret == SUCCESS) {
+    GELOGD("Save model success without encrypt.");
+  }
+  return ret;
+#else
+  return SUCCESS;
+#endif
+}
 }  // namespace ge
diff --git a/ge/common/op/ge_op_utils.cc b/ge/common/op/ge_op_utils.cc
index 579190d6..fc2990b6 100644
--- a/ge/common/op/ge_op_utils.cc
+++ b/ge/common/op/ge_op_utils.cc
@@ -357,7 +357,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void OpUtils::TransDataHWCK2KCH
   const char *w_data = (const char *)input;
 
   int64_t count = h * w * c * k;
-  GE_IF_BOOL_EXEC(count <= 0, GELOGW("Count value must be greater than 0, but count = %ld", count); return );
+  GE_IF_BOOL_EXEC(count <= 0, GELOGW("Count value must be greater than 0, but count = %ld", count); return);
   float *buf = new (std::nothrow) float[count]();
   GE_RT_VOID_CHECK_NOTNULL(buf);
   float *src_buff = nullptr;
diff --git a/ge/common/profiling/ge_profiling.cc b/ge/common/profiling/ge_profiling.cc
new file mode 100644
index 00000000..43ed6434
--- /dev/null
+++ b/ge/common/profiling/ge_profiling.cc
@@ -0,0 +1,198 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common/profiling/ge_profiling.h"
+#include "runtime/base.h"
+#include "common/profiling/profiling_manager.h"
+#include "framework/common/debug/ge_log.h"
+#include "framework/common/debug/log.h"
+#include "graph/load/graph_loader.h"
+#include "init/gelib.h"
+#include "framework/common/ge_inner_error_codes.h"
+
+namespace {
+const uint32_t kDeviceListIndex = 3;
+const std::string kDeviceNums = "devNums";
+const std::string kDeviceIdList = "devIdList";
+const std::string kProfilingInit = "prof_init";
+const std::string kProfilingFinalize = "prof_finalize";
+const std::string kProfilingStart = "prof_start";
+const std::string kProfilingStop = "prof_stop";
+const std::string kProfModelSubscribe = "prof_model_subscribe";
+const std::string kProfModelUnsubscribe = "prof_model_cancel_subscribe";
+const std::string kRtSetDeviceRegName = "profiling";
+
+const std::map<ProfCommandHandleType, std::string> kProfCommandTypeMap = {
+    {kProfCommandhandleInit, kProfilingInit},
+    {kProfCommandhandleStart, kProfilingStart},
+    {kProfCommandhandleStop, kProfilingStop},
+    {kProfCommandhandleFinalize, kProfilingFinalize},
+    {kProfCommandhandleModelSubscribe, kProfModelSubscribe},
+    {kProfCommandhandleModelUnsubscribe, kProfModelUnsubscribe}};
+}  // namespace
+
+bool TransProfConfigToParam(const ProfCommandHandleData &profCommand, vector<string> &prof_config_params) {
+  prof_config_params.clear();
+  prof_config_params.emplace_back(kDeviceNums);
+  prof_config_params.emplace_back(std::to_string(profCommand.devNums));
+  prof_config_params.emplace_back(kDeviceIdList);
+  std::string devID = "";
+  if (profCommand.devNums == 0) {
+    GELOGW("The device num is invalid.");
+    return false;
+  }
+  for (uint32_t i = 0; i < profCommand.devNums; i++) {
+    devID.append(std::to_string(profCommand.devIdList[i]));
+    if (i != profCommand.devNums - 1) {
+      devID.append(",");
+    }
+  }
+
+  prof_config_params.push_back(devID);
+  return true;
+}
+
+bool isProfConfigValid(const uint32_t *deviceid_list, uint32_t device_nums) {
+  if (deviceid_list == nullptr) {
+    GELOGE(ge::PARAM_INVALID, "deviceIdList is nullptr");
+    return false;
+  }
+  if (device_nums == 0 || device_nums > MAX_DEV_NUM) {
+    GELOGE(ge::PARAM_INVALID, "The device nums: %u is invalid.", device_nums);
+    return false;
+  }
+
+  // real device num
+  int32_t dev_count = 0;
+  rtError_t rt_err = rtGetDeviceCount(&dev_count);
+  if (rt_err != RT_ERROR_NONE) {
+    GELOGE(ge::INTERNAL_ERROR, "Get the Device count fail.");
+    return false;
+  }
+
+  if (device_nums > static_cast<uint32_t>(dev_count)) {
+    GELOGE(ge::PARAM_INVALID, "Device num(%u) is not in range 1 ~ %d.", device_nums, dev_count);
+    return false;
+  }
+
+  std::unordered_set<uint32_t> record;
+  for (size_t i = 0; i < device_nums; ++i) {
+    uint32_t dev_id = deviceid_list[i];
+    if (dev_id >= static_cast<uint32_t>(dev_count)) {
+      GELOGE(ge::PARAM_INVALID, "Device id %u is not in range 0 ~ %d(exclude %d)", dev_id, dev_count, dev_count);
+      return false;
+    }
+    if (record.count(dev_id) > 0) {
+      GELOGE(ge::PARAM_INVALID, "Device id %u is duplicatedly set", dev_id);
+      return false;
+    }
+    record.insert(dev_id);
+  }
+  return true;
+}
+
+ge::Status RegProfCtrlCallback(MsprofCtrlCallback func) {
+  if (func == nullptr) {
+    GELOGE(ge::PARAM_INVALID, "Msprof ctrl callback is nullptr.");
+    return ge::PARAM_INVALID;
+  }
+  if (ge::ProfilingManager::Instance().GetMsprofCallback().msprofCtrlCallback != nullptr) {
+    GELOGW("Msprof ctrl callback is exist, just ignore it.");
+  } else {
+    ge::ProfilingManager::Instance().SetMsprofCtrlCallback(func);
+  }
+  return ge::SUCCESS;
+}
+
+ge::Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func) {
+  if (func == nullptr) {
+    GELOGE(ge::PARAM_INVALID, "MsprofSetDeviceCallback callback is nullptr.");
+    return ge::PARAM_INVALID;
+  }
+  // Pass MsprofSetDeviceCallback to runtime
+  ge::Status rt_ret = rtRegDeviceStateCallback(kRtSetDeviceRegName.c_str(), static_cast<rtDeviceStateCallback>(func));
+  if (rt_ret != ge::SUCCESS) {
+    GELOGE(rt_ret, "Pass MsprofSetDeviceCallback to runtime failed!");
+    return rt_ret;
+  }
+  return ge::SUCCESS;
+}
+
+ge::Status RegProfReporterCallback(MsprofReporterCallback func) {
+  if (func == nullptr) {
+    GELOGE(ge::PARAM_INVALID, "MsprofReporterCallback callback is nullptr.");
+    return ge::PARAM_INVALID;
+  }
+  if (ge::ProfilingManager::Instance().GetMsprofCallback().msprofReporterCallback != nullptr) {
+    GELOGW("Msprof reporter callback is exist, just ignore it.");
+  } else {
+    GELOGI("GE register Msprof reporter callback.");
+    ge::ProfilingManager::Instance().SetMsprofReporterCallback(func);
+    // Pass MsprofReporterCallback to runtime
+    ge::Status rt_ret = rtSetMsprofReporterCallback(func);
+    if (rt_ret != ge::SUCCESS) {
+      GELOGE(rt_ret, "Pass MsprofReporterCallback to runtime failed!!");
+      return rt_ret;
+    }
+    // Pass MsprofReporterCallback to hccl
+  }
+  return ge::SUCCESS;
+}
+
+ge::Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len) {
+  if (type != kProfCommandhandleFinalize) {
+    GE_CHECK_NOTNULL(data);
+  }
+  ProfCommandHandleData *prof_config_param = reinterpret_cast<ProfCommandHandleData *>(data);
+  auto iter = kProfCommandTypeMap.find(type);
+  if (iter == kProfCommandTypeMap.end()) {
+    GELOGW("The prof comand type is invalid.");
+    return ge::PARAM_INVALID;
+  }
+  std::vector<string> prof_params;
+  if (type == kProfCommandhandleStart || type == kProfCommandhandleStop) {
+    if (!isProfConfigValid(prof_config_param->devIdList, prof_config_param->devNums)) {
+      return ge::FAILED;
+    }
+  
+    if (!TransProfConfigToParam(*prof_config_param, prof_params)) {
+      GELOGE(ge::PARAM_INVALID, "Transfer profilerConfig to string vector failed");
+      return ge::PARAM_INVALID;
+    }
+  }
+  ge::GraphLoader graph_loader;
+  ge::Command command;
+  command.cmd_params.clear();
+  command.cmd_type = iter->second;
+  command.cmd_params = prof_params;
+  if (type != kProfCommandhandleFinalize) {
+    command.module_index = prof_config_param->profSwitch;
+  }
+  GELOGI("GE commandhandle execute, Command Type: %s, data type config: 0x%llx", iter->second.c_str(),
+         command.module_index);
+  if (type == kProfCommandhandleStart || type == kProfCommandhandleStop) {
+    GELOGI("Profiling device nums:%s , deviceID:[%s]", prof_params[0].c_str(), prof_params[kDeviceListIndex].c_str());
+  }
+  ge::Status ret = graph_loader.CommandHandle(command);
+  if (ret != ge::SUCCESS) {
+    GELOGE(ret, "Handle profiling command failed");
+    return ge::FAILED;
+  }
+
+  GELOGI("Successfully execute profiling command type: %d, command 0x%llx.", type, command.module_index);
+  return ge::SUCCESS;
+}
+
diff --git a/ge/common/profiling/ge_runner_profiling.cc b/ge/common/profiling/ge_runner_profiling.cc
new file mode 100644
index 00000000..067aafe3
--- /dev/null
+++ b/ge/common/profiling/ge_runner_profiling.cc
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common/profiling/ge_runner_profiling.h"
+#include "init/gelib.h"
+
+bool IsInitialize() {
+  std::shared_ptr<ge::GELib> instance_ptr = ge::GELib::GetInstance();
+  if (instance_ptr == nullptr || instance_ptr->InitFlag() == false) {
+    return false;
+  }
+  return true;
+}
diff --git a/ge/common/profiling/profiling_manager.cc b/ge/common/profiling/profiling_manager.cc
index 2f0f061f..1fc4dba6 100644
--- a/ge/common/profiling/profiling_manager.cc
+++ b/ge/common/profiling/profiling_manager.cc
@@ -24,16 +24,11 @@
 #include "graph/load/new_model_manager/davinci_model.h"
 
 namespace {
-const char *const kJobID = "jobID";
-const char *const kDeviceID = "deviceID";
-const char *const kStartCfg = "startCfg";
-const char *const kFeatures = "features";
-const char *const kConf = "conf";
-const char *const kEvents = "events";
-const char *const kAiCoreEvents = "ai_core_events";
-const char *const kName = "name";
-const char *const kTraceID = "traceId";
-const char *const kProfDir = "resultPath";
+const char *const kTrainingTrace = "training_trace";
+const char *const kFpPoint = "fp_point";
+const char *const kBpPoint = "bp_point";
+
+#ifdef DAVINCI_SUPPORT_PROFILING
 const size_t kReportMaxLen = 2048;
 const int32_t kMaxDeviceNum = 256;
 const std::string kConfigNumsdev = "devNums";
@@ -42,10 +37,15 @@ const std::string kProfStart = "prof_start";
 const std::string kProfStop = "prof_stop";
 const std::string kProfModelSubscribe = "prof_model_subscribe";
 const std::string kProfModelUnsubscribe = "prof_model_cancel_subscribe";
+#endif
 }  // namespace
 
 namespace ge {
-ProfilingManager::ProfilingManager() : subscribe_count_(0) {}
+ProfilingManager::ProfilingManager()
+    : is_load_profiling_(false), is_execute_profiling_(false), is_training_trace_(false), subscribe_count_(0) {
+  prof_cb_.msprofCtrlCallback = nullptr;
+  prof_cb_.msprofReporterCallback = nullptr;
+}
 
 ProfilingManager::~ProfilingManager() {}
 
@@ -58,44 +58,29 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::In
 #ifdef DAVINCI_SUPPORT_PROFILING
   vector<int32_t>().swap(device_id_);
   subscribe_count_ = 0;
-  job_id_ = options.job_id;
-
-  GELOGI("ProfilingManager::Init  job_id:%s", job_id_.c_str());
-
+  GELOGI("ProfilingManager::Init  job_id:%s", options.job_id.c_str());
 
-
-  Status ret;
-  if (!recv_profiling_config_.empty()) {
-    GELOGI("Profiling json config from acl:%s", recv_profiling_config_.c_str());
-    ret = InitFromAclCfg(recv_profiling_config_);
-  } else {
-    ret = InitFromOptions(options);
-    if (ret == SUCCESS && is_load_profiling_) {
-      device_id_.push_back(options.device_id);
-    }
-  }
+  struct MsprofGeOptions prof_conf = {{ 0 }};
+  Status ret = InitFromOptions(options, prof_conf);
   if (ret != SUCCESS) {
     GELOGE(ret, "Failed to init profiling.");
     return ret;
   }
 
-  if (is_load_profiling_) {
-    // register Framework to profiling
-    int result = Msprof::Engine::Init(GE_PROFILING_MODULE, &engine_);
-    if (result != 0) {
-      GELOGE(FAILED, "Register profiling engine failed.");
-      return FAILED;
+  if (is_execute_profiling_) {
+    if (prof_cb_.msprofCtrlCallback == nullptr) {
+      GELOGE(ge::PARAM_INVALID, "MsprofCtrlCallback callback is nullptr.");
+      return ge::PARAM_INVALID;
     }
-    // profiling startup first time
-    GELOGI("Begin to init profiling, device num %zu", device_id_.size());
-    for (size_t i = 0; i < device_id_.size(); ++i) {
-      ret = StartProfiling(0, device_id_[i]);
-      if (ret != SUCCESS) {
-        GELOGW("Profiling start failed on device %d.", device_id_[i]);
-        continue;
-      }
-      GELOGI("Profiling init succ on device %d.", device_id_[i]);
+    int32_t cb_ret = prof_cb_.msprofCtrlCallback(
+        static_cast<uint32_t>(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS),
+        static_cast<void *>(&prof_conf), sizeof(MsprofGeOptions));
+    if (cb_ret != 0) {
+      GELOGE(FAILED, "Call msprofCtrlCallback failed, type:%u, return:%d",
+             static_cast<uint32_t>(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS), cb_ret);
+      return FAILED;
     }
+    GELOGI("Profiling init success");
   } else {
     GELOGI("The profiling is off, skip the initialization");
   }
@@ -103,288 +88,120 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::In
   return SUCCESS;
 }
 
-FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::InitFromAclCfg(
-    const std::string &config) {
+ge::Status ProfilingManager::InitFromOptions(const Options &options, MsprofGeOptions &prof_conf) {
 #ifdef DAVINCI_SUPPORT_PROFILING
-  try {
-    is_load_profiling_ = false;
-    is_execute_profiling_ = false;
-    profiling_opts_.clear();
-    op_trace_conf_.clear();
-    Json start_prof_conf = Json::parse(config);
-    Json &prof_conf = start_prof_conf[kStartCfg][0];
-    job_id_ = prof_conf[kJobID];
-    auto iter = prof_conf.find(kProfDir);
-    if (iter != prof_conf.end()) {
-      prof_dir_ = prof_conf[kProfDir];
-    }
-    Json &device_id = prof_conf[kDeviceID];
-    if (device_id.size() != 0) {
-      vector<int32_t>().swap(device_id_);
-      bool is_all = false;
-      for (size_t i = 0; i < device_id.size(); i++) {
-        std::string device_id_str = device_id[i].get<std::string>();
-        if (device_id_str == "all") {
-          is_all = true;
-          break;
-        }
-        device_id_.push_back(std::stoi(device_id_str));
-      }
-      if (is_all) {
-        int32_t count = 0;
-        rtError_t rt_err = rtGetDeviceCount(&count);
-        if (rt_err != RT_ERROR_NONE) {
-          GELOGE(FAILED, "Call rtGetDeviceCount to get device failed.");
-        }
+  // enable profiling by env
+  char env_profiling_mode[MMPA_MAX_PATH] = { 0x00 };
+  is_execute_profiling_ = false;
 
-        vector<int32_t>().swap(device_id_);
-        for (int32_t i = 0; i < count; ++i) {
-          device_id_.push_back(i);
-        }
-      }
+  if (options.profiling_mode == "1" && !options.profiling_options.empty()) {
+    // enable profiling by ge option
+    if (strncpy_s(prof_conf.options, MSPROF_OPTIONS_DEF_LEN_MAX, options.profiling_options.c_str(),
+                  MSPROF_OPTIONS_DEF_LEN_MAX - 1) != EOK) {
+      GELOGE(INTERNAL_ERROR, "copy profiling_options failed.");
+      return INTERNAL_ERROR;
     }
-
-    Json &features = prof_conf[kFeatures];
-    if (ParseFeaturesFromAclCfg(features) != SUCCESS) {
-      GELOGE(FAILED, "Parse feature from acl cfg failed.");
-      return FAILED;
+    is_execute_profiling_ = true;
+    GELOGI("The profiling in options is %s, %s. origin option: %s", options.profiling_mode.c_str(), prof_conf.options,
+           options.profiling_options.c_str());
+  } else {
+    (void)mmGetEnv("PROFILING_MODE", env_profiling_mode, MMPA_MAX_PATH);
+    (void)mmGetEnv("PROFILING_OPTIONS", prof_conf.options, MSPROF_OPTIONS_DEF_LEN_MAX);
+    // The env is invalid
+    if ((strcmp("true", env_profiling_mode) != 0) || (strcmp(prof_conf.options, "\0") == 0)) {
+      return SUCCESS;
     }
-    is_load_profiling_ = true;
+    // enable profiling by env
     is_execute_profiling_ = true;
-  } catch (...) {
-    GELOGE(FAILED, "Json conf is not invalid !");
+    GELOGI("The profiling in env is %s, %s", env_profiling_mode, prof_conf.options);
+  }
+
+  if (!is_execute_profiling_) {
+    return SUCCESS;
+  }
+
+  // Parse json str for bp fp
+  Status ret = ParseOptions(prof_conf.options);
+  if (ret != ge::SUCCESS) {
+    GELOGE(ge::PARAM_INVALID, "Parse training trace param failed.");
     return ge::PARAM_INVALID;
   }
+
+  if (strncpy_s(prof_conf.jobId, MSPROF_OPTIONS_DEF_LEN_MAX, options.job_id.c_str(), MSPROF_OPTIONS_DEF_LEN_MAX - 1) !=
+      EOK) {
+    GELOGE(INTERNAL_ERROR, "copy job_id failed.");
+    return INTERNAL_ERROR;
+  }
+  GELOGI("Job id: %s, original job id: %s.", prof_conf.jobId, options.job_id.c_str());
 #endif
   return ge::SUCCESS;
 }
 
-FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::ParseFeaturesFromAclCfg(
-    const Json &features) {
-#ifdef DAVINCI_SUPPORT_PROFILING
+ge::Status ProfilingManager::ParseOptions(const std::string &options) {
+  if (options.empty()) {
+    GELOGE(ge::PARAM_INVALID, "Profiling options is empty.");
+    return ge::PARAM_INVALID;
+  }
   try {
-    for (size_t i = 0; i < features.size(); ++i) {
-      const Json &feature = features[i];
-      if ((feature.find(kName) == feature.end()) || feature[kName].is_null()) {
-        continue;
-      }
-      const std::string &name = feature[kName];
-      if (name == "op_trace") {
-        const Json &conf = feature[kConf];
-        const Json &events = conf[0][kEvents];
-        const std::string &ai_core_events = events[0][kAiCoreEvents];
-        GELOGI("Op trace config from acl ai_core_events:%s", ai_core_events.c_str());
-        is_op_trace_ = true;
-        ProfMgrConf prof_mgr_conf;
-        int result = ProfMgrGetConf(ai_core_events, &prof_mgr_conf);
-        if (result != 0) {
-          GELOGE(FAILED, "ProfMgrGetConf failed.");
-          return FAILED;
-        }
-        op_trace_conf_ = prof_mgr_conf.conf;
-        op_trace_iter_num_ = static_cast<int32_t>(op_trace_conf_.size());
-        GELOGI("Op trace profiling iter num %d,", op_trace_iter_num_);
-      } else if (name == "task_trace") {
-        is_op_trace_ = false;
-        if (feature.find(kConf) != feature.end()) {
-          const Json &conf = feature[kConf];
-          std::stringstream task_trace_conf;
-          task_trace_conf << conf;
-          task_trace_conf_ = task_trace_conf.str();
-        }
-        GELOGI("Task trace config from acl");
-      } else if (name == "system_trace") {
-        is_op_trace_ = false;
-        const Json &conf = feature[kConf];
-        std::stringstream system_trace_conf;
-        system_trace_conf << conf;
-        system_trace_conf_ = system_trace_conf.str();
-        GELOGI("System trace config from acl");
-      }
-      profiling_opts_.push_back(name);
+    Json prof_options = Json::parse(options);
+    if (options.find(kTrainingTrace) == std::string::npos) {
+      return ge::SUCCESS;
     }
+    const std::string training_trace = prof_options[kTrainingTrace];
+    if (training_trace.empty()) {
+      GELOGI("Training trace will not take effect.");
+      return ge::SUCCESS;
+    }
+    GELOGI("GE profiling training trace:%s", training_trace.c_str());
+    if (training_trace != "on") {
+      GELOGE(ge::PARAM_INVALID, "Training trace param:%s is invalid.", training_trace.c_str());
+      return ge::PARAM_INVALID;
+    }
+    fp_point_ = prof_options[kFpPoint];
+    bp_point_ = prof_options[kBpPoint];
+    if (!fp_point_.empty() && !bp_point_.empty()) {
+      GELOGI("Training trace bp fp is set, bp_point:%s, fp_point:%s.", bp_point_.c_str(), fp_point_.c_str());
+    }
+    is_training_trace_ = true;
   } catch (...) {
-    GELOGE(ge::PARAM_INVALID, "Json conf feature is not invalid !");
+    GELOGE(FAILED, "Json prof_conf options is invalid.");
     return ge::PARAM_INVALID;
   }
-#endif
   return ge::SUCCESS;
 }
 
-FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::InitFromOptions(const Options &options) {
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::StopProfiling() {
 #ifdef DAVINCI_SUPPORT_PROFILING
-  // enable profiling support two ways: env and front end
-  char profiling_mode_temp[MMPA_MAX_PATH] = { 0x00 };
-  char prof_options_temp[MMPA_MAX_PATH] = { 0x00 };
-  (void)mmGetEnv("PROFILING_MODE", profiling_mode_temp, MMPA_MAX_PATH);
-  (void)mmGetEnv("PROFILING_OPTIONS", prof_options_temp, MMPA_MAX_PATH );
-  const char *profiling_mode = profiling_mode_temp;
-  const char *prof_options = prof_options_temp;
-  if ((profiling_mode == nullptr) || (strcmp("true", profiling_mode) != 0) || (prof_options == nullptr)) {
-    is_load_profiling_ = false;
-    is_execute_profiling_ = false;
-  } else {
-    std::string prof_options_str = std::string(prof_options);
-    profiling_opts_ = StringUtils::Split(prof_options_str, ':');
-    is_load_profiling_ = true;
-    is_execute_profiling_ = true;
-    GELOGI("The profiling in env is %s, %s", profiling_mode, prof_options);
-  }
-  if (!is_load_profiling_) {
-    const std::string enable_profiling = "1";
-    if (options.profiling_mode != enable_profiling || options.profiling_options.empty()) {
-      is_load_profiling_ = false;
-      is_execute_profiling_ = false;
-      return SUCCESS;
-    } else {
-      profiling_opts_ = StringUtils::Split(options.profiling_options, ':');
-      is_load_profiling_ = true;
-      is_execute_profiling_ = true;
-      GELOGI("The profiling in options is %s, %s", options.profiling_mode.c_str(), options.profiling_options.c_str());
-    }
-  }
-  // features:'training_trace', 'task_trace' or 'op_trace'  etc
-  if (!profiling_opts_.empty()) {
-    if (profiling_opts_[0] == "op_trace") {
-      is_op_trace_ = true;
-      // op trace get conf
-      ProfMgrConf prof_mgr_conf;
-      int result = ProfMgrGetConf("", &prof_mgr_conf);
-      if (result != 0) {
-        GELOGE(FAILED, "ProfMgrGetConf failed.");
-        return FAILED;
-      }
-      op_trace_conf_ = prof_mgr_conf.conf;
-      op_trace_iter_num_ = static_cast<int32_t>(op_trace_conf_.size());
-      GELOGI("op trace profiling iter num %d,", op_trace_iter_num_);
-    } else {
-      is_op_trace_ = false;
-      op_trace_iter_num_ = 1;
+  uint64_t module = GetProfilingModule();
+  // The following if case will not be executed in normal case, inc case of ProfStopProfiling is abnormal
+  int32_t device_num = static_cast<int32_t>(device_id_.size());
+  if (device_num != 0) {
+    auto device_id_ptr = std::unique_ptr<uint32_t[]>(new (std::nothrow) uint32_t[device_num]);
+    if (device_id_ptr == nullptr) {
+      GELOGE(FAILED, "Stop profiling: device id ptr is null.");
+      return;
     }
-  }
-#endif
-  return ge::SUCCESS;
-}
-
-FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::StartProfiling(int32_t iter_num,
-                                                                                             int32_t device_id) {
-#ifdef DAVINCI_SUPPORT_PROFILING
-  if (!profiling_opts_.empty()) {
-    GELOGI("Start profiling index is %d", iter_num);
-    // current one docker only use one device
-    Json p_device;
-
-    try {
-      // profiling need physical_device_id
-      p_device[kDeviceID] = std::to_string(device_id);
-      p_device[kJobID] = job_id_;
-      p_device[kTraceID] = std::to_string(GetContext().TraceId());
-      if (!prof_dir_.empty()) {
-        p_device[kProfDir] = prof_dir_;
-        GELOGI("Prof dir: %s.", prof_dir_.c_str());
-      }
-
-      Json features;
-      if (is_op_trace_) {
-        Json f;
-        f[kName] = "op_trace";
-        Json conf;
-        if (op_trace_conf_.size() <= static_cast<size_t>(iter_num)) {
-          GELOGE(FAILED, "Op trace iter num is invalid!");
-          return FAILED;
-        }
-        Json events;
-        events[0] = nlohmann::json::parse(op_trace_conf_[iter_num]);
-        conf[0][kEvents] = events;
-        f[kConf] = conf;
-        features[0] = f;
-        if (iter_num == 0) {
-          is_load_ = true;
-        }
-      } else {
-        for (std::vector<std::string>::size_type i = 0; i < profiling_opts_.size(); i++) {
-          Json f;
-          if (profiling_opts_[i] == "system_trace") {
-            f[kConf] = nlohmann::json::parse(system_trace_conf_);
-          } else if (profiling_opts_[i] == "task_trace") {
-            if (!task_trace_conf_.empty()) {
-              f[kConf] = nlohmann::json::parse(task_trace_conf_);
-            }
-          }
-          f[kName] = profiling_opts_[i];
-          features[i] = f;
-        }
-        is_load_ = true;
-      }
-      p_device[kFeatures] = features;
-      // only one device, but sProfMgrStartUp API require for device list
-      Json devices;
-      devices[0] = p_device;
-
-      Json start_cfg;
-      start_cfg[kStartCfg] = devices;
-
-      // convert json to string
-      std::stringstream ss;
-      ss << start_cfg;
-      send_profiling_config_ = ss.str();
-      GELOGI("Profiling config %s\n", send_profiling_config_.c_str());
-    } catch (...) {
-      GELOGE(FAILED, "Op trace json conf is not invalid !");
-      return FAILED;
+    for (int32_t i = 0; i < device_num; i++) {
+      device_id_ptr[i] = static_cast<uint32_t>(device_id_[i]);
     }
-
-    // runtime startup for profiling
-    uint64_t module = GetProfilingModule();
-    int32_t device_num = 1;
-    uint32_t device_id_rt = static_cast<uint32_t>(device_id);
-    GE_CHK_RT_RET(rtProfilerStart(module, device_num, &device_id_rt));
-
-    // call profiling startup API
-    ProfMgrCfg prof_cfg = {send_profiling_config_};
-    void *prof_handle = ProfMgrStartUp(&prof_cfg);
-    if (prof_handle == nullptr) {
-      GELOGW("ProfMgrStartUp failed on device %d ", device_id);
-      return FAILED;
+    rtError_t rt_ret = rtProfilerStop(module, device_num, device_id_ptr.get());
+    if (rt_ret != RT_ERROR_NONE) {
+      GELOGW("Call rtProfilerStop failed, ret:%d", rt_ret);
     }
-    GELOGD("StartProfiling, prof_handle: %p", prof_handle);
-    prof_handle_vec_.push_back(prof_handle);
   }
-#endif
-  return SUCCESS;
-}
 
-FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::StopProfiling() {
-#ifdef DAVINCI_SUPPORT_PROFILING
-  Msprof::Engine::Reporter *reporter = PluginImpl::GetPluginReporter();
-  if (reporter != nullptr) {
-    int ret = reporter->Flush();
-    GELOGI("Report data end, ret is %d", ret);
+  // stop profiling
+  if (prof_cb_.msprofCtrlCallback == nullptr) {
+      GELOGE(ge::PARAM_INVALID, "MsprofCtrlCallback callback is nullptr.");
+      return;
   }
-  uint64_t module = GetProfilingModule();
-  int32_t device_num = static_cast<int32_t>(device_id_.size());
-  auto device_id_ptr = std::unique_ptr<uint32_t[]>(new (std::nothrow) uint32_t[device_num]);
-  if (device_id_ptr == nullptr) {
-    GELOGE(FAILED, "Stop profiling: device id ptr is null.");
+  int32_t cb_ret = prof_cb_.msprofCtrlCallback(static_cast<uint32_t>(MsprofCtrlCallbackType::MSPROF_CTRL_FINALIZE),
+                                               nullptr, 0);
+  if (cb_ret != 0) {
+    GELOGW("call msprofCtrlCallback failed, type:%u, return:%d",
+           static_cast<uint32_t>(MsprofCtrlCallbackType::MSPROF_CTRL_FINALIZE), cb_ret);
     return;
   }
-  for (int32_t i = 0; i < device_num; i++) {
-    device_id_ptr[i] = static_cast<uint32_t>(device_id_[i]);
-  }
-  rtError_t rt_ret = rtProfilerStop(module, device_num, device_id_ptr.get());
-  if (rt_ret != RT_ERROR_NONE) {
-    GELOGW("Call rtProfilerStop failed, ret:%d", rt_ret);
-  }
-
-  for (size_t i = 0; i < prof_handle_vec_.size(); ++i) {
-    int result = ProfMgrStop(prof_handle_vec_[i]);
-    if (result != 0) {
-      GELOGW("ProfMgr stop return fail:%d, handle:%p", result, prof_handle_vec_[i]);
-    }
-  }
-  vector<void *>().swap(prof_handle_vec_);
-  is_load_ = false;
-  recv_profiling_config_ = "";
   GELOGI("Stop Profiling success.");
 #endif
 }
@@ -392,12 +209,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::StopProf
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::ProfilingTaskDescInfo(
     uint32_t model_id, const std::vector<TaskDescInfo> &task_desc_info, const int32_t &device_id) {
 #ifdef DAVINCI_SUPPORT_PROFILING
-  Msprof::Engine::Reporter *reporter = PluginImpl::GetPluginReporter();
-  if (reporter == nullptr) {
-    GELOGI("Profiling report is nullptr!");
-    return;
-  }
-
   std::string data;
   for (const auto &task : task_desc_info) {
     std::string model_name = task.model_name;
@@ -405,14 +216,18 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin
     uint32_t block_dim = task.block_dim;
     uint32_t task_id = task.task_id;
     uint32_t stream_id = task.stream_id;
+    std::string shape_type = task.shape_type;
+    int64_t cur_iter_num = task.cur_iter_num;
     data = model_name.append(" ")
                      .append(op_name).append(" ")
-                     .append(std::to_string(block_dim).append(" ")
+                     .append(std::to_string(block_dim)).append(" ")
                      .append(std::to_string(task_id)).append(" ")
                      .append(std::to_string(stream_id)).append(" ")
-                     .append(std::to_string(model_id)).append("\n"));
+                     .append(std::to_string(model_id)).append(" ")
+                     .append(shape_type).append(" ")
+                     .append(std::to_string(cur_iter_num)).append("\n");
 
-    Msprof::Engine::ReporterData reporter_data{};
+    ReporterData reporter_data{};
     reporter_data.deviceId = device_id;
     reporter_data.data = (unsigned char *)data.c_str();
     reporter_data.dataLen = data.size();
@@ -422,9 +237,9 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin
       return;
     }
 
-    ret = reporter->Report(&reporter_data);
-    if (ret != SUCCESS) {
-      GELOGE(ret, "Reporter data of task_desc_info fail!");
+    int32_t cb_ret = CallMsprofReport(reporter_data);
+    if (cb_ret != 0) {
+      GELOGE(cb_ret, "Reporter data of task_desc_info failed, ret:%d", cb_ret);
       return;
     }
   }
@@ -436,9 +251,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::ProfilingGraphDescInfo(
     uint32_t model_id, const std::vector<ComputeGraphDescInfo> &compute_graph_desc_info, const int32_t &device_id) {
 #ifdef DAVINCI_SUPPORT_PROFILING
-  Msprof::Engine::Reporter *reporter = PluginImpl::GetPluginReporter();
-  GE_IF_BOOL_EXEC(reporter == nullptr, GELOGI("Profiling report is nullptr!"); return;);
-
   std::string data;
   for (const auto &graph : compute_graph_desc_info) {
     data.append("model_name:")
@@ -493,64 +305,54 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Profilin
     }
 
     data.append(" model_id:").append(std::to_string(model_id));
-
+    data.append(" task_id:").append(std::to_string(graph.task_id));
+    data.append(" stream_id:").append(std::to_string(graph.stream_id));
     data.append("\n");
 
-    Msprof::Engine::ReporterData reporter_data{};
-    Report(device_id, data, *reporter, reporter_data);
-
+    GraphDescReport(device_id, data);
     data.clear();
   }
 #endif
 }
 
-FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::Report(
-    const int32_t &device_id, const string &data, Msprof::Engine::Reporter &reporter,
-    Msprof::Engine::ReporterData &reporter_data) {
+void ProfilingManager::GraphDescReport(const int32_t &device_id, const string &data) {
 #ifdef DAVINCI_SUPPORT_PROFILING
+  ReporterData reporter_data{};
+  int ret = -1;
+  int32_t cb_ret = -1;
   size_t index = data.size() / kReportMaxLen;
   if (index >= 1) {
     reporter_data.deviceId = device_id;
-    int ret = memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN + 1, "graph_desc_info", sizeof("graph_desc_info"));
+    ret = memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN + 1, "graph_desc_info", sizeof("graph_desc_info"));
     GE_IF_BOOL_EXEC(ret != EOK, GELOGE(ret, "Report data tag of graph_desc_info memcpy error!"); return;);
     for (size_t i = 0; i < index; ++i) {
       reporter_data.data = (unsigned char *)data.c_str() + kReportMaxLen * i;
       reporter_data.dataLen = kReportMaxLen;
-      ret = reporter.Report(&reporter_data);
-      GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "Reporter data of graph_desc_info fail!"); return;);
+      cb_ret = CallMsprofReport(reporter_data);
+      GE_IF_BOOL_EXEC(cb_ret != 0, GELOGE(cb_ret, "Reporter data of graph_desc_info failed, ret:%d", cb_ret); return;);
     }
     reporter_data.dataLen = data.size() - kReportMaxLen * index;
     if (reporter_data.dataLen != 0) {
       reporter_data.data = (unsigned char *)data.c_str() + kReportMaxLen * index;
-      ret = reporter.Report(&reporter_data);
-      GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "Reporter data of graph_desc_info fail!"); return;);
+      cb_ret = CallMsprofReport(reporter_data);
+      GE_IF_BOOL_EXEC(cb_ret != 0, GELOGE(cb_ret, "Reporter data of graph_desc_info failed, ret:%d", cb_ret); return;);
     }
   } else {
     reporter_data.deviceId = device_id;
     reporter_data.data = (unsigned char *)data.c_str();
     reporter_data.dataLen = data.size();
-    int ret = memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN + 1, "graph_desc_info", sizeof("graph_desc_info"));
+    ret = memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN + 1, "graph_desc_info", sizeof("graph_desc_info"));
     GE_IF_BOOL_EXEC(ret != EOK, GELOGE(ret, "Report data tag of graph_desc_info memcpy error!"); return;);
 
-    ret = reporter.Report(&reporter_data);
-    GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "Reporter data of graph_desc_info fail!"); return;);
-  }
-#endif
-}
-
-FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::PluginUnInit(const std::string &module) const {
-#ifdef DAVINCI_SUPPORT_PROFILING
-  int ret = Msprof::Engine::UnInit(module);
-  if (ret != SUCCESS) {
-    GELOGE(ret, "profiling plugin uninit failed, ret:%d", ret);
+    cb_ret = CallMsprofReport(reporter_data);
+    GE_IF_BOOL_EXEC(cb_ret != 0, GELOGE(cb_ret, "Reporter data of graph_desc_info failed, ret:%d", cb_ret); return;);
   }
 #endif
 }
 
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::ReportProfilingData(
     uint32_t model_id, const std::vector<TaskDescInfo> &task_desc_info,
-    const std::vector<ComputeGraphDescInfo> &compute_graph_desc_info,
-    bool check_device) {
+    const std::vector<ComputeGraphDescInfo> &compute_graph_desc_info) {
 #ifdef DAVINCI_SUPPORT_PROFILING
   int32_t logic_device_id = 0;
   rtError_t rt_ret = rtGetDevice(&logic_device_id);
@@ -559,13 +361,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::ReportPr
     return;
   }
   GELOGD("current logic_device_id:%d", logic_device_id);
-  if (check_device) {
-    auto ret = std::find(device_id_.begin(), device_id_.end(), logic_device_id);
-    if (ret == device_id_.end()) {
-      GELOGE(FAILED, "get valid phy_device_id failed, profiling report failed.");
-      return;
-    }
-  }
   GELOGD("start ProfilingTaskDescInfo.");
   ProfilingTaskDescInfo(model_id, task_desc_info, logic_device_id);
   GELOGD("start ProfilingGraphDescInfo.");
@@ -574,11 +369,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::ReportPr
 #endif
 }
 
-FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::SetProfilingConfig(
-    const std::string &profiling_cfg) {
-  recv_profiling_config_ = profiling_cfg;
-}
-
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint64_t ProfilingManager::GetProfilingModule() {
   uint64_t module = PROF_MODEL_EXECUTE_MASK |
                     PROF_RUNTIME_API_MASK |
@@ -594,9 +384,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint64_t ProfilingManager::GetP
   return module;
 }
 
-void ProfilingManager::UpdateSubscribeDeviceModuleMap(std::string prof_type,
-                                                      uint32_t device_id,
-                                                      uint64_t module) {
+void ProfilingManager::UpdateSubscribeDeviceModuleMap(std::string prof_type, uint32_t device_id, uint64_t module) {
 #ifdef DAVINCI_SUPPORT_PROFILING
   if (prof_type == kProfModelSubscribe) {
     if (subs_dev_module_.find(device_id) != subs_dev_module_.end()) {
@@ -608,9 +396,13 @@ void ProfilingManager::UpdateSubscribeDeviceModuleMap(std::string prof_type,
       subs_dev_module_[device_id] = dev_info;
     }
   } else if (prof_type == kProfModelUnsubscribe) {
-    if (subs_dev_module_.find(device_id) != subs_dev_module_.end()) {
-      if (subs_dev_module_[device_id].subscribe_count > 0) {
-        subs_dev_module_[device_id].subscribe_count--;
+    auto iter = subs_dev_module_.find(device_id);
+    if (iter != subs_dev_module_.end()) {
+      if (iter->second.subscribe_count > 0) {
+        iter->second.subscribe_count--;
+      }
+      if (iter->second.subscribe_count == 0) {
+        subs_dev_module_.erase(iter);
       }
     }
   } else {
@@ -626,10 +418,11 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfMo
   uint64_t model_load_mask = module & PROF_MODEL_LOAD_MASK;
   if ((subscribe_count_ == 0) && (model_load_mask == PROF_MODEL_LOAD_MASK)) {
     // register framework to profiling
-    int32_t result = Msprof::Engine::Init(GE_PROFILING_MODULE, &engine_);
-    if (result != SUCCESS) {
-      GELOGE(FAILED, "Register profiling engine failed.");
-      return FAILED;
+    // register Framework to profiling
+    int32_t cb_ret = PluginInit();
+    if (cb_ret != 0) {
+      GELOGE(cb_ret, "profiling plugin init failed, ret:%d", cb_ret);
+      return cb_ret;
     }
     GELOGI("Prof subscribe: model load profiling on.");
   }
@@ -647,7 +440,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfMo
   UpdateSubscribeDeviceModuleMap(kProfModelSubscribe, device[0], module);
 
   // Report profiling data
-  Status p_ret = davinci_model->ReportProfilingData(false);
+  Status p_ret = davinci_model->ReportProfilingData();
   if (p_ret != SUCCESS) {
     GELOGE(p_ret, "Report profiling data failed.");
     return p_ret;
@@ -672,6 +465,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfMo
   auto iter = subs_dev_module_.find(device[0]);
   if (iter != subs_dev_module_.end()) {
     if (subs_dev_module_[device[0]].subscribe_count == 1) {
+      // The same device_id, only stop at last time
       rtError_t rt_ret = rtProfilerStop(subs_dev_module_[device[0]].module, dev_num, device);
       if (rt_ret != RT_ERROR_NONE) {
         GELOGE(FAILED, "Runtime profiler stop failed.");
@@ -679,15 +473,15 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfMo
       }
     }
     UpdateSubscribeDeviceModuleMap(kProfModelUnsubscribe, device[0], subs_dev_module_[device[0]].module);
+  } else {
+    GELOGE(FAILED, "The device_id:%u has not been subscribed, do not need to cancel.", device[0]);
+    return FAILED;
   }
 
   subscribe_count_--;
   if (subscribe_count_ == 0) {
-    int32_t ret = Msprof::Engine::UnInit(GE_PROFILING_MODULE);
-    if (ret != SUCCESS) {
-      GELOGE(ret, "Profiling plugin uninit failed, ret:%d", ret);
-      return ret;
-    }
+    // profiling plugin uninit at last subscription
+    PluginUnInit();
   }
 #endif
   return SUCCESS;
@@ -700,11 +494,12 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfIn
 
   if (model_load_mask == PROF_MODEL_LOAD_MASK) {
     // register Framework to profiling
-    int32_t result = Msprof::Engine::Init(GE_PROFILING_MODULE, &engine_);
-    if (result != SUCCESS) {
-      GELOGE(FAILED, "Register profiling engine failed.");
-      return FAILED;
+    int32_t cb_ret = PluginInit();
+    if (cb_ret != 0) {
+      GELOGE(cb_ret, "profiling plugin init failed, ret:%d", cb_ret);
+      return cb_ret;
     }
+
     int32_t device_num = -1;
     rtError_t rt_ret = rtProfilerStart(model_load_mask, device_num, nullptr);
     if (rt_ret != RT_ERROR_NONE) {
@@ -719,7 +514,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfIn
   if (training_trace_mask == PROF_TRAINING_TRACE_MASK) {
     is_training_trace_ = true;
   }
-  is_acl_api_mode_ = true;
   GELOGI("Prof init success.");
 #endif
   return SUCCESS;
@@ -730,19 +524,17 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfFi
   std::lock_guard<std::mutex> lock(mutex_);
   is_load_profiling_ = false;
   is_training_trace_ = false;
-  is_acl_api_mode_ = false;
+  is_execute_profiling_ = false;
+
+  // profiling plugin uninit
+  PluginUnInit();
 
-  int32_t ret = Msprof::Engine::UnInit(GE_PROFILING_MODULE);
-  if (ret != SUCCESS) {
-    GELOGE(ret, "Profiling plugin uninit failed, ret:%d", ret);
-  }
   int32_t dev_num = -1;
   rtError_t rt_ret = rtProfilerStop(PROF_MODEL_LOAD_MASK, dev_num, nullptr);
   if (rt_ret != RT_ERROR_NONE) {
     GELOGE(FAILED, "Runtime profiler stop failed.");
     return FAILED;
   }
-
   for (auto device_id_module : device_id_module_map_) {
     if (device_id_module.second != 0) {
       uint32_t device_id = static_cast<uint32_t>(device_id_module.first);
@@ -792,6 +584,7 @@ Status ProfilingManager::ProfParseDeviceId(const std::map<std::string, std::stri
         return FAILED;
       } catch (std::out_of_range &) {
         GELOGE(FAILED, "Device id: %s is  out of range.", decvice_id[i].c_str());
+        return FAILED;
       } catch (...) {
         GELOGE(FAILED, "Device id: %s cannot change to int.", decvice_id[i].c_str());
         return FAILED;
@@ -818,6 +611,7 @@ Status ProfilingManager::ProfParseParam(const std::map<std::string, std::string>
       return FAILED;
     } catch (std::out_of_range &) {
       GELOGE(FAILED, "Device num: %s is  out of range.", iter->second.c_str());
+      return FAILED;
     } catch (...) {
       GELOGE(FAILED, "Device num: %s cannot change to int.", iter->second.c_str());
       return FAILED;
@@ -844,6 +638,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfSt
     uint64_t module, const std::map<std::string, std::string> &config_para) {
 #ifdef DAVINCI_SUPPORT_PROFILING
   std::lock_guard<std::mutex> lock(mutex_);
+  uint64_t training_trace_mask = module & PROF_TRAINING_TRACE_MASK;
+  if (training_trace_mask == PROF_TRAINING_TRACE_MASK) {
+    is_training_trace_ = true;
+  }
   int32_t device_num = 0;
   vector<int32_t> device_list;
   if (ProfParseParam(config_para, device_num, device_list) != SUCCESS) {
@@ -859,7 +657,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfSt
   for (int32_t i = 0; i < device_num; i++) {
     device_id_ptr[i] = static_cast<uint32_t>(device_list[i]);
   }
-  GELOGD("Runtime config param: 0x%llx, device num: %d.", module, device_num);
+  GELOGI("Runtime config param: 0x%llx, device num: %d.", module, device_num);
 
   rtError_t rt_ret = rtProfilerStart(module, device_num, device_id_ptr.get());
   if (rt_ret != RT_ERROR_NONE) {
@@ -878,7 +676,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfSt
     GELOGW("Prof start: load model module is invalid.");
   }
   UpdateDeviceIdModuleMap(kProfStart, module, device_list);
-  GELOGD("Prof start profiling success.");
+  GELOGI("Prof start profiling success.");
 #endif
   return SUCCESS;
 }
@@ -901,7 +699,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfSt
   for (int32_t i = 0; i < device_num; i++) {
     device_id_ptr[i] = static_cast<uint32_t>(device_list[i]);
   }
-  GELOGD("Prof stop: runtime config param: 0x%llx, device num: %d", module, device_num);
+  GELOGI("Prof stop: runtime config param: 0x%llx, device num: %d", module, device_num);
   rtError_t rt_ret = rtProfilerStop(module, device_num, device_id_ptr.get());
   if (rt_ret != RT_ERROR_NONE) {
     GELOGE(FAILED, "Prof stop: runtime profiler config proc failed.");
@@ -921,7 +719,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::ProfSt
     GELOGW("Prof stop: load model module is invalid.");
   }
   UpdateDeviceIdModuleMap(kProfStop, module, device_list);
-  GELOGD("Prof stop profiling success.");
+  GELOGI("Prof stop profiling success.");
 #endif
   return SUCCESS;
 }
@@ -963,47 +761,104 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ProfilingManager::Profilin
   if (rt_ret != RT_ERROR_NONE) {
     GELOGE(rt_ret, "Runtime get logic_device_id failed, current logic_device_id:%d", logic_device_id);
   }
-  GELOGD("Current logic_device_id:%d", logic_device_id);
+  GELOGI("Current logic_device_id:%d", logic_device_id);
 
   bool execute_model_prof_on = false;
   auto iter = std::find(device_id_.begin(), device_id_.end(), logic_device_id);
   if (iter != device_id_.end()) {
     execute_model_prof_on = true;
   }
-  GELOGD("Flag is_execute_profiling: %d, execute_model_prof_on: %d", is_execute_profiling_, execute_model_prof_on);
-  return is_execute_profiling_ || execute_model_prof_on;
+  GELOGI("Flag is_execute_profiling: %d, execute_model_prof_on: %d", is_execute_profiling_, execute_model_prof_on);
+  return  execute_model_prof_on;
 }
 
-/**
- * @brief Profiling PluginImpl
- */
-// PluginImpl static variable init
-Msprof::Engine::Reporter *PluginImpl::reporter_ = nullptr;
-
-PluginImpl::PluginImpl(const std::string &module) : module_(module) { GELOGI("Create PluginImpl\n"); }
-
-int PluginImpl::Init(const Msprof::Engine::Reporter *reporter) {
-  GELOGI("PluginImpl init");
-  reporter_ = const_cast<Msprof::Engine::Reporter *>(reporter);
-  return 0;
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::PluginInit() const {
+  if (prof_cb_.msprofReporterCallback == nullptr) {
+    GELOGE(ge::PARAM_INVALID, "MsprofReporterCallback callback is nullptr.");
+    return ge::PARAM_INVALID;
+  }
+  return prof_cb_.msprofReporterCallback(
+      static_cast<uint32_t>(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK),
+      static_cast<uint32_t>(MsprofReporterCallbackType::MSPROF_REPORTER_INIT),
+      nullptr, 0);
 }
 
-int PluginImpl::UnInit() {
-  GELOGI("PluginImpl Uninit");
-  reporter_ = nullptr;
-  return 0;
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::PluginUnInit() const {
+#ifdef DAVINCI_SUPPORT_PROFILING
+  if (prof_cb_.msprofReporterCallback == nullptr) {
+    GELOGE(ge::PARAM_INVALID, "MsprofReporterCallback callback is nullptr.");
+    return;
+  }
+  int32_t cb_ret = prof_cb_.msprofReporterCallback(
+      static_cast<uint32_t>(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK),
+      static_cast<uint32_t>(MsprofReporterCallbackType::MSPROF_REPORTER_UNINIT),
+      nullptr, 0);
+  if (cb_ret != 0) {
+    GELOGW("profiling plugin uninit failed, ret:%d", cb_ret);
+  }
+#endif
 }
 
-Msprof::Engine::PluginIntf *ProfilingEngineImpl::CreatePlugin() {
-  GELOGI(" Create Plugin");
-  return new (std::nothrow) PluginImpl(GE_PROFILING_MODULE);
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::CallMsprofReport(
+    ReporterData &reporter_data) const {
+  if (prof_cb_.msprofReporterCallback == nullptr) {
+    GELOGE(ge::PARAM_INVALID, "MsprofReporterCallback callback is nullptr.");
+    return ge::PARAM_INVALID;
+  }
+  return prof_cb_.msprofReporterCallback(
+      static_cast<uint32_t>(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK),
+      static_cast<uint32_t>(MsprofReporterCallbackType::MSPROF_REPORTER_REPORT),
+      static_cast<void *>(&reporter_data), sizeof(ReporterData));
 }
 
-int ProfilingEngineImpl::ReleasePlugin(Msprof::Engine::PluginIntf *plugin) {
-  if (plugin != nullptr) {
-    delete plugin;
-    plugin = nullptr;
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::GetFpBpPoint(
+    std::string &fp_point, std::string &bp_point) {
+  // Env or options mode, fp_point_/bp_point_ have initiliazed on profiling init
+  if (!fp_point_.empty() && !bp_point_.empty()) {
+    fp_point = fp_point_;
+    bp_point = bp_point_;
+    GELOGI("Bp Fp have been initialized in env or options. bp_point: %s, fp_point: %s", bp_point.c_str(),
+           fp_point.c_str());
+    return;
+  }
+  // ProfApi mode and training trace is set
+  // Parse options first
+  char env_profiling_options[MSPROF_OPTIONS_DEF_LEN_MAX] = { 0x00 };
+  bool is_profiling_valid = false;
+  std::string profiling_options;
+  if (ge::GetContext().GetOption(OPTION_EXEC_PROFILING_OPTIONS, profiling_options) == SUCCESS &&
+      !profiling_options.empty()) {
+    is_profiling_valid = true;
+  } else {
+    INT32 ret = mmGetEnv("PROFILING_OPTIONS", env_profiling_options, MSPROF_OPTIONS_DEF_LEN_MAX);
+    if (ret != EN_OK) {
+      GELOGI("PROFILING_OPTIONS env is not exist.");
+      return;
+    }
+    GELOGI("Parse env PROFILING_OPTIONS:%s.", env_profiling_options);
+    profiling_options = env_profiling_options;
+    is_profiling_valid = true;
+  }
+  if (is_profiling_valid) {
+    try {
+      Json prof_options = Json::parse(profiling_options);
+
+      fp_point_ = prof_options[kFpPoint];
+      bp_point_ = prof_options[kBpPoint];
+
+      fp_point = fp_point_;
+      bp_point = bp_point_;
+      if (!fp_point_.empty() && !bp_point_.empty()) {
+        GELOGI("Training trace bp fp is set, bp_point:%s, fp_point:%s.", bp_point_.c_str(), fp_point_.c_str());
+      }
+    } catch (...) {
+      GELOGW("Json prof options is invalid.");
+      return;
+    }
   }
-  return 0;
+
+  return;
 }
+
+
 }  // namespace ge
diff --git a/ge/common/profiling/profiling_manager.h b/ge/common/profiling/profiling_manager.h
index 66cefc32..22fa8f8c 100755
--- a/ge/common/profiling/profiling_manager.h
+++ b/ge/common/profiling/profiling_manager.h
@@ -26,9 +26,7 @@
 #include "framework/common/ge_inner_error_codes.h"
 #include "framework/common/ge_types.h"
 #include "external/register/register_types.h"
-#include "toolchain/prof_engine.h"
-#include "toolchain/prof_mgr_core.h"
-#include "toolchain/prof_acl_api.h"
+#include "toolchain/prof_callback.h"
 
 using std::map;
 using std::string;
@@ -37,35 +35,33 @@ using Json = nlohmann::json;
 
 namespace {
   const std::string GE_PROFILING_MODULE = "Framework";
+  // DataTypeConfig MASK
+  const uint64_t PROF_ACL_API_MASK = 0x0001;
+  const uint64_t PROF_TASK_TIME_MASK = 0x0002;
+  const uint64_t PROF_AICORE_METRICS_MASK = 0x0004;
+  const uint64_t PROF_AICPU_TRACE_MASK = 0x0008;
+  const uint64_t PROF_MODEL_EXECUTE_MASK = 0x0010;
+  const uint64_t PROF_RUNTIME_API_MASK = 0x0020;
+  const uint64_t PROF_RUNTIME_TRACE_MASK = 0x0040;
+  const uint64_t PROF_SCHEDULE_TIMELINE_MASK = 0x0080;
+  const uint64_t PROF_SCHEDULE_TRACE_MASK = 0x0100;
+  const uint64_t PROF_AIVECTORCORE_METRICS_MASK = 0x0200;
+  const uint64_t PROF_SUBTASK_TIME_MASK = 0x0400;
+  const uint64_t PROF_TRAINING_TRACE_MASK = 0x0800;
+  const uint64_t PROF_HCCL_TRACE_MASK = 0x1000;
+  const uint64_t PROF_DATA_PROCESS_MASK = 0x2000;
+  const uint64_t PROF_MODEL_LOAD_MASK = 0x8000000000000000;
+
 }  // namespace
 namespace ge {
 struct DeviceSubsInfo {
   uint64_t module;
   uint32_t subscribe_count;
 };
-// register Plugin
-class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY PluginImpl : public Msprof::Engine::PluginIntf {
- public:
-  explicit PluginImpl(const std::string &module);
-  ~PluginImpl() {}
-
-  int Init(const Msprof::Engine::Reporter *reporter);
-  int UnInit();
-  static Msprof::Engine::Reporter *GetPluginReporter() { return reporter_; }
 
- private:
-  static Msprof::Engine::Reporter *reporter_;
-  std::string module_;
-};
-
-// register Engine
-class ProfilingEngineImpl : public Msprof::Engine::EngineIntf {
- public:
-  ProfilingEngineImpl() {}
-  ~ProfilingEngineImpl() {}
-
-  Msprof::Engine::PluginIntf *CreatePlugin();
-  int ReleasePlugin(Msprof::Engine::PluginIntf *plugin);
+struct MsprofCallback {
+  MsprofCtrlCallback msprofCtrlCallback;
+  MsprofReporterCallback msprofReporterCallback;
 };
 
 class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager {
@@ -73,68 +69,55 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager {
   ProfilingManager();
   virtual ~ProfilingManager();
   static ProfilingManager &Instance();
-  ge::Status Init(const Options &options);
-  ge::Status InitFromOptions(const Options &options);
-  ge::Status InitFromAclCfg(const std::string &config);
-  ge::Status StartProfiling(int32_t iter, int32_t device_id);
-  void UpdateSubscribeDeviceModuleMap(std::string prof_type, uint32_t device_id, uint64_t module);
-  ge::Status ProfModelSubscribe(uint64_t module, void *model);
-  ge::Status ProfModelUnsubscribe(void *model);
-  ge::Status ProfInit(uint64_t module);
-  ge::Status ProfFinalize();
-  ge::Status ProfStartProfiling(uint64_t module, const std::map<std::string, std::string> &config_para);
-  ge::Status ProfStopProfiling(uint64_t module, const std::map<std::string, std::string> &config_para);
+  Status Init(const Options &options);
+  Status ProfInit(uint64_t module);
+  Status ProfFinalize();
+  Status ProfStartProfiling(uint64_t module, const std::map<std::string, std::string> &config_para);
+  Status ProfStopProfiling(uint64_t module, const std::map<std::string, std::string> &config_para);
+  Status ProfModelSubscribe(uint64_t module, void *model);
+  Status ProfModelUnsubscribe(void *model);
   void StopProfiling();
-  bool ProfilingOpTraceOn() const { return is_op_trace_; }
-  bool ProfilingLoadFlag() const { return is_load_; }
   bool ProfilingTrainingTraceOn() const { return is_training_trace_; }
   bool ProfilingModelLoadOn() const { return is_load_profiling_; }
   bool ProfilingModelExecuteOn() const;
-  bool ProfilingOn() const { return is_load_profiling_ && is_execute_profiling_; } // only used  by command pattern
-  bool IsAclApiMode() const { return is_acl_api_mode_; }
-  int32_t GetOpTraceIterNum() const { return op_trace_iter_num_; }
+  // is_execute_profiling_ only used by ge option and env
+  bool ProfilingOn() const { return is_load_profiling_ && is_execute_profiling_; }
   void ReportProfilingData(uint32_t model_id, const std::vector<TaskDescInfo> &task_desc_info,
-                           const std::vector<ComputeGraphDescInfo> &compute_graph_desc_info,
-                           bool check_device);
-  void Report(const int32_t &device_id, const string &data, Msprof::Engine::Reporter &reporter,
-              Msprof::Engine::ReporterData &reporter_data);
+                           const std::vector<ComputeGraphDescInfo> &compute_graph_desc_info);
   void ProfilingTaskDescInfo(uint32_t model_id, const std::vector<TaskDescInfo> &task_desc_info,
                              const int32_t &device_id);
   void ProfilingGraphDescInfo(uint32_t model_id, const std::vector<ComputeGraphDescInfo> &compute_graph_desc_info,
                               const int32_t &device_id);
-  void SetProfilingConfig(const string &profiling_cfg);
-  vector<int32_t> GetProfilingDeviceId() const { return  device_id_; }
-  void PluginUnInit(const std::string &module) const;
+  Status PluginInit() const;
+  void PluginUnInit() const;
+  Status CallMsprofReport(ReporterData &reporter_data) const;
+  struct MsprofCallback &GetMsprofCallback() { return prof_cb_; }
+  void SetMsprofCtrlCallback(MsprofCtrlCallback func) { prof_cb_.msprofCtrlCallback = func; }
+  void SetMsprofReporterCallback(MsprofReporterCallback func) { prof_cb_.msprofReporterCallback = func; }
+  void GetFpBpPoint(std::string &fp_point, std::string &bp_point);
  private:
-  ge::Status ParseFeaturesFromAclCfg(const Json &feature);
-  ge::Status ProfParseParam(const std::map<std::string, std::string> &config_para, int32_t &device_num,
-                            vector<int32_t> &device_list);
-  ge::Status ProfParseDeviceId(const std::map<std::string, std::string> &config_para,
+  Status InitFromOptions(const Options &options, MsprofGeOptions &prof_conf);
+  Status ParseOptions(const std::string &options);
+  Status ProfParseParam(const std::map<std::string, std::string> &config_para, int32_t &device_num,
+                        vector<int32_t> &device_list);
+  Status ProfParseDeviceId(const std::map<std::string, std::string> &config_para,
                                vector<int32_t> &device_list);
   uint64_t GetProfilingModule();
+  void GraphDescReport(const int32_t &device_id, const string &data);
   void UpdateDeviceIdModuleMap(string prof_type, uint64_t module, const vector<int32_t> &device_list);
-  bool is_load_profiling_ = false;
-  bool is_execute_profiling_ = false;
-  bool is_op_trace_ = false;
-  bool is_load_ = false;
-  bool is_training_trace_ = false;
-  bool is_acl_api_mode_ = false;
-  int32_t op_trace_iter_num_ = 0;
-  string job_id_;
-  string prof_dir_;
+  void UpdateSubscribeDeviceModuleMap(std::string prof_type, uint32_t device_id, uint64_t module);
+
+  bool is_load_profiling_;
+  bool is_execute_profiling_;
+  bool is_training_trace_;
   vector<int32_t> device_id_;
-  vector<string> op_trace_conf_;
-  vector<string> profiling_opts_;
-  vector<void *> prof_handle_vec_;
-  string recv_profiling_config_;
-  string send_profiling_config_;
-  string system_trace_conf_;
-  string task_trace_conf_;
-  const ProfilingEngineImpl engine_;
   map<int32_t, uint64_t> device_id_module_map_; // key: device_id, value: profiling on module
   map<uint32_t, DeviceSubsInfo> subs_dev_module_; // key: device_id, value: profiling on module
   uint32_t subscribe_count_;
   std::mutex mutex_;
+  MsprofCallback prof_cb_;
+  std::string fp_point_;
+  std::string bp_point_;
 };
 }  // namespace ge
 #endif  // GE_COMMON_PROFILING_PROFILING_MANAGER_H_
diff --git a/ge/common/proto/ge_ir.proto b/ge/common/proto/ge_ir.proto
index e7bfe0cb..12989a54 100644
--- a/ge/common/proto/ge_ir.proto
+++ b/ge/common/proto/ge_ir.proto
@@ -30,6 +30,7 @@ enum DataType
     DT_RESOURCE  = 23;         // resource type
     DT_STRING_REF = 24;        // string_ref type
     DT_DUAL      = 25;              /**< dual output type */
+    DT_VARIANT = 26;           // variant type
 }
 
 message AttrDef
diff --git a/ge/common/proto/op_mapping_info.proto b/ge/common/proto/op_mapping_info.proto
index e23b7ebe..7fb6f84b 100644
--- a/ge/common/proto/op_mapping_info.proto
+++ b/ge/common/proto/op_mapping_info.proto
@@ -15,6 +15,7 @@ message Output {
     int32 original_output_data_type = 7;
     int32 original_output_format = 8;
     uint64 size = 9;
+    Shape origin_shape = 10;
 }
 
 message Input {
@@ -23,6 +24,7 @@ message Input {
     Shape shape = 3;
     uint64 address = 4;
     uint64 size = 5;
+    Shape origin_shape = 6;
 }
 
 enum BufferType {
diff --git a/ge/common/proto/tensorflow/attr_value.proto b/ge/common/proto/tensorflow/attr_value.proto
index 1cc67d62..438d7163 100644
--- a/ge/common/proto/tensorflow/attr_value.proto
+++ b/ge/common/proto/tensorflow/attr_value.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/common/proto/tensorflow/function.proto b/ge/common/proto/tensorflow/function.proto
index 075897c6..44681e32 100644
--- a/ge/common/proto/tensorflow/function.proto
+++ b/ge/common/proto/tensorflow/function.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/common/proto/tensorflow/graph.proto b/ge/common/proto/tensorflow/graph.proto
index d639a7d6..73bfc6ee 100644
--- a/ge/common/proto/tensorflow/graph.proto
+++ b/ge/common/proto/tensorflow/graph.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/common/proto/tensorflow/graph_library.proto b/ge/common/proto/tensorflow/graph_library.proto
index e393d38d..7bca0838 100644
--- a/ge/common/proto/tensorflow/graph_library.proto
+++ b/ge/common/proto/tensorflow/graph_library.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/common/proto/tensorflow/node_def.proto b/ge/common/proto/tensorflow/node_def.proto
index b9bc97ee..50cf5cac 100644
--- a/ge/common/proto/tensorflow/node_def.proto
+++ b/ge/common/proto/tensorflow/node_def.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/common/proto/tensorflow/op_def.proto b/ge/common/proto/tensorflow/op_def.proto
index 3485d045..7f0e8ce2 100644
--- a/ge/common/proto/tensorflow/op_def.proto
+++ b/ge/common/proto/tensorflow/op_def.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/common/proto/tensorflow/resource_handle.proto b/ge/common/proto/tensorflow/resource_handle.proto
index a3452351..91c46c9a 100644
--- a/ge/common/proto/tensorflow/resource_handle.proto
+++ b/ge/common/proto/tensorflow/resource_handle.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/common/proto/tensorflow/tensor.proto b/ge/common/proto/tensorflow/tensor.proto
index d0a4d024..48eeb6c4 100644
--- a/ge/common/proto/tensorflow/tensor.proto
+++ b/ge/common/proto/tensorflow/tensor.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/common/proto/tensorflow/tensor_shape.proto b/ge/common/proto/tensorflow/tensor_shape.proto
index 4225a2e3..3a6d8c5a 100644
--- a/ge/common/proto/tensorflow/tensor_shape.proto
+++ b/ge/common/proto/tensorflow/tensor_shape.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 // Protocol buffer representing the shape of tensors.
 
 syntax = "proto3";
diff --git a/ge/common/proto/tensorflow/types.proto b/ge/common/proto/tensorflow/types.proto
index ba7a72b3..f40e49cb 100644
--- a/ge/common/proto/tensorflow/types.proto
+++ b/ge/common/proto/tensorflow/types.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/common/proto/tensorflow/versions.proto b/ge/common/proto/tensorflow/versions.proto
index 48061218..4e81548f 100644
--- a/ge/common/proto/tensorflow/versions.proto
+++ b/ge/common/proto/tensorflow/versions.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/common/types.cc b/ge/common/types.cc
index 54dc769f..268e7caa 100644
--- a/ge/common/types.cc
+++ b/ge/common/types.cc
@@ -480,6 +480,9 @@ REGISTER_OPTYPE_DEFINE(HVDWAIT, "HorovodWait");
 // aicpu op for online_infer dynamic_dims
 REGISTER_OPTYPE_DEFINE(GETDYNAMICDIMS, "GetDynamicDims");
 
+// profiling training trace node
+REGISTER_OPTYPE_DEFINE(PROFILINGTRAININGTRACE, "ProfilingTrainingTrace");
+
 const std::string MODEL_ATTR_TASKS = "tasks";
 const std::string MODEL_ATTR_TASK_GEN_BASE_ADDR = "task_gen_base_addr";
 const std::string MODEL_ATTR_TASK_GEN_WEIGHT_ADDR = "task_gen_weight_addr";
@@ -801,7 +804,7 @@ const uint32_t XRGB_CHN_NUM = 4;
 ///
 const bool DEFAULT_GLOBAL_POOLING = false;
 
-const uint32_t MODEL_VERSION = 0x10000000; ///< Model version 1.0///
+const uint32_t MODEL_VERSION = 0x20000000; ///< Model version 2.0///
 
 // Eltwise's input size
 const int ELTWISE_MIN_INPUT_SIZE = 2;
diff --git a/ge/common/util.cc b/ge/common/util.cc
index 480be3c1..0a343a83 100644
--- a/ge/common/util.cc
+++ b/ge/common/util.cc
@@ -51,14 +51,15 @@ namespace {
  * If such an exception is encountered during operation,
  * the proto file can be divided into several small files or the limit value can be increased.
  */
-const int kProtoReadBytesLimit = INT_MAX;     // Max size of 2 GB minus 1 byte.
-const int kWarningThreshold = 536870912 * 2;  // 536870912 represent 512M
+const int kFileSizeOutLimitedOrOpenFailed = -1;
+const int kProtoReadBytesLimit = INT_MAX;  // Max size of 2 GB minus 1 byte.
+const int kWarningThreshold = 1073741824;  // 536870912 * 2 536870912 represent 512M
 
 /// The maximum length of the file.
-const uint32_t kMaxFileSizeLimit = UINT32_MAX; // 4G for now
+const uint32_t kMaxFileSizeLimit = UINT32_MAX;  // 4G for now
 const int kMaxBuffSize = 256;
 const char *const kPathValidReason = "The path can only contain 'a-z' 'A-Z' '0-9' '-' '.' '_' and chinese character";
-constexpr uint32_t kMaxConfigFileByte = 10 * 1024 * 1024;
+constexpr uint32_t kMaxConfigFileByte = 10485760;  // 10 * 1024 * 1024
 }  // namespace
 
 namespace ge {
@@ -76,7 +77,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadProtoFromBinaryFile(co
   std::string real_path = RealPath(file);
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(real_path.empty(), return false, "pb file path '%s' not valid", file);
 
-  GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(GetFileLength(real_path) == -1, return false, "file size not valid.");
+  GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(GetFileLength(real_path) == kFileSizeOutLimitedOrOpenFailed, return false,
+                                 "file size not valid.");
 
   std::ifstream fs(real_path, std::ifstream::in | std::ifstream::binary);
   if (!fs.is_open()) {
@@ -118,20 +120,20 @@ long GetFileLength(const std::string &input_file) {
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(real_path.empty(), return -1, "input_file path '%s' not valid", input_file.c_str());
   unsigned long long file_length = 0;
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(
-      mmGetFileSize(input_file.c_str(), &file_length) != EN_OK,
-      ErrorManager::GetInstance().ATCReportErrMessage("E19001", {"file", "errmsg"}, {input_file, strerror(errno)});
-      return -1, "Open file[%s] failed. %s", input_file.c_str(), strerror(errno));
+    mmGetFileSize(input_file.c_str(), &file_length) != EN_OK,
+    ErrorManager::GetInstance().ATCReportErrMessage("E19001", {"file", "errmsg"}, {input_file, strerror(errno)});
+    return kFileSizeOutLimitedOrOpenFailed, "Open file[%s] failed. %s", input_file.c_str(), strerror(errno));
 
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG((file_length == 0),
                                  ErrorManager::GetInstance().ATCReportErrMessage("E19015", {"filepath"}, {input_file});
                                  return -1, "File[%s] size is 0, not valid.", input_file.c_str());
 
-  GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(file_length > kMaxFileSizeLimit,
-                                 ErrorManager::GetInstance().ATCReportErrMessage(
-                                     "E19016", {"filepath", "filesize", "maxlen"},
-                                     {input_file, std::to_string(file_length), std::to_string(kMaxFileSizeLimit)});
-                                 return -1, "File[%s] size %lld is out of limit: %d.", input_file.c_str(), file_length,
-                                        kMaxFileSizeLimit);
+  GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(
+    file_length > kMaxFileSizeLimit, ErrorManager::GetInstance().ATCReportErrMessage(
+                                       "E19016", {"filepath", "filesize", "maxlen"},
+                                       {input_file, std::to_string(file_length), std::to_string(kMaxFileSizeLimit)});
+    return kFileSizeOutLimitedOrOpenFailed, "File[%s] size %lld is out of limit: %d.", input_file.c_str(), file_length,
+           kMaxFileSizeLimit);
   return static_cast<long>(file_length);
 }
 
@@ -187,7 +189,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadBytesFromBinaryFile(co
   std::streamsize size = file.tellg();
 
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG((size <= 0), file.close(); return false, "file length <= 0, not valid.");
-  GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(size > static_cast<int64_t >(kMaxFileSizeLimit), file.close();
+  GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(size > static_cast<int64_t>(kMaxFileSizeLimit), file.close();
                                  return false, "file size %ld is out of limit: %d.", size, kMaxFileSizeLimit);
 
   file.seekg(0, std::ios::beg);  // [no need to check value]
@@ -210,8 +212,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int CreateDirectory(const std::
   GE_CHK_BOOL_EXEC(!directory_path.empty(), return -1, "directory path is empty.");
   auto dir_path_len = directory_path.length();
   if (dir_path_len >= MMPA_MAX_PATH) {
-    ErrorManager::GetInstance().ATCReportErrMessage(
-        "E19002", {"filepath", "size"}, {directory_path, std::to_string(MMPA_MAX_PATH)});
+    ErrorManager::GetInstance().ATCReportErrMessage("E19002", {"filepath", "size"},
+                                                    {directory_path, std::to_string(MMPA_MAX_PATH)});
     GELOGW("Path[%s] len is too long, it must be less than %d", directory_path.c_str(), MMPA_MAX_PATH);
     return -1;
   }
@@ -224,8 +226,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int CreateDirectory(const std::
         if (ret != 0) {
           if (errno != EEXIST) {
             ErrorManager::GetInstance().ATCReportErrMessage("E19006", {"path"}, {directory_path});
-            GELOGW("Can not create directory %s. Make sure the directory exists and writable.",
-                   directory_path.c_str());
+            GELOGW("Can not create directory %s. Make sure the directory exists and writable.", directory_path.c_str());
             return ret;
           }
         }
@@ -265,7 +266,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadProtoFromText(const ch
 
   std::string real_path = RealPath(file);
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(real_path.empty(), ErrorManager::GetInstance().ATCReportErrMessage(
-                                                        "E19000", {"path", "errmsg"}, {file, strerror(errno)});
+                                                      "E19000", {"path", "errmsg"}, {file, strerror(errno)});
                                  return false, "Path[%s]'s realpath is empty, errmsg[%s]", file, strerror(errno));
 
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(GetFileLength(real_path) == -1, return false, "file size not valid.");
@@ -301,13 +302,13 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadProtoFromMem(const cha
   google::protobuf::io::IstreamInputStream input(&fs);
   bool ret = google::protobuf::TextFormat::Parse(&input, message);
   GE_IF_BOOL_EXEC(
-      !ret, GELOGE(ret, "Call [google::protobuf::TextFormat::Parse] func ret fail, please check your text file."));
+    !ret, GELOGE(ret, "Call [google::protobuf::TextFormat::Parse] func ret fail, please check your text file."));
 
   return ret;
 }
 
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint64_t GetCurrentTimestamp() {
-  mmTimeval tv {};
+  mmTimeval tv{};
   int ret = mmGetTimeOfDay(&tv, nullptr);
   GE_LOGE_IF(ret != EN_OK, "Func gettimeofday may failed: ret=%d", ret);
   auto total_use_time = tv.tv_usec + tv.tv_sec * 1000000;  // 1000000: seconds to microseconds
@@ -315,7 +316,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint64_t GetCurrentTimestamp()
 }
 
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint32_t GetCurrentSecondTimestap() {
-  mmTimeval tv {};
+  mmTimeval tv{};
   int ret = mmGetTimeOfDay(&tv, nullptr);
   GE_LOGE_IF(ret != EN_OK, "Func gettimeofday may failed: ret=%d", ret);
   auto total_use_time = tv.tv_sec;  // seconds
@@ -350,8 +351,9 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool CheckInt64MulOverflow(int6
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY std::string RealPath(const char *path) {
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(path == nullptr, return "", "path pointer is NULL.");
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(strlen(path) >= MMPA_MAX_PATH,
-      ErrorManager::GetInstance().ATCReportErrMessage("E19002", {"filepath", "size"}, {path, std::to_string(MMPA_MAX_PATH)});
-      return "", "Path[%s] len is too long, it must be less than %d", path, MMPA_MAX_PATH);
+                                 ErrorManager::GetInstance().ATCReportErrMessage("E19002", {"filepath", "size"},
+                                                                                 {path, std::to_string(MMPA_MAX_PATH)});
+                                 return "", "Path[%s] len is too long, it must be less than %d", path, MMPA_MAX_PATH);
 
   // Nullptr is returned when the path does not exist or there is no permission
   // Return absolute path when path is accessible
@@ -385,16 +387,16 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool CheckInputPathValid(const
   // Path section: Support upper and lower case letters, numbers dots(.) chinese and underscores
   // File name section: Support upper and lower case letters, numbers, underscores chinese and dots(.)
 #ifdef __GNUC__
-        std::string mode = "^[\u4e00-\u9fa5A-Za-z0-9./_-]+$";
+  std::string mode = "^[\u4e00-\u9fa5A-Za-z0-9./_-]+$";
 #else
-        std::string mode = "^[a-zA-Z]:([\\\\/][^\\s\\\\/:*?<>\"|][^\\\\/:*?<>\"|]*)*([/\\\\][^\\s\\\\/:*?<>\"|])?$";
+  std::string mode = "^[a-zA-Z]:([\\\\/][^\\s\\\\/:*?<>\"|][^\\\\/:*?<>\"|]*)*([/\\\\][^\\s\\\\/:*?<>\"|])?$";
 #endif
 
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(
-      !ValidateStr(real_path, mode),
-      ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"},
-                                                      {atc_param, real_path, kPathValidReason});
-      return false, "Invalid value for %s[%s], %s.", atc_param.c_str(), real_path.c_str(), kPathValidReason);
+    !ValidateStr(real_path, mode),
+    ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"},
+                                                    {atc_param, real_path, kPathValidReason});
+    return false, "Invalid value for %s[%s], %s.", atc_param.c_str(), real_path.c_str(), kPathValidReason);
 
   // The absolute path points to a file that is not readable
   if (mmAccess2(real_path.c_str(), M_R_OK) != EN_OK) {
@@ -416,24 +418,25 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool CheckOutputPathValid(const
   }
 
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(strlen(file_path.c_str()) >= MMPA_MAX_PATH,
-      ErrorManager::GetInstance().ATCReportErrMessage(
-          "E19002", {"filepath", "size"}, {file_path, std::to_string(MMPA_MAX_PATH)});
-      return "", "Path[%s] len is too long, it must be less than %d", file_path.c_str(), MMPA_MAX_PATH);
+                                 ErrorManager::GetInstance().ATCReportErrMessage(
+                                   "E19002", {"filepath", "size"}, {file_path, std::to_string(MMPA_MAX_PATH)});
+                                 return "", "Path[%s] len is too long, it must be less than %d", file_path.c_str(),
+                                        MMPA_MAX_PATH);
 
   // A regular matching expression to verify the validity of the input file path
   // Path section: Support upper and lower case letters, numbers dots(.) chinese and underscores
   // File name section: Support upper and lower case letters, numbers, underscores chinese and dots(.)
 #ifdef __GNUC__
-     std::string mode = "^[\u4e00-\u9fa5A-Za-z0-9./_-]+$";
+  std::string mode = "^[\u4e00-\u9fa5A-Za-z0-9./_-]+$";
 #else
-     std::string mode = "^[a-zA-Z]:([\\\\/][^\\s\\\\/:*?<>\"|][^\\\\/:*?<>\"|]*)*([/\\\\][^\\s\\\\/:*?<>\"|])?$";
+  std::string mode = "^[a-zA-Z]:([\\\\/][^\\s\\\\/:*?<>\"|][^\\\\/:*?<>\"|]*)*([/\\\\][^\\s\\\\/:*?<>\"|])?$";
 #endif
 
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(
-      !ValidateStr(file_path, mode),
-      ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"},
-                                                      {atc_param, file_path, kPathValidReason});
-      return false, "Invalid value for %s[%s], %s.", atc_param.c_str(), file_path.c_str(), kPathValidReason);
+    !ValidateStr(file_path, mode),
+    ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"},
+                                                    {atc_param, file_path, kPathValidReason});
+    return false, "Invalid value for %s[%s], %s.", atc_param.c_str(), file_path.c_str(), kPathValidReason);
 
   std::string real_path = RealPath(file_path.c_str());
   // Can get absolute path (file exists)
diff --git a/ge/executor/CMakeLists.txt b/ge/executor/CMakeLists.txt
index d7dfdc84..755bdf97 100644
--- a/ge/executor/CMakeLists.txt
+++ b/ge/executor/CMakeLists.txt
@@ -17,6 +17,7 @@ set(SRC_LIST
     "../common/dump/dump_properties.cc"
     "../common/dump/dump_manager.cc"
     "../common/dump/dump_op.cc"
+    "../common/profiling/ge_profiling.cc"
     "../graph/load/graph_loader.cc"
     "../graph/execute/graph_execute.cc"
     "../omm/csa_interact.cc"
@@ -27,6 +28,7 @@ set(SRC_LIST
     "../graph/manager/trans_var_data_utils.cc"
     "../graph/manager/util/debug.cc"
     "../graph/manager/rdma_pool_allocator.cc"
+    "../graph/manager/host_mem_allocator.cc"
     "../hybrid/node_executor/aicpu/aicpu_ext_info.cc"
     "../model/ge_model.cc"
     "../model/ge_root_model.cc"
@@ -161,7 +163,7 @@ set(SRC_LIST
 add_library(ge_executor STATIC ${SRC_LIST} ${PROTO_HDRS})
 
 target_compile_options(ge_executor PRIVATE
-    $<$<OR:$<STREQUAL:${TARGET_SYSTEM_NAME},Linux>,$<STREQUAL:${TARGET_SYSTEM_NAME},Android>>:-fvisibility=hidden -O2 -Werror -Wno-deprecated-declarations>
+    $<$<OR:$<STREQUAL:${TARGET_SYSTEM_NAME},Linux>,$<STREQUAL:${TARGET_SYSTEM_NAME},Android>>:-fvisibility=hidden -O2 -Werror -Wno-deprecated-declarations -fno-common>
     $<$<AND:$<STREQUAL:${TARGET_SYSTEM_NAME},Windows>,$<STREQUAL:${CMAKE_CONFIGURATION_TYPES},Debug>>:/MTd>
     $<$<AND:$<STREQUAL:${TARGET_SYSTEM_NAME},Windows>,$<STREQUAL:${CMAKE_CONFIGURATION_TYPES},Release>>:/MT>
 )
@@ -172,6 +174,7 @@ target_compile_definitions(ge_executor PRIVATE
     google=ascend_private
     $<IF:$<STREQUAL:${TARGET_SYSTEM_NAME},Windows>,OS_TYPE=WIN,OS_TYPE=0>
     $<$<STREQUAL:${TARGET_SYSTEM_NAME},Windows>:SECUREC_USING_STD_SECURE_LIB=0 NOMINMAX>
+    LOG_CPP
 )
 
 target_include_directories(ge_executor PRIVATE
@@ -244,7 +247,6 @@ target_link_libraries(ge_executor_shared PRIVATE
     mmpa
     graph
     register
-    msprof
     error_manager
     ascend_hal_stub
     ascend_protobuf
diff --git a/ge/executor/ge_executor.cc b/ge/executor/ge_executor.cc
index d03a8d7b..0ea0e66d 100755
--- a/ge/executor/ge_executor.cc
+++ b/ge/executor/ge_executor.cc
@@ -39,8 +39,6 @@
 #include "graph/manager/graph_var_manager.h"
 #include "graph/load/new_model_manager/davinci_model.h"
 #include "opskernel_manager/ops_kernel_builder_manager.h"
-#include "graph/opsproto_manager.h"
-#include "ge_local_engine/engine/host_cpu_engine.h"
 
 using std::string;
 using std::vector;
@@ -209,46 +207,6 @@ bool IsDynmaicDimsSizeMatchModel(const vector<uint64_t> cur_dynamic_dims,
 
 namespace ge {
 bool GeExecutor::isInit_ = false;
-class ModelListenerAdapter : public ModelListener {
- public:
-  domi::Status OnComputeDone(uint32_t model_id, uint32_t dataIndex, uint32_t resultCode,
-                             std::vector<ge::OutputTensorInfo> &outputs) {
-    if (listener == nullptr) {
-      GELOGE(ge::FAILED, "listener is null.");
-      return FAILED;
-    }
-    return listener->OnComputeDone(model_id, dataIndex, resultCode, outputs);
-  }
-
-  std::shared_ptr<ge::ModelListener> listener;
-};
-
-static void InitOpsProtoManger() {
-  string opsproto_path;
-  const char *path_env = std::getenv("ASCEND_OPP_PATH");
-  if (path_env != nullptr) {
-    string path = path_env;
-    string file_path = RealPath(path.c_str());
-    if (file_path.empty()) {
-      GELOGE(FAILED, "File path %s is invalid.", path.c_str());
-      return;
-    }
-    opsproto_path = (path + "/op_proto/custom/" + ":") + (path + "/op_proto/built-in/");
-    GELOGI("Get opsproto so path from env : %s", path.c_str());
-  } else {
-    string path_base = PluginManager::GetPath();
-    GELOGI("path_base is %s", path_base.c_str());
-    path_base = path_base.substr(0, path_base.rfind('/'));
-    path_base = path_base.substr(0, path_base.rfind('/') + 1);
-    opsproto_path = (path_base + "ops/op_proto/custom/" + ":") + (path_base + "ops/op_proto/built-in/");
-  }
-
-  GELOGI("Get opsproto path is %s", opsproto_path.c_str());
-  OpsProtoManager *manager = OpsProtoManager::Instance();
-  map<string, string> option_tmp;
-  option_tmp.emplace(std::pair<string, string>(string("ge.opsProtoLibPath"), opsproto_path));
-  (void)manager->Initialize(option_tmp);
-}
 
 GeExecutor::GeExecutor() {}
 
@@ -259,16 +217,6 @@ Status GeExecutor::Initialize() {
     return ge::SUCCESS;
   }
 
-  OpTilingManager::GetInstance().LoadSo();
-
-  Status initHostCpuEngineStatus = HostCpuEngine::GetInstance().Initialize();
-  if (initHostCpuEngineStatus != SUCCESS) {
-    GELOGE(initHostCpuEngineStatus, "Failed to initialize HostCpuEngine");
-    return initHostCpuEngineStatus;
-  }
-
-  InitOpsProtoManger();
-
   std::vector<rtMemType_t> mem_type(1, RT_MEMORY_HBM);
   mem_type.push_back(RT_MEMORY_P2P_DDR);
   auto ret = MemManager::Instance().Initialize(mem_type);
@@ -283,7 +231,8 @@ Status GeExecutor::Initialize() {
   // Start profiling
   Options profiling_options;
   profiling_options.device_id = 0;
-  profiling_options.job_id = "";
+  // job id need to be set, the value is meaningless;
+  profiling_options.job_id = "1";
   ProfilingManager::Instance().Init(profiling_options);
 
   isInit_ = true;
@@ -303,7 +252,7 @@ Status GeExecutor::Finalize() {
   // Stop profiling
   if (ProfilingManager::Instance().ProfilingOn()) {
     ProfilingManager::Instance().StopProfiling();
-    ProfilingManager::Instance().PluginUnInit(GE_PROFILING_MODULE);
+    ProfilingManager::Instance().PluginUnInit();
   }
 
   GELOGI("Uninit GeExecutor over.");
@@ -572,60 +521,6 @@ Status GeExecutor::SetDynamicAippData(uint32_t model_id, void *dynamic_input_add
   return SUCCESS;
 }
 
-// Load model
-Status GeExecutor::LoadModelOffline(uint32_t &model_id, const std::string &path, const std::string &key,
-                                    int32_t priority, std::shared_ptr<ge::ModelListener> listener) {
-  GELOGI("load model offline begin.");
-  if (!isInit_) {
-    GELOGE(ACL_ERROR_GE_EXEC_NOT_INIT, "GeExecutor has not been initialized!");
-    return ACL_ERROR_GE_EXEC_NOT_INIT;
-  }
-
-  string filePath = RealPath(path.c_str());
-  if (filePath.empty()) {
-    GELOGE(ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID,
-           "File path is invalid. please check your text file '%s'.", path.c_str());
-    return ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID;
-  }
-
-  std::shared_ptr<ModelListenerAdapter> listener_adapter = MakeShared<ModelListenerAdapter>();
-  if (listener_adapter == nullptr) {
-    GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "ModelListenerAdapter make shared failed!");
-    return ACL_ERROR_GE_MEMORY_ALLOCATION;
-  }
-  listener_adapter->listener = listener;
-
-  Status ret = GraphLoader::LoadModelFromFile(path, key, priority, listener_adapter, model_id);
-  if (ret != SUCCESS) {
-    GELOGE(ret, "[GeExecutor] LoadModelFromFile failed");
-    return ACL_ERROR_GE_LOAD_MODEL;
-  }
-  return SUCCESS;
-}
-
-Status GeExecutor::LoadModel(uint32_t &model_id, const ModelData &model_data,
-                             std::shared_ptr<ge::ModelListener> listener) {
-  GELOGI("Load model begin.");
-  if (!isInit_) {
-    GELOGE(ACL_ERROR_GE_EXEC_NOT_INIT, "GeExecutor has not been initialized!");
-    return ACL_ERROR_GE_EXEC_NOT_INIT;
-  }
-
-  std::shared_ptr<ModelListenerAdapter> listener_adapter = MakeShared<ModelListenerAdapter>();
-  if (listener_adapter == nullptr) {
-    GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "ModelListenerAdapter make shared failed!");
-    return ACL_ERROR_GE_MEMORY_ALLOCATION;
-  }
-  listener_adapter->listener = listener;
-
-  Status ret = GraphLoader::LoadModel(model_data, listener_adapter, model_id);
-  if (ret != SUCCESS) {
-    GELOGE(ret, "[GeExecutor] LoadModel failed.");
-    return ACL_ERROR_GE_LOAD_MODEL;
-  }
-  return ret;
-}
-
 Status GeExecutor::UnloadModel(uint32_t model_id) {
   GELOGD("unload model %u begin.", model_id);
   if (!isInit_) {
@@ -635,10 +530,11 @@ Status GeExecutor::UnloadModel(uint32_t model_id) {
   Status ret = GraphLoader::DestroyAicpuSessionForInfer(model_id);
   if (ret != SUCCESS) {
     GELOGE(ret, "[GraphLoader] DestroyAicpuSessionForInfer failed. model id: %u", model_id);
-    return ACL_ERROR_GE_INTERNAL_ERROR;
+    return ret;
   }
 
-  std::shared_ptr<hybrid::HybridDavinciModel> hybrid_davinci_model = ModelManager::GetInstance()->GetHybridModel(model_id);
+  std::shared_ptr<hybrid::HybridDavinciModel> hybrid_davinci_model =
+      ModelManager::GetInstance()->GetHybridModel(model_id);
   if (hybrid_davinci_model != nullptr) {
     uint64_t session_id = hybrid_davinci_model->GetSessionId();
     VarManagerPool::Instance().RemoveVarManager(session_id);
@@ -652,26 +548,11 @@ Status GeExecutor::UnloadModel(uint32_t model_id) {
   ret = GraphLoader::UnloadModel(model_id);
   if (ret != SUCCESS) {
     GELOGE(ret, "[GraphLoader] DestroyAicpuSessionForInfer failed. model id: %u", model_id);
-    return ACL_ERROR_GE_UNLOAD_MODEL;
+    return ret;
   }
   return SUCCESS;
 }
 
-Status GeExecutor::RunModel(const ge::RunModelData &input_data, ge::RunModelData &output_data) {
-  GELOGI("run model begin.");
-  if (!isInit_) {
-    GELOGE(ACL_ERROR_GE_EXEC_NOT_INIT, "GeExecutor has not been initialized!");
-    return ACL_ERROR_GE_EXEC_NOT_INIT;
-  }
-
-  InputData inputs;
-  GetDomiInputData(input_data, inputs);
-  OutputData outputs;
-  GetDomiOutputData(output_data, outputs);
-
-  return GraphExecutor::DataInput(inputs, outputs);
-}
-
 // Get input and output descriptor
 Status GeExecutor::GetModelDescInfo(uint32_t model_id, std::vector<ge::TensorDesc> &input_desc,
                                     std::vector<ge::TensorDesc> &output_desc, bool new_model_desc) {
@@ -795,7 +676,7 @@ Status GeExecutor::GetAIPPInfo(uint32_t model_id, uint32_t index, AippConfigInfo
     GELOGE(ACL_ERROR_GE_EXEC_NOT_INIT, "not inited yet!");
     return ACL_ERROR_GE_EXEC_NOT_INIT;
   }
-  Status ret = GraphExecutor::GetAIPPInfo(model_id, index, aipp_info);
+  Status ret = GraphExecutor::GetAippInfo(model_id, index, aipp_info);
   if (ret != SUCCESS) {
     GELOGW("GetAIPPInfo is not success.");
     return ret;
@@ -832,43 +713,6 @@ Status GeExecutor::GetModelAttr(uint32_t model_id, std::vector<std::string> &dyn
   return SUCCESS;
 }
 
-Status GeExecutor::GetModelDescInfoForZeroCopy(uint32_t model_id, std::vector<ge::TensorDesc> &input_desc,
-                                               std::vector<TensorDesc> &output_desc) {
-  GELOGI("get model desc info for zero copy begin.");
-  if (!isInit_) {
-    GELOGE(ACL_ERROR_GE_EXEC_NOT_INIT, "GeExecutor has not been initialized!");
-    return ACL_ERROR_GE_EXEC_NOT_INIT;
-  }
-
-  std::vector<InputOutputDescInfo> input_desc_infos;
-  std::vector<InputOutputDescInfo> output_desc_infos;
-  std::vector<uint32_t> input_formats;
-  std::vector<uint32_t> output_formats;
-
-  Status ret = GraphExecutor::GetInputOutputDescInfoForZeroCopy(model_id, input_desc_infos, output_desc_infos,
-                                                                input_formats, output_formats);
-  if (ret != domi::SUCCESS) {
-    GELOGE(ret, "Get DescInfo from zero copy failed. ret = %u", ret);
-    return ACL_ERROR_GE_GET_TENSOR_INFO;
-  }
-
-  if (input_formats.size() != input_desc_infos.size()) {
-    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "input_formats.size() != input_desc_infos.size().");
-    return ACL_ERROR_GE_PARAM_INVALID;
-  }
-
-  if (output_formats.size() != output_desc_infos.size()) {
-    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "output_formats.size() != output_desc_infos.size().");
-    return ACL_ERROR_GE_PARAM_INVALID;
-  }
-
-  GetGeTensorDescFromDomiInfo(input_desc, input_desc_infos, input_formats);
-  GetGeTensorDescFromDomiInfo(output_desc, output_desc_infos, output_formats);
-
-  GELOGI("get model desc info from zero copy end.");
-  return ge::SUCCESS;
-}
-
 Status GeExecutor::CommandHandle(const Command &command) {
   Status ret = GraphLoader::CommandHandle(command);
   if (ret != SUCCESS) {
diff --git a/ge/executor/module.mk b/ge/executor/module.mk
index 9566ca64..87abdade 100644
--- a/ge/executor/module.mk
+++ b/ge/executor/module.mk
@@ -8,12 +8,14 @@ local_ge_executor_src_files :=  \
     ../common/dump/dump_op.cc \
     ../common/ge/plugin_manager.cc \
     ../common/ge/op_tiling_manager.cc \
+    ../common/profiling/ge_profiling.cc \
     ../graph/load/graph_loader.cc \
     ../graph/execute/graph_execute.cc \
     ../omm/csa_interact.cc \
     ../graph/manager/graph_manager_utils.cc \
     ../graph/manager/graph_var_manager.cc \
     ../graph/manager/rdma_pool_allocator.cc \
+    ../graph/manager/host_mem_allocator.cc \
     ../graph/manager/graph_mem_allocator.cc \
     ../graph/manager/graph_caching_allocator.cc \
     ../graph/manager/trans_var_data_utils.cc \
@@ -177,7 +179,6 @@ local_ge_executor_shared_library :=        \
     libmmpa                                \
     libgraph                               \
     libregister                            \
-    libmsprof                              \
     liberror_manager                       \
 
 local_ge_executor_ldflags := -lrt -ldl     \
@@ -234,7 +235,6 @@ LOCAL_SHARED_LIBRARIES :=                  \
     libmmpa                                \
     libgraph                               \
     libregister                            \
-    libmsprof                              \
     liberror_manager                       \
     stub/libascend_hal                     \
 
@@ -272,7 +272,6 @@ LOCAL_SHARED_LIBRARIES :=                  \
     libruntime                             \
     libslog                                \
     libmmpa                                \
-    libmsprof                              \
 
 LOCAL_LDFLAGS += $(local_ge_executor_ldflags)
 
@@ -304,7 +303,6 @@ LOCAL_SHARED_LIBRARIES :=                  \
     libruntime                             \
     libslog                                \
     libmmpa                                \
-    libmsprof                              \
 
 ifeq ($(device_os),android)
 LOCAL_LDFLAGS += -ldl
diff --git a/ge/executor/proto/dump_task.proto b/ge/executor/proto/dump_task.proto
index b1e346cd..ee1c6f47 100644
--- a/ge/executor/proto/dump_task.proto
+++ b/ge/executor/proto/dump_task.proto
@@ -28,6 +28,7 @@ enum OutputDataType {
     DT_RESOURCE = 23;
     DT_STRING_REF = 24;
     DT_DUAL = 25;
+    DT_VARIANT = 26;
 }
 
 enum OutputFormat {
@@ -108,4 +109,5 @@ message DumpData{
     repeated OpOutput output = 3;
     repeated OpInput input = 4;
     repeated OpBuffer buffer = 5;
+    string op_name = 6;
 }
diff --git a/ge/executor/proto/ge_ir.proto b/ge/executor/proto/ge_ir.proto
index e7bfe0cb..12989a54 100644
--- a/ge/executor/proto/ge_ir.proto
+++ b/ge/executor/proto/ge_ir.proto
@@ -30,6 +30,7 @@ enum DataType
     DT_RESOURCE  = 23;         // resource type
     DT_STRING_REF = 24;        // string_ref type
     DT_DUAL      = 25;              /**< dual output type */
+    DT_VARIANT = 26;           // variant type
 }
 
 message AttrDef
diff --git a/ge/executor/proto/op_mapping_info.proto b/ge/executor/proto/op_mapping_info.proto
index e23b7ebe..7fb6f84b 100644
--- a/ge/executor/proto/op_mapping_info.proto
+++ b/ge/executor/proto/op_mapping_info.proto
@@ -15,6 +15,7 @@ message Output {
     int32 original_output_data_type = 7;
     int32 original_output_format = 8;
     uint64 size = 9;
+    Shape origin_shape = 10;
 }
 
 message Input {
@@ -23,6 +24,7 @@ message Input {
     Shape shape = 3;
     uint64 address = 4;
     uint64 size = 5;
+    Shape origin_shape = 6;
 }
 
 enum BufferType {
diff --git a/ge/ge_inference.mk b/ge/ge_inference.mk
index 0987f148..6f9e60db 100755
--- a/ge/ge_inference.mk
+++ b/ge/ge_inference.mk
@@ -64,6 +64,7 @@ GRAPH_MANAGER_LOCAL_SRC_FILES := \
     graph/manager/graph_var_manager.cc \
     graph/manager/host_mem_manager.cc \
     graph/manager/rdma_pool_allocator.cc \
+    graph/manager/host_mem_allocator.cc \
     graph/manager/graph_mem_allocator.cc \
     graph/manager/graph_caching_allocator.cc \
 
@@ -102,6 +103,7 @@ OMG_HOST_SRC_FILES := \
     graph/passes/net_output_pass.cc \
     graph/passes/replace_transshape_pass.cc \
     graph/passes/constant_fuse_same_pass.cc \
+    graph/passes/fuse_data_nodes_with_common_input_pass.cc \
     graph/passes/print_op_pass.cc \
     graph/passes/no_use_reshape_remove_pass.cc \
     graph/passes/iterator_op_pass.cc \
@@ -109,6 +111,7 @@ OMG_HOST_SRC_FILES := \
     graph/passes/atomic_addr_clean_pass.cc \
     graph/passes/mark_same_addr_pass.cc \
     graph/passes/mark_graph_unknown_status_pass.cc \
+    graph/passes/dynamic_single_op_reset_shape_pass.cc \
     graph/passes/mark_agnostic_pass.cc \
     graph/common/omg_util.cc \
     graph/common/bcast.cc \
@@ -164,6 +167,7 @@ OMG_HOST_SRC_FILES := \
     host_kernels/slice_d_kernel.cc \
     host_kernels/dynamic_stitch_kernel.cc \
     host_kernels/identity_kernel.cc \
+    host_kernels/reformat_kernel.cc \
     graph/passes/stop_gradient_pass.cc \
     graph/passes/prevent_gradient_pass.cc \
     graph/passes/identity_pass.cc \
@@ -189,9 +193,12 @@ OMG_HOST_SRC_FILES := \
     graph/passes/control_trigger_pass.cc \
     graph/passes/cond_pass.cc \
     graph/passes/cond_remove_pass.cc \
+    graph/passes/remove_same_const_pass.cc \
+    graph/passes/useless_control_out_remove_pass.cc \
     graph/passes/for_pass.cc \
     graph/passes/enter_pass.cc \
-    graph/passes/assign_pass.cc \
+    graph/passes/assign_remove_pass.cc \
+    graph/passes/inplace_support_check_pass.cc \
     graph/passes/addn_pass.cc \
     graph/passes/common_subexpression_elimination_pass.cc \
     graph/passes/transop_symmetry_elimination_pass.cc \
diff --git a/ge/ge_local_engine/CMakeLists.txt b/ge/ge_local_engine/CMakeLists.txt
index 615a968f..7189e8ff 100755
--- a/ge/ge_local_engine/CMakeLists.txt
+++ b/ge/ge_local_engine/CMakeLists.txt
@@ -26,6 +26,7 @@ add_library(ge_local_engine SHARED ${SRC_LIST} ${PROTO_HDRS})
 
 target_compile_options(ge_local_engine PRIVATE
     -Werror
+    -fno-common
 )
 
 target_compile_definitions(ge_local_engine PRIVATE
@@ -55,10 +56,8 @@ target_link_libraries(ge_local_engine PRIVATE
     -Wl,--no-as-needed
     graph
     ascend_protobuf
-    register
     c_sec
     slog
-    runtime
     -Wl,--as-needed
 )
 
@@ -67,6 +66,7 @@ add_library(atc_ge_local_engine SHARED ${SRC_LIST} ${PROTO_HDRS})
 
 target_compile_options(atc_ge_local_engine PRIVATE
     -Werror
+    -fno-common
 )
 
 target_compile_definitions(atc_ge_local_engine PRIVATE
@@ -97,10 +97,8 @@ target_link_libraries(atc_ge_local_engine PRIVATE
     -Wl,--no-as-needed
     graph
     ascend_protobuf
-    register
     c_sec
     slog
-    runtime_compile
     -Wl,--as-needed
 )
 
@@ -114,6 +112,7 @@ add_library(ge_local_opskernel_builder SHARED ${OPS_KERNEL_SRC_LIST} ${PROTO_HDR
 
 target_compile_options(ge_local_opskernel_builder PRIVATE
     -Werror
+    -fno-common
 )
 
 target_compile_definitions(ge_local_opskernel_builder PRIVATE
@@ -154,6 +153,7 @@ add_library(atc_ge_local_opskernel_builder SHARED ${OPS_KERNEL_SRC_LIST} ${PROTO
 
 target_compile_options(atc_ge_local_opskernel_builder PRIVATE
     -Werror
+    -fno-common
 )
 
 target_compile_definitions(atc_ge_local_opskernel_builder PRIVATE
@@ -199,10 +199,12 @@ add_library(ge_local_opskernel_builder_static STATIC ${OPS_KERNEL_SRC_LIST} ${PR
 
 target_compile_options(ge_local_opskernel_builder_static PRIVATE
     -Werror
+    -fno-common
 )
 
 target_compile_definitions(ge_local_opskernel_builder_static PRIVATE
     google=ascend_private
+    LOG_CPP
 )
 
 target_include_directories(ge_local_opskernel_builder_static PRIVATE
diff --git a/ge/ge_local_engine/engine/host_cpu_engine.cc b/ge/ge_local_engine/engine/host_cpu_engine.cc
index f1e152f4..06dc2b96 100755
--- a/ge/ge_local_engine/engine/host_cpu_engine.cc
+++ b/ge/ge_local_engine/engine/host_cpu_engine.cc
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 #include "host_cpu_engine.h"
-#include <dlfcn.h>
 #include "graph/common/omg_util.h"
 #include "graph/utils/op_desc_utils.h"
 #include "graph/utils/tensor_adapter.h"
@@ -31,35 +30,21 @@ namespace {
   case (DTYPE): {                                                                                                      \
     GeTensorPtr ge_tensor = nullptr;                                                                                   \
     if (need_create_flag) {                                                                                            \
-      GELOGI("node:%s allocate output %zu start, size=%lld", op_desc->GetName().c_str(), i, data_num * sizeof(TYPE));  \
-      std::unique_ptr<TYPE[]> buf(new (std::nothrow) TYPE[data_num]());                                                \
-      if (buf == nullptr) {                                                                                            \
-        GELOGE(MEMALLOC_FAILED, "New sizeof(T) * data_num(%zu) memory failed",                                         \
-               static_cast<size_t>(sizeof(TYPE) * data_num));                                                          \
-        return MEMALLOC_FAILED;                                                                                        \
-      }                                                                                                                \
-      ge_tensor = MakeShared<GeTensor>(out_desc);                                                                      \
+      uint64_t size = data_num * sizeof(TYPE);                                                                         \
+      ge_tensor = MakeShared<GeTensor>(out_desc, size);                                                                \
       GE_CHECK_NOTNULL(ge_tensor);                                                                                     \
-      GELOGI("node:%s allocate output %zu success, size=%lld", op_desc->GetName().c_str(), i, data_num * sizeof(TYPE));\
-      if (ge_tensor->SetData(reinterpret_cast<uint8_t *>(buf.get()), data_num * sizeof(TYPE)) != GRAPH_SUCCESS) {      \
-        GELOGE(MEMALLOC_FAILED, "Set data for output %zu of node %s failed.", i, op_desc->GetName().c_str());          \
-        return MEMALLOC_FAILED;                                                                                        \
-      }                                                                                                                \
+      GELOGD("node:%s allocate output %zu success, size=%lld", op_desc->GetName().c_str(), i, size);                   \
       ge_tensor->MutableTensorDesc().SetDataType(out_desc.GetDataType());                                              \
       ge_tensor->MutableTensorDesc().SetShape(out_desc.GetShape());                                                    \
-      outputs.emplace_back(ge_tensor);                                                                                 \
     } else {                                                                                                           \
       ge_tensor = outputs[i];                                                                                          \
       GE_CHECK_NOTNULL(ge_tensor);                                                                                     \
-      GELOGI("node:%s existed output %zu, addr=%p, size=%lld", op_desc->GetName().c_str(), i,                          \
-             reinterpret_cast<const uint8_t *>(ge_tensor->GetData().data()), ge_tensor->GetData().size());             \
+      GELOGD("node:%s existed output %zu", op_desc->GetName().c_str(), i);                                             \
     }                                                                                                                  \
     auto tensor = TensorAdapter::AsTensor(*ge_tensor);                                                                 \
     auto tensor_name = op_desc->GetOutputNameByIndex(i);                                                               \
     GE_RETURN_WITH_LOG_IF_TRUE(tensor_name.empty(), "Failed to get output name. node = %s, index = %zu",               \
                                op_desc->GetName().c_str(), i);                                                         \
-    GELOGD("Successfully inserted output tensor. node = %s, index = %zu, output name = %s, addr = %p, size = %zu",     \
-           op_desc->GetName().c_str(), i, tensor_name.c_str(), tensor.GetData(), tensor.GetSize());                    \
     named_outputs.emplace(tensor_name, tensor);                                                                        \
     break;                                                                                                             \
   }
@@ -96,8 +81,8 @@ Status GetDataNumber(const GeTensorDesc &out_desc, uint64_t &data_num) {
 
 void HostCpuEngine::CloseSo() {
   for (auto handle : lib_handles_) {
-    if (dlclose(handle) != 0) {
-      GELOGW("failed to close handle, message: %s", dlerror());
+    if (mmDlclose(handle) != 0) {
+      GELOGW("failed to close handle, message: %s", mmDlerror());
     }
   }
   lib_handles_.clear();
@@ -236,16 +221,30 @@ Status HostCpuEngine::Run(NodePtr &node, const vector<ConstGeTensorPtr> &inputs,
   GELOGD("Run node by host cpu engine. node name = %s", node->GetName().c_str());
   std::unique_ptr<HostCpuOp> op_kernel;
   GE_CHK_STATUS_RET_NOLOG(FindOpKernel(node, op_kernel));
-
   std::map<std::string, const Tensor> named_inputs;
-  std::vector<GeTensorPtr> tmp_outputs;
-  tmp_outputs.swap(outputs);
   std::map<std::string, Tensor> named_outputs;
   auto op_desc = node->GetOpDesc();
   GE_CHK_STATUS_RET_NOLOG(PrepareInputs(op_desc, inputs, named_inputs));
-  GE_CHK_STATUS_RET_NOLOG(PrepareOutputs(op_desc, tmp_outputs, named_outputs));
+  GE_CHK_STATUS_RET_NOLOG(PrepareOutputs(op_desc, outputs, named_outputs));
   GE_CHK_STATUS_RET_NOLOG(RunInternal(op_desc, *op_kernel, named_inputs, named_outputs));
 
+  std::vector<GeTensorPtr> tmp_outputs;
+  for (size_t i = 0; i < op_desc->GetOutputsSize(); i++) {
+    auto tensor_name = op_desc->GetOutputNameByIndex(i);
+    if (tensor_name.empty()) {
+      GELOGE(INTERNAL_ERROR, "Failed to get output name. node = %s, index = %zu", op_desc->GetName().c_str(), i);
+      return INTERNAL_ERROR;
+    }
+    auto iter = named_outputs.find(tensor_name);
+    if (iter == named_outputs.end()) {
+       GELOGE(INTERNAL_ERROR, "Failed to get output tensor. node = %s, index = %zu, tensor_name = %s",
+              op_desc->GetName().c_str(), i, tensor_name.c_str());
+      return INTERNAL_ERROR;
+    }
+    auto ge_tensor = MakeShared<GeTensor>(TensorAdapter::AsGeTensor(iter->second));
+    GE_CHECK_NOTNULL(ge_tensor);
+    tmp_outputs.emplace_back(ge_tensor);
+  }
   GELOGD("Run node by host cpu engine successfully. name node = %s", node->GetName().c_str());
   outputs.swap(tmp_outputs);
   return SUCCESS;
@@ -323,13 +322,13 @@ Status HostCpuEngine::LoadLibs(std::vector<std::string> &lib_paths) {
 
 Status HostCpuEngine::LoadLib(const std::string &lib_path) {
   GELOGI("To invoke dlopen on lib: %s", lib_path.c_str());
-  auto handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL);
+  auto handle = mmDlopen(lib_path.c_str(), MMPA_RTLD_NOW | MMPA_RTLD_GLOBAL);
   if (handle == nullptr) {
-    GELOGE(INTERNAL_ERROR, "Failed to invoke dlopen. path = %s, error = %s", lib_path.c_str(), dlerror());
+    GELOGE(INTERNAL_ERROR, "Failed to invoke dlopen. path = %s, error = %s", lib_path.c_str(), mmDlerror());
     return INTERNAL_ERROR;
   }
 
-  auto initialize = (Status (*)(const HostCpuContext &))dlsym(handle, "Initialize");
+  auto initialize = (Status (*)(const HostCpuContext &))mmDlsym(handle, "Initialize");
   if (initialize != nullptr) {
     GELOGI("Invoke function Initialize in lib: %s", lib_path.c_str());
     if (initialize(HostCpuContext()) != SUCCESS) {
diff --git a/ge/ge_runner.mk b/ge/ge_runner.mk
index a2679ed1..460d5068 100644
--- a/ge/ge_runner.mk
+++ b/ge/ge_runner.mk
@@ -29,6 +29,8 @@ LIBGE_LOCAL_SRC_FILES := \
     common/dump/dump_manager.cc \
     common/dump/dump_properties.cc \
     common/dump/dump_op.cc \
+    common/profiling/ge_profiling.cc \
+    common/profiling/ge_runner_profiling.cc \
     engine_manager/dnnengine_manager.cc \
     ge_local_engine/engine/host_cpu_engine.cc \
     generator/ge_generator.cc \
@@ -92,6 +94,7 @@ LIBGE_LOCAL_SRC_FILES := \
     graph/manager/graph_var_manager.cc \
     graph/manager/host_mem_manager.cc \
     graph/manager/rdma_pool_allocator.cc \
+    graph/manager/host_mem_allocator.cc \
     graph/manager/memory_api.cc \
     graph/manager/model_manager/event_manager.cc        \
     graph/manager/trans_var_data_utils.cc \
@@ -111,6 +114,7 @@ LIBGE_LOCAL_SRC_FILES := \
     graph/passes/atomic_addr_clean_pass.cc \
     graph/passes/mark_same_addr_pass.cc \
     graph/passes/mark_graph_unknown_status_pass.cc \
+    graph/passes/dynamic_single_op_reset_shape_pass.cc \
     graph/passes/mark_agnostic_pass.cc \
     graph/partition/dynamic_shape_partition.cc \
     graph/partition/stage_partition.cc \
@@ -123,13 +127,17 @@ LIBGE_LOCAL_SRC_FILES := \
     graph/passes/compile_nodes_pass.cc \
     graph/passes/constant_folding_pass.cc \
     graph/passes/constant_fuse_same_pass.cc \
+    graph/passes/fuse_data_nodes_with_common_input_pass.cc \
+    graph/passes/remove_same_const_pass.cc \
+    graph/passes/useless_control_out_remove_pass.cc \
     graph/passes/control_trigger_pass.cc \
     graph/passes/dimension_adjust_pass.cc \
     graph/passes/dimension_compute_pass.cc \
     graph/passes/dropout_pass.cc \
     graph/passes/hccl_group_pass.cc \
     graph/passes/enter_pass.cc \
-    graph/passes/assign_pass.cc \
+    graph/passes/assign_remove_pass.cc \
+    graph/passes/inplace_support_check_pass.cc \
     graph/passes/flow_ctrl_pass.cc \
     graph/passes/global_step_insert_pass.cc \
     host_kernels/transpose_kernel.cc \
@@ -170,6 +178,7 @@ LIBGE_LOCAL_SRC_FILES := \
     host_kernels/sub_kernel.cc \
     host_kernels/transdata_kernel.cc \
     host_kernels/unpack_kernel.cc \
+    host_kernels/reformat_kernel.cc \
     graph/passes/folding_pass.cc \
     graph/passes/get_original_format_pass.cc \
     graph/passes/guarantee_const_pass.cc \
@@ -306,7 +315,6 @@ LIBGE_LOCAL_SRC_FILES := \
 LIBCLIENT_LOCAL_SRC_FILES := \
     proto/ge_api.proto \
     client/ge_api.cc \
-    client/ge_prof.cc \
 
 RUNNER_LOCAL_C_INCLUDES := \
     $(LOCAL_PATH) ./ \
@@ -371,7 +379,7 @@ LOCAL_SRC_FILES += $(LIBCLIENT_LOCAL_SRC_FILES)
 
 LOCAL_STATIC_LIBRARIES := libge_memory \
                           libadump_server \
-                          libmsprofiler \
+                          libmsprofiler_fwk \
                           libmmpa \
 
 LOCAL_SHARED_LIBRARIES := \
@@ -381,7 +389,6 @@ LOCAL_SHARED_LIBRARIES := \
     libgraph \
     libregister \
     libge_common \
-    libmsprof \
     liberror_manager \
 
 LOCAL_LDFLAGS := -lrt -ldl
@@ -408,7 +415,6 @@ endif
 LOCAL_C_INCLUDES := $(RUNNER_LOCAL_C_INCLUDES)
 
 LOCAL_SRC_FILES := ../../out/ge/lib64/stub/ge_api.cc \
-                   ../../out/ge/lib64/stub/ge_prof.cc \
                    ../../out/ge/lib64/stub/ge_ir_build.cc \
 
 LOCAL_SHARED_LIBRARIES :=
@@ -464,7 +470,6 @@ LOCAL_SHARED_LIBRARIES := \
     libc_sec \
     libslog \
     libmmpa \
-    libmsprof \
 
 LOCAL_LDFLAGS := -lrt -ldl
 
@@ -497,7 +502,6 @@ LOCAL_SHARED_LIBRARIES := \
     libc_sec \
     libslog \
     libmmpa \
-    libmsprof \
 
 LOCAL_LDFLAGS := -lrt -ldl
 
diff --git a/ge/ge_runtime/CMakeLists.txt b/ge/ge_runtime/CMakeLists.txt
index ce1b89ea..56b5ab41 100644
--- a/ge/ge_runtime/CMakeLists.txt
+++ b/ge/ge_runtime/CMakeLists.txt
@@ -23,10 +23,13 @@ add_library(ge_runtime SHARED ${GE_SRC_LIST})
 target_compile_options(ge_runtime PRIVATE
     -Werror
     -O2
+    -Wno-deprecated-declarations
+    -fno-common
 )
 
 target_compile_definitions(ge_runtime PRIVATE 
     PROTOBUF_INLINE_NOT_IN_HEADERS=0
+    LOG_CPP
 )
 
 target_include_directories(ge_runtime PRIVATE
diff --git a/ge/ge_runtime/runtime_model.cc b/ge/ge_runtime/runtime_model.cc
index fb0f3e85..b30ca1bf 100644
--- a/ge/ge_runtime/runtime_model.cc
+++ b/ge/ge_runtime/runtime_model.cc
@@ -28,15 +28,16 @@
 
 namespace ge {
 namespace model_runner {
+const int kOffsetUnit = 8;
 RuntimeModel::~RuntimeModel() {
   GELOGI("RuntimeModel destructor start");
 
-  // Release task first, hccl task hold stream
-  task_list_.clear();
-
   // Unbind rtModel from all task related streams
   RtModelUnbindStream();
 
+  // Release task first, hccl task hold stream
+  task_list_.clear();
+
   // Release all task related streams
   RtStreamDestory();
 
@@ -495,7 +496,7 @@ bool RuntimeModel::InitConstantInfo(std::shared_ptr<DavinciModel> &davinci_model
         return false;
       }
       uint64_t *buff = reinterpret_cast<uint64_t *>(const_cast<char *>(constant->weight_data.data()));
-      int64_t offset = elem_num * 8;
+      int64_t offset = elem_num * kOffsetUnit;
       uintptr_t hbm_raw_data_base_addr = reinterpret_cast<uintptr_t>(constant->output_addrs[0]) + offset;
       for (int64_t i = elem_num - 1; i >= 0; --i) {
         buff[i] = hbm_raw_data_base_addr + (buff[i] - buff[0]);
diff --git a/ge/generator/ge_generator.cc b/ge/generator/ge_generator.cc
index 16d63f6b..d032965b 100644
--- a/ge/generator/ge_generator.cc
+++ b/ge/generator/ge_generator.cc
@@ -47,6 +47,8 @@ const char *const kEngineNameDefault = "default";
 const char *const kVectorEngine = "VectorEngine";
 const char *const kAIcoreEngine = "AIcoreEngine";
 const char *const kFileNameSuffix = "online";
+const size_t kDynamicDimSize = 1;
+const int64_t kDynamicDimValue = -2;
 
 std::map<ge::OpEngineType, std::string> engine_type_map{
     {ge::ENGINE_SYS, kEngineNameDefault}, {ge::ENGINE_AICORE, kAIcoreEngine}, {ge::ENGINE_VECTOR, kVectorEngine}};
@@ -156,7 +158,12 @@ static Status AddInputs(const ComputeGraphPtr &graph, const NodePtr &node, GeTen
   }
 
   string op_type;
-  if (!AttrUtils::GetStr(tensor, kAttrOpType, op_type) || op_type.empty()) {
+  bool is_const = false;
+  (void)AttrUtils::GetBool(tensor, CONST_ATTR_NAME_INPUT, is_const);
+  if (is_const) {
+    GELOGD("Get input[%d] is const", index);
+    op_type = CONSTANTOP;
+  } else if (!AttrUtils::GetStr(tensor, kAttrOpType, op_type) || op_type.empty()) {
     op_type = DATA;
   }
 
@@ -165,6 +172,18 @@ static Status AddInputs(const ComputeGraphPtr &graph, const NodePtr &node, GeTen
   if (data_op == nullptr) {
     return FAILED;
   }
+  if (is_const) {
+    ConstGeTensorPtr tensor_value;
+    if (!AttrUtils::GetTensor(tensor, ge::ATTR_NAME_WEIGHTS, tensor_value)) {
+      GELOGE(FAILED, "Get value failed, node name:%s.", tensor.GetName().c_str());
+      return FAILED;
+    }
+    if (!AttrUtils::SetTensor(data_op, ge::ATTR_NAME_WEIGHTS, tensor_value)) {
+      GELOGE(FAILED, "Set attr ATTR_NAME_WEIGHTS fail.");
+      return FAILED;
+    }
+  }
+
   (void)AttrUtils::SetBool(data_op, "_is_single_op", true);
 
   GE_CHK_BOOL_EXEC(data_op->AddInputDesc(tensor) == GRAPH_SUCCESS, return FAILED, "Add input desc fail.");
@@ -231,6 +250,61 @@ static void GetOpsProtoPath(string &opsproto_path) {
   opsproto_path = (path_base + "ops/op_proto/custom/" + ":") + (path_base + "ops/op_proto/built-in/");
 }
 
+static Status CheckShapeReset(const OpDescPtr &op_desc, bool &change_shape_flag) {
+  GE_CHECK_NOTNULL_EXEC(op_desc, return PARAM_INVALID);
+  change_shape_flag = false;
+  for (size_t i = 0; i < op_desc->GetAllInputsDesc().size(); i++) {
+    auto input_desc = op_desc->MutableInputDesc(static_cast<uint32_t>(i));
+    GE_CHECK_NOTNULL(input_desc);
+    // pass scalar input desc
+    auto dims = input_desc->GetShape().GetDims();
+    if (dims.size() == kDynamicDimSize && dims[0] == kDynamicDimValue) {
+      change_shape_flag = true;
+    }
+  }
+  for (size_t i = 0; i < op_desc->GetAllOutputsDesc().size(); i++) {
+    auto output_desc = op_desc->MutableOutputDesc(static_cast<uint32_t>(i));
+    GE_CHECK_NOTNULL(output_desc);
+    // pass scalar output desc
+    auto dims = output_desc->GetShape().GetDims();
+    if (dims.size() == kDynamicDimSize && dims[0] == kDynamicDimValue) {
+      change_shape_flag = true;
+    }
+  }
+  return SUCCESS;
+}
+
+static Status ResetTensorVecShape(const vector<GeTensor> &inputs, vector<GeTensor> &inputs_dynamic) {
+  for (auto input : inputs) {
+    auto input_desc = input.GetTensorDesc();
+    GeShape shape_ori = input_desc.GetShape();
+
+    std::vector<int64_t> dynamic_shape_dims = {kDynamicDimValue};
+    GeShape dynamic_shape(dynamic_shape_dims);
+    std::vector<std::pair<int64_t, int64_t>> dynamic_shape_range;
+
+    ge::GeTensor inputTensor;
+    ge::GeTensorDesc desc(input_desc);
+
+    bool is_const = false;
+    (void)AttrUtils::GetBool(input_desc, CONST_ATTR_NAME_INPUT, is_const);
+    if (!is_const && shape_ori.GetDims().size() > 0) {
+      int64_t storage_format = FORMAT_NCHW;
+      if (ge::AttrUtils::GetInt(desc, ge::ATTR_NAME_STORAGE_FORMAT, storage_format) &&
+          !ge::AttrUtils::SetListInt(desc, ge::ATTR_NAME_STORAGE_SHAPE, dynamic_shape_dims)) {
+        GELOGE(FAILED, "Set attr ATTR_NAME_STORAGE_SHAPE fail.");
+        return FAILED;
+      }
+      desc.SetShape(dynamic_shape);
+      desc.SetShapeRange(dynamic_shape_range);
+    }
+
+    inputTensor.SetTensorDesc(desc);
+    inputs_dynamic.push_back(inputTensor);
+  }
+  return SUCCESS;
+}
+
 class GeGenerator::Impl {
  public:
   Impl(OmgContext &omg_context) : omg_context_(omg_context) {}
@@ -240,6 +314,8 @@ class GeGenerator::Impl {
 
   Status SaveModel(const string &file_name_prefix, GeModelPtr &models, ModelBufferData &model);
 
+  Status SaveRootModel(const string &file_name_prefix, GeRootModelPtr &model, ModelBufferData &model_buff);
+
   Status SaveParams(GeModelPtr &ge_model, const string &type, const map<string, GeAttrValue> &attrs,
                     const vector<GeTensor> &inputs, const vector<GeTensor> &outputs);
 
@@ -260,6 +336,7 @@ class GeGenerator::Impl {
   bool GetVersionFromPath(const std::string &file_path, std::string &version);
   bool SetAtcVersionInfo(AttrHolder &obj);
   bool SetOppVersionInfo(AttrHolder &obj);
+  bool SetOmSystemInfo(AttrHolder &obj);
 };
 
 Status GeGenerator::Initialize(const map<string, string> &options) {
@@ -470,6 +547,30 @@ bool GeGenerator::Impl::SetOppVersionInfo(AttrHolder &obj) {
   return true;
 }
 
+bool GeGenerator::Impl::SetOmSystemInfo(AttrHolder &obj) {
+  std::string soc_version;
+  (void)ge::GetContext().GetOption(ge::SOC_VERSION, soc_version);
+  GELOGI("SetOmSystemInfo soc_version: %s", soc_version.c_str());
+  if (!ge::AttrUtils::SetStr(obj, "soc_version", soc_version)) {
+    GELOGW("SetStr of soc_version failed.");
+    return false;
+  }
+
+  std::string framework_type;
+  (void)ge::GetContext().GetOption(ge::FRAMEWORK_TYPE, framework_type);
+  GELOGI("SetOmSystemInfo framework_type: %s", framework_type.c_str());
+  auto iter = ge::kFwkTypeToStr.find(framework_type);
+  if (iter == ge::kFwkTypeToStr.end()) {
+    GELOGW("Can not find framework_type in the map.");
+    return false;
+  }
+  if (!ge::AttrUtils::SetStr(obj, "framework_type", iter->second)) {
+    GELOGW("SetStr of framework_type failed.");
+    return false;
+  }
+  return true;
+}
+
 Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_prefix, const vector<GeTensor> &inputs,
                                   ModelBufferData &model, bool is_offline) {
   rtContext_t ctx = nullptr;
@@ -507,17 +608,18 @@ Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_pr
   GE_CHECK_NOTNULL(ge_root_model->GetRootGraph());
   ModelHelper model_helper;
   string model_name = "";
-  Status name_ret = model_helper.GetModelNameFromMergedGraphName(ge_root_model->GetRootGraph()->GetName(), model_name);
+  Status name_ret = model_helper.GetModelNameFromMergedGraphName(ge_root_model->GetRootGraph()->GetName(),
+                                                                 model_name);
   if (name_ret != SUCCESS) {
     ErrorManager::GetInstance().ATCReportErrMessage("E10000", {"parameter"}, {"output"});
-    GELOGE(FAILED, "Get model_name failed. Param --output is invalid");
+    GELOGE(FAILED, "Get model_name failed. Param --output is invalid.");
     return PARAM_INVALID;
   }
   map<string, GeModelPtr> name_to_ge_model = ge_root_model->GetSubgraphInstanceNameToModel();
   GeModelPtr &ge_model = name_to_ge_model[ge_root_model->GetRootGraph()->GetName()];
-  GE_RETURN_WITH_LOG_IF_FALSE(ge_model != nullptr, "ge_model can not be null");
+  GE_RETURN_WITH_LOG_IF_FALSE(ge_model != nullptr, "ge_model cannot be null");
   ge_model->SetName(model_name);
-  ret = impl_->SaveModel(file_name_prefix, ge_model, model);
+  ret = impl_->SaveRootModel(file_name_prefix, ge_root_model, model);
   if (ret != SUCCESS) {
     GELOGE(ret, "Save model failed");
     if (impl_->graph_manager_.Finalize() != SUCCESS) {
@@ -567,6 +669,9 @@ Status GeGenerator::CheckForSingleOp(OpDescPtr &op_desc, const vector<GeTensor>
 Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector<GeTensor> &inputs, const vector<GeTensor> &outputs,
                                   const string &model_file_name, OpEngineType engine_type, ModelBufferData &model_buff,
                                   bool is_offline) {
+  if (!is_offline) {
+    (void)AttrUtils::SetBool(op_desc, ATTR_DYNAMIC_SHAPE_SINGLE_AICPU, true);
+  }
 
   if (CheckForSingleOp(op_desc, inputs, outputs) != SUCCESS) {
     GELOGE(PARAM_INVALID, "input param is invalid when build single op!");
@@ -594,40 +699,11 @@ Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector<GeTensor> &in
 
   // 2. Create ComputeGraph.
   string name = ge::CurrentTimeInStr() + "_" + model_file_name;
-  ge::ComputeGraphPtr compute_graph = MakeShared<ComputeGraph>(name);
-  GE_CHECK_NOTNULL_EXEC(compute_graph, return INTERNAL_ERROR);
-
-  // 3. Add Node to ComputeGraph.
-  NodePtr op_node = compute_graph->AddNode(op_desc);
-  GE_CHECK_NOTNULL_EXEC(op_node, return INTERNAL_ERROR);
-
-  // 4. Create InputData node.
-  int32_t arg_index = 0;
-  if (inputs.empty()) {
-    for (const auto &input_desc : op_desc->GetAllInputsDescPtr()) {
-      GE_CHECK_NOTNULL_EXEC(input_desc, return INTERNAL_ERROR);
-      if (!IsNeedConnectInputOpForSingleOp(*input_desc)) {
-        continue;
-      }
-      GE_CHK_STATUS_RET_NOLOG(AddInputs(compute_graph, op_node, *input_desc, arg_index, false));
-      arg_index++;
-    }
-  } else {
-    for (const auto &in_desc : inputs) {
-      GeTensorDesc input_desc = in_desc.GetTensorDesc();
-      GE_CHK_STATUS_RET_NOLOG(AddInputs(compute_graph, op_node, input_desc, arg_index, true));
-      arg_index++;
-    }
-  }
-
-  // 5. Create Output node.
-  if (!outputs.empty()) {
-    GE_CHK_STATUS_RET_NOLOG(AddOutputs(compute_graph, op_node, outputs));
+  Graph graph;
+  if (BuildSingleOpGraph(op_desc, inputs, outputs, name, graph) != ge::SUCCESS) {
+    GELOGE(GRAPH_FAILED, "make graph fail.");
+    return GRAPH_FAILED;
   }
-
-  // dump ComputeGraph.
-  compute_graph->Dump();
-  Graph graph = ge::GraphUtils::CreateGraphFromComputeGraph(compute_graph);
   GELOGI("ATC parser success in single op build.");
 
   GeRootModelPtr ge_root_model = nullptr;
@@ -644,7 +720,18 @@ Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector<GeTensor> &in
   }
   GeModelPtr &ge_model = name_to_ge_model.begin()->second;
   GELOGD("The opType in op_desc_tmp is [%s]", op_desc_tmp->GetType().c_str());
-  GE_CHK_STATUS_RET_NOLOG(impl_->SaveParams(ge_model, op_desc_tmp->GetType(), op_attrs, inputs, outputs));
+
+  bool dynamic_flag = false;
+  if (CheckShapeReset(op_desc, dynamic_flag) == SUCCESS && dynamic_flag) {
+    vector<GeTensor> inputs_dynamic;
+    vector<GeTensor> outputs_dynamic;
+    GE_CHK_STATUS_RET_NOLOG(ResetTensorVecShape(inputs, inputs_dynamic));
+    GE_CHK_STATUS_RET_NOLOG(ResetTensorVecShape(outputs, outputs_dynamic));
+    GE_CHK_STATUS_RET_NOLOG(
+      impl_->SaveParams(ge_model, op_desc_tmp->GetType(), op_attrs, inputs_dynamic, outputs_dynamic));
+  } else {
+    GE_CHK_STATUS_RET_NOLOG(impl_->SaveParams(ge_model, op_desc_tmp->GetType(), op_attrs, inputs, outputs));
+  }
   GE_CHK_STATUS_RET_NOLOG(impl_->SaveModel(model_file_name, ge_model, model_buff));
   return SUCCESS;
 }
@@ -683,6 +770,46 @@ Status GeGenerator::BuildSingleOpModel(OpDescPtr &op_desc, const vector<GeTensor
   return BuildSingleOp(op_desc, inputs, outputs, kFileNameSuffix, engine_type, model_buff, false);
 }
 
+Status GeGenerator::BuildSingleOpGraph(OpDescPtr &op_desc, const vector<GeTensor> &inputs,
+                                       const vector<GeTensor> &outputs, std::string graph_name, Graph &graph) {
+  ge::ComputeGraphPtr compute_graph = MakeShared<ComputeGraph>(graph_name);
+  GE_CHECK_NOTNULL_EXEC(compute_graph, return INTERNAL_ERROR);
+
+  // 1. Add Node to ComputeGraph.
+  NodePtr op_node = compute_graph->AddNode(op_desc);
+  GE_CHECK_NOTNULL_EXEC(op_node, return INTERNAL_ERROR);
+
+  // 2. Create InputData node.
+  int32_t arg_index = 0;
+  if (inputs.empty()) {
+    for (const auto &input_desc : op_desc->GetAllInputsDescPtr()) {
+      GE_CHECK_NOTNULL_EXEC(input_desc, return INTERNAL_ERROR);
+      if (!IsNeedConnectInputOpForSingleOp(*input_desc)) {
+        continue;
+      }
+      GE_CHK_STATUS_RET_NOLOG(AddInputs(compute_graph, op_node, *input_desc, arg_index, false));
+      arg_index++;
+    }
+  } else {
+    for (const auto &in_desc : inputs) {
+      GeTensorDesc input_desc = in_desc.GetTensorDesc();
+      GE_CHK_STATUS_RET_NOLOG(AddInputs(compute_graph, op_node, input_desc, arg_index, true));
+      arg_index++;
+    }
+  }
+
+  // 3. Create Output node.
+  if (!outputs.empty()) {
+    GE_CHK_STATUS_RET_NOLOG(AddOutputs(compute_graph, op_node, outputs));
+  }
+
+  // dump ComputeGraph node.
+  compute_graph->Dump();
+  graph = ge::GraphUtils::CreateGraphFromComputeGraph(compute_graph);
+
+  return SUCCESS;
+}
+
 Status GeGenerator::Impl::SaveParams(GeModelPtr &ge_model, const string &type, const map<string, GeAttrValue> &attrs,
                                      const vector<GeTensor> &inputs, const vector<GeTensor> &outputs) {
   GE_CHECK_NOTNULL_EXEC(ge_model, return PARAM_INVALID);
@@ -712,6 +839,47 @@ Status GeGenerator::Impl::SaveModel(const string &file_name_prefix, GeModelPtr &
   return SUCCESS;
 }
 
+Status GeGenerator::Impl::SaveRootModel(const string &file_name_prefix, GeRootModelPtr &ge_root_model,
+                                        ModelBufferData &model_buff) {
+  bool is_unknown_shape = false;
+  auto ret = ge_root_model->CheckIsUnknownShape(is_unknown_shape);
+  if (ret != SUCCESS) {
+    GELOGE(FAILED, "Check root model is unkonwn shape failed");
+    return FAILED;
+  }
+  GELOGD("begin save root model, cur model is unkonwn shape model ? : %d", is_unknown_shape);
+  GE_CHK_BOOL_EXEC(!ge_root_model->GetSubgraphInstanceNameToModel().empty(), return FAILED,
+                   "ge root model has no sub model")
+  GeModelPtr model_root = nullptr;
+  if (is_unknown_shape) {
+    model_root = make_shared<GeModel>();
+    model_root->SetGraph(GraphUtils::CreateGraphFromComputeGraph(ge_root_model->GetRootGraph()));
+    ge_root_model->SetSubgraphInstanceNameToModel(ge_root_model->GetRootGraph()->GetName(), model_root);
+    model_root->SetName(ge_root_model->GetRootGraph()->GetName());
+  } else {
+    model_root = ge_root_model->GetSubgraphInstanceNameToModel().begin()->second;
+  }
+  // set atc version
+  if (!SetAtcVersionInfo(*(model_root.get()))) {
+    GELOGW("SetPackageVersionInfo of atc failed!");
+  }
+  // set opp version
+  if (!SetOppVersionInfo(*(model_root.get()))) {
+    GELOGW("SetPackageVersionInfo of ops failed!");
+  }
+  if (!SetOmSystemInfo(*(model_root.get()))) {
+    GELOGW("SetOmsystemInfo failed!");
+  }
+  ModelHelper model_helper;
+  model_helper.SetSaveMode(is_offline_);
+  ret = model_helper.SaveToOmRootModel(ge_root_model, save_param_, file_name_prefix, model_buff, is_unknown_shape);
+  if (ret != SUCCESS) {
+    GELOGE(ret, "Save to om model failed");
+    return ret;
+  }
+  return SUCCESS;
+}
+
 Status GeGenerator::Impl::BuildModel(const Graph &graph, const vector<GeTensor> &inputs,
                                      GeRootModelPtr &ge_root_model) {
   static std::atomic<GraphId> atomic_graph_id(0);
diff --git a/ge/graph/build/graph_builder.cc b/ge/graph/build/graph_builder.cc
index bdb02b3a..ed77a7f1 100644
--- a/ge/graph/build/graph_builder.cc
+++ b/ge/graph/build/graph_builder.cc
@@ -15,6 +15,7 @@
  */
 
 #include "graph/build/graph_builder.h"
+#include "graph/build/memory/graph_mem_assigner.h"
 #include "common/ge/ge_util.h"
 #include "common/helper/model_helper.h"
 #include "graph/build/logical_stream_allocator.h"
@@ -200,7 +201,7 @@ Status GraphBuilder::Build(ComputeGraphPtr &comp_graph, std::vector<SubGraphInfo
   bool is_dynamic_shape = false;
   // To be compatible with the old process, do not verify the return value temporarily.
   (void)AttrUtils::GetBool(comp_graph, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, is_dynamic_shape);
-  if (is_dynamic_shape) {
+  if (is_dynamic_shape || comp_graph->GetGraphUnknownFlag()) {
     GE_CHK_STATUS_RET(
         BuildForDynamicShapeGraph(comp_graph, subgraph_ptr_list, ge_root_model_ptr, ge_model_ptr, session_id),
         "Build for dynamic shape graph failed.");
@@ -270,16 +271,78 @@ Status GraphBuilder::BuildForKnownShapeGraph(ComputeGraphPtr &comp_graph, std::v
   return SUCCESS;
 }
 
+Status GraphBuilder::SetConstantInputOffset(ComputeGraphPtr &comp_graph) {
+  for (auto &node : comp_graph->GetDirectNode()) {
+    GE_CHECK_NOTNULL(node);
+    auto op_desc = node->GetOpDesc();
+    GE_CHECK_NOTNULL(op_desc);
+    auto num_inputs = op_desc->GetInputsSize();
+    std::vector<int64_t> input_offsets(num_inputs, 0);
+    int valid_input_index = -1;
+    for (uint32_t i = 0; i < node->GetAllInDataAnchorsSize(); ++i) {
+      auto in_anchor = node->GetInDataAnchor(i);
+      auto peer_out_anchor = in_anchor->GetPeerOutAnchor();
+      if (peer_out_anchor == nullptr) {
+        continue;
+      }
+
+      ++valid_input_index;
+      auto peer_node = peer_out_anchor->GetOwnerNode();
+      if (peer_node == nullptr) {
+        continue;
+      }
+
+      if (peer_node->GetType() != CONSTANT) {
+        continue;
+      }
+
+      std::vector<GeTensorPtr> weights = OpDescUtils::MutableWeights(peer_node);
+      if (weights.empty()) {
+        GELOGE(FAILED, "weights size of node %s is empty", node->GetName().c_str());
+        return FAILED;
+      }
+      GeTensorPtr weight = weights[0];
+      GE_CHECK_NOTNULL(weight);
+      int64_t input_offset = 0;
+      (void) TensorUtils::GetDataOffset(weight->MutableTensorDesc(), input_offset);
+      // valid_input_index must smaller than num_inputs
+      input_offsets[valid_input_index] = input_offset;
+      GELOGD("[%s] input[%u] is const, offset = %ld", node->GetName().c_str(), valid_input_index, input_offset);
+    }
+
+    op_desc->SetInputOffset(input_offsets);
+    std::vector<int64_t> output_offsets(op_desc->GetOutputsSize(), 0);
+    op_desc->SetOutputOffset(output_offsets);
+  }
+  return SUCCESS;
+}
+
 Status GraphBuilder::BuildForUnknownShapeGraph(ComputeGraphPtr &comp_graph, GeModelPtr &ge_model_ptr,
                                                uint64_t session_id) {
   GELOGI("Begin to build unknown shape graph[%s].", comp_graph->GetName().c_str());
+  Graph2SubGraphInfoList subgraph_map;
+  ge::ModelBuilder builder(session_id, comp_graph, subgraph_map, stream_max_parallel_num_, hcom_parallel_, build_mode_);
+  GE_DUMP(comp_graph, "BeforePreBuildModel");
+  GE_TIMESTAMP_START(PreBuildModel);
+  GE_CHK_STATUS_RET(builder.PreBuildModel(), "Graph[%s] builder PreBuildModel() return fail.",
+                    comp_graph->GetName().c_str());
+  GE_TIMESTAMP_END(PreBuildModel, "GraphBuilder::PreBuildModel");
+  GE_DUMP(comp_graph, "AfterPreBuildModel");
+
   GE_TIMESTAMP_START(CalcOpParam);
   GE_CHK_STATUS_RET(CalcOpParam(comp_graph), "Graph[%s] builder CalcOpParam() return fail.",
                     comp_graph->GetName().c_str());
   GE_TIMESTAMP_END(CalcOpParam, "GraphBuilder::CalcOpParam");
   GE_DUMP(comp_graph, "AfterCalcOpParam");
-  Graph2SubGraphInfoList subgraph_map;
-  ge::ModelBuilder builder(session_id, comp_graph, subgraph_map, stream_max_parallel_num_, hcom_parallel_, build_mode_);
+
+  GE_TIMESTAMP_START(SetConstantInputOffset);
+  GE_CHK_STATUS_RET(SetConstantInputOffset(comp_graph),
+                    "Graph[%s] failed to set constant input offset.", comp_graph->GetName().c_str());
+  GE_TIMESTAMP_END(SetConstantInputOffset, "GraphBuilder::SetConstantInputOffset");
+  GE_TIMESTAMP_START(MergeWeights);
+  GE_CHK_STATUS_RET(builder.MergeWeights(), "Graph[%s] failed to merge weights.", comp_graph->GetName().c_str());
+  GE_TIMESTAMP_END(MergeWeights, "GraphBuilder::MergeWeights");
+
   ModelPtr model_ptr = MakeShared<ge::Model>();
   if (model_ptr == nullptr) {
     return MEMALLOC_FAILED;
@@ -349,7 +412,8 @@ static Status GenerateTaskForConstant(const std::shared_ptr<ComputeGraph> &graph
           GELOGD("Insert MemcpyAsync node between %s and %s.", in_node->GetName().c_str(), node->GetName().c_str());
           std::string name = node->GetName() + "_input_" + std::to_string(in_data_anchor->GetIdx()) + "_Memcpy";
           if (InsertMemcpyNode(graph, peer_out_anchor, {in_data_anchor}, name) != SUCCESS) {
-            GELOGE(FAILED, "Insert memcpy between %s and %s failed.", in_node->GetName().c_str(), node->GetName().c_str());
+            GELOGE(FAILED, "Insert memcpy between %s and %s failed.",
+                   in_node->GetName().c_str(), node->GetName().c_str());
             return FAILED;
           }
         }
@@ -359,6 +423,52 @@ static Status GenerateTaskForConstant(const std::shared_ptr<ComputeGraph> &graph
   return SUCCESS;
 }
 
+Status GraphBuilder::MarkFpBpProfilingTaskAttr(ComputeGraphPtr &com_graph) {
+  bool original_unknown_shape_flag = com_graph->GetGraphUnknownFlag();
+  com_graph->SetGraphUnknownFlag(false);
+
+  GELOGD("Start to mark profiling task attr for fp and bp.");
+  TaskGenerator task_generator;
+  ProfilingPoint profiling_point;
+  std::vector<uint32_t> all_reduce_node_index;
+  Status ret = task_generator.FindProfilingNodeIndex(com_graph, profiling_point, all_reduce_node_index);
+  com_graph->SetGraphUnknownFlag(original_unknown_shape_flag);
+  if (ret != SUCCESS) {
+    GELOGW("Find profiling node index failed.");
+  }
+  if (profiling_point.fp_index == 0 || profiling_point.bp_index == 0 || profiling_point.end_index.empty()) {
+    GELOGD("No need to mark fp bp profiling task attr.");
+    return SUCCESS;
+  }
+  // mark profiling task attr for node
+  uint32_t node_index = 0;
+  for (const auto &node : com_graph->GetAllNodes()) {
+    OpDescPtr op_desc = node->GetOpDesc();
+    GE_CHECK_NOTNULL(node->GetOpDesc());
+    node_index++;
+    if (profiling_point.fp_index == node_index) {
+       GELOGI("The first fp node of dynamic graph is %s, idx %u", op_desc->GetName().c_str(), node_index);
+      (void)ge::AttrUtils::SetBool(op_desc, ATTR_NAME_INSERT_FP_PROFILILNG_TASK, true);
+    }
+    if (profiling_point.bp_index == node_index) {
+      GELOGI("The bp node of dynamic graph is %s, idx %u", op_desc->GetName().c_str(), node_index);
+      (void)ge::AttrUtils::SetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, true);
+    }
+    for (size_t i = 0; i < all_reduce_node_index.size(); i++) {
+      if (all_reduce_node_index[i] == node_index) {
+        GELOGI("The all reduce node of dynamic graph is %s, idx %u", op_desc->GetName().c_str(), node_index);
+        (void)ge::AttrUtils::SetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, true);
+        continue;
+      }
+    }
+    if (profiling_point.end_index.find(node_index) != profiling_point.end_index.end()) {
+      GELOGI("The end node of dynamic graph is %s, idx %u", op_desc->GetName().c_str(), node_index);
+      (void)ge::AttrUtils::SetBool(op_desc, ATTR_NAME_INSERT_END_PROFILILNG_TASK, true);
+    }
+  }
+  return SUCCESS;
+}
+
 Status GraphBuilder::BuildForDynamicShapeGraph(ComputeGraphPtr &comp_graph,
                                                std::vector<SubGraphInfoPtr> &subgraph_ptr_list,
                                                GeRootModelPtr &ge_root_model_ptr, GeModelPtr &ge_model_ptr,
@@ -374,10 +484,21 @@ Status GraphBuilder::BuildForDynamicShapeGraph(ComputeGraphPtr &comp_graph,
                         op_desc->GetName().c_str());
     }
   }
-  //
-  for (auto &sub_graph : comp_graph->GetAllSubgraphs()) {
+
+  // Set fp bp profiling task attr for graph
+  if (MarkFpBpProfilingTaskAttr(comp_graph) != SUCCESS) {
+    GELOGE(FAILED, "Set fp bp profiling task attr for graph.");
+    return FAILED;
+  }
+
+  auto all_graphs = comp_graph->GetAllSubgraphs();
+  if (all_graphs.empty()) {
+    all_graphs.push_back(comp_graph);
+  }
+  for (auto &sub_graph : all_graphs) {
     // exclude functional subgraph in known subgraph
-    if (sub_graph->GetParentGraph() != comp_graph && !sub_graph->GetParentGraph()->GetGraphUnknownFlag()) {
+    if (sub_graph->GetParentGraph() != nullptr && sub_graph->GetParentGraph() != comp_graph &&
+        !sub_graph->GetParentGraph()->GetGraphUnknownFlag()) {
       continue;
     }
 
@@ -475,7 +596,7 @@ Status GraphBuilder::GetTaskInfo(const ge::ModelBuilder &builder, const ModelPtr
 }
 
 Status GraphBuilder::SetInputSize(const ge::NodePtr &node_ptr) {
-  // set input_desc.size = src_node.output_desc.size
+  // Set the size of input_desc to 'src_node.output_desc.size'
   if (node_ptr->GetType() == DATA) {
     bool is_unknown_shape = false;
     GE_CHK_STATUS_RET(ge::NodeUtils::GetNodeUnknownShapeStatus(*node_ptr, is_unknown_shape),
@@ -498,7 +619,7 @@ Status GraphBuilder::SetInputSize(const ge::NodePtr &node_ptr) {
     GE_IF_BOOL_EXEC(src_op == nullptr, continue);
     auto node_op_desc = node_ptr->GetOpDesc();
     GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue);
-    // set dst_node.input_desc = src_node.output_desc
+    // Set the input_desc of dst_node to 'src_node.output_desc'
     auto output_desc = src_op->GetOutputDescPtr(peer_out_anchor->GetIdx());
     int64_t size = 0;
     GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(*output_desc, size) != SUCCESS, GELOGI("Get size failed!"));
@@ -512,7 +633,6 @@ Status GraphBuilder::SetInputSize(const ge::NodePtr &node_ptr) {
     auto input_desc = node_op_desc->MutableInputDesc(in_data_anchor->GetIdx());
     GE_CHECK_NOTNULL(input_desc);
     (void) ge::TensorUtils::SetSize(*input_desc, size);
-    GE_CHK_STATUS_RET(node_op_desc->UpdateInputDesc(in_data_anchor->GetIdx(), *input_desc));
     GELOGD("%s input desc, dim_size: %zu, mem_size: %ld, format: %s, type: %s.", node_ptr->GetName().c_str(),
            input_desc->GetShape().GetDimNum(), size, TypeUtils::FormatToSerialString(input_desc->GetFormat()).c_str(),
            TypeUtils::DataTypeToSerialString(input_desc->GetDataType()).c_str());
diff --git a/ge/graph/build/graph_builder.h b/ge/graph/build/graph_builder.h
index 329f3ebc..524b60e0 100644
--- a/ge/graph/build/graph_builder.h
+++ b/ge/graph/build/graph_builder.h
@@ -60,6 +60,7 @@ class GraphBuilder {
   Status UpdateParentNodeOutputSize(const ge::ComputeGraphPtr &graph, ge::NodePtr &parent_node_ptr);
   Status CalcDynShapeRootGraphDataSize(const ge::OpDescPtr &op_desc);
   Status SecondPartition(ge::ComputeGraphPtr &comp_graph, vector<ge::SubGraphInfoPtr> &subgraph_ptr_list);
+  Status MarkFpBpProfilingTaskAttr(ComputeGraphPtr &com_graph);
   Status BuildForDynamicShapeGraph(ComputeGraphPtr &comp_graph, std::vector<SubGraphInfoPtr> &subgraph_ptr_list,
                                    GeRootModelPtr &ge_root_model_ptr, GeModelPtr &ge_model_ptr,
                                    uint64_t session_id = INVALID_SESSION_ID);
@@ -67,6 +68,7 @@ class GraphBuilder {
                                  GeModelPtr &ge_model_ptr, uint64_t session_id = INVALID_SESSION_ID);
   Status BuildForUnknownShapeGraph(ComputeGraphPtr &comp_graph, GeModelPtr &ge_model_ptr,
                                    uint64_t session_id = INVALID_SESSION_ID);
+  Status SetConstantInputOffset(ComputeGraphPtr &comp_graph);
   Status AddOutputMemTypeForNode(const NodePtr &node);
   Status BuildForHostCpuGraph(ComputeGraphPtr &comp_graph, GeModelPtr &ge_model_ptr,
                               uint64_t session_id = INVALID_SESSION_ID);
diff --git a/ge/graph/build/memory/CMakeLists.txt b/ge/graph/build/memory/CMakeLists.txt
deleted file mode 100644
index bdd869a9..00000000
--- a/ge/graph/build/memory/CMakeLists.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-set(SRC_LIST
-    "memory_assigner.cc"
-    "graph_mem_assigner.cc"
-    "binary_block_mem_assigner.cc"
-    "block_mem_assigner.cc"
-    "hybrid_mem_assigner.cc"
-    "max_block_mem_assigner.cc"
-    "var_mem_assign_util.cc"
-)
-
-############ libge_memory.a ############
-add_library(ge_memory STATIC ${SRC_LIST})
-
-target_compile_options(ge_memory PRIVATE
-    -Werror
-    -O2
-)
-
-target_compile_definitions(ge_memory PRIVATE
-    google=ascend_private
-)
-
-target_link_libraries(ge_memory PRIVATE
-    $<BUILD_INTERFACE:intf_pub>
-    ascend_protobuf
-    c_sec
-)
-
-target_include_directories(ge_memory PRIVATE
-    ${CMAKE_CURRENT_LIST_DIR}
-    ${GE_CODE_DIR}/ge
-    ${GE_CODE_DIR}/inc
-    ${GE_CODE_DIR}/inc/external
-    ${METADEF_DIR}/inc
-    ${METADEF_DIR}/inc/external
-    ${METADEF_DIR}/inc/external/graph
-    ${GE_CODE_DIR}/inc/framework
-    #### yellow zone ####
-    ${GE_CODE_DIR}/../inc
-    #### blue zone ####
-    ${GE_CODE_DIR}/third_party/fwkacllib/inc
-)
diff --git a/ge/graph/build/memory/binary_block_mem_assigner.cc b/ge/graph/build/memory/binary_block_mem_assigner.cc
index ecd2488c..97a0aed6 100644
--- a/ge/graph/build/memory/binary_block_mem_assigner.cc
+++ b/ge/graph/build/memory/binary_block_mem_assigner.cc
@@ -21,8 +21,8 @@
 namespace {
 const uint32_t kRangeCeilInterval = 2;
 const uint32_t kLogBase = 2;
-const int64_t kLargeBlockSize = 8 * 1024 * 1024;
-const int64_t kLargeBlockRangeSize = 10;
+const int64_t kLargeBlockSize = 8388608;   // 8 * 1024 * 1024
+const int64_t kLargeBlockRangeSize = 2;
 }  // namespace
 
 namespace ge {
@@ -69,19 +69,21 @@ Status BinaryBlockMemAssigner::GetMemoryRanges(vector<int64_t> &range_ceils) {
     GELOGW("Vector all_memory_size is empty!");
     return SUCCESS;
   }
-  if ((all_memory_size.front() == 0) || (log(kLogBase) == 0)) {
-    GELOGE(FAILED, "dividend is 0!");
+  if ((all_memory_size.front() <= 0) || (log(kLogBase) == 0)) {
+    GELOGE(FAILED, "Memory size:%ld is invalid.", all_memory_size.front());
     return FAILED;
   }
+  // Memory size is 512 aligned, so it is not necessary to take less than 512
+  int64_t min_memory_size = (all_memory_size.back() > MEM_ALIGN_SIZE) ? MEM_ALIGN_SIZE : all_memory_size.front();
   auto range_number = static_cast<size_t>(
-    ceil(log(all_memory_size.back() / static_cast<double>(all_memory_size.front())) / log(kLogBase)));
+    ceil(log(all_memory_size.back() / static_cast<double>(min_memory_size)) / log(kLogBase)));
   range_number = (range_number == 0) ? 1 : range_number;
   GELOGD("Range number: %zu", range_number);
 
   vector<vector<int64_t>> ranges(range_number);
   GE_CHK_BOOL_EXEC((range_number != 0), return PARAM_INVALID, "range_number can't be 0.");
   size_t range_number_limit = all_memory_size.size() / range_number;
-  int64_t range_ceil = all_memory_size[0];
+  int64_t range_ceil = min_memory_size;
   for (size_t i = 1; i <= range_number; i++) {
     GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(static_cast<uint64_t>(range_ceil), kRangeCeilInterval),
                     GELOGE(FAILED, "Multiply result is out of range.");
@@ -114,7 +116,7 @@ Status BinaryBlockMemAssigner::GetMemoryRanges(vector<int64_t> &range_ceils) {
       range_ceils.push_back(range.back());
     }
   }
-  GELOGD("Range ceils: %s", ToString(range_ceils).c_str());
+  GELOGI("Range ceils: %s", ToString(range_ceils).c_str());
 
   return SUCCESS;
 }
diff --git a/ge/graph/build/memory/block_mem_assigner.cc b/ge/graph/build/memory/block_mem_assigner.cc
index 00f47573..21d6a49e 100755
--- a/ge/graph/build/memory/block_mem_assigner.cc
+++ b/ge/graph/build/memory/block_mem_assigner.cc
@@ -52,7 +52,6 @@ const char *const kAttrNameWorkspaceReuseFlag = "workspace_reuse_flag";
 const char *const kL2FusionDynamicConvergeOp = "l2fusion_dynamic_converge_op";
 const char *const kOpNoReuseMem = "no_reuse_mem_flag";
 const char *const OP_NO_REUSE_MEM = "OP_NO_REUSE_MEM";
-const int kReuseMaxCount = 10;
 const int kReuseMaxOpNum = 10;
 const int kReuseMaxCharNum = 2000;
 }  // namespace
@@ -65,6 +64,95 @@ void AlignMemOffset(size_t &mem_align_size) {
   mem_align_size = (mem_align_size + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE;
 }
 
+static bool CompareLifeTime(const NodeTypeIndex &left, const NodeTypeIndex &right) {
+  if (left.GetLifeBegin() < right.GetLifeBegin()) {
+    return true;
+  }
+  return false;
+}
+
+void GetLifeList(const MemoryBlock &block, std::vector<NodeTypeIndex> &life_list, bool child) {
+  for (auto &node : block.NodeTypeIndexList()) {
+    life_list.emplace_back(node);
+  }
+
+  if (child) {
+    for (auto child_block : block.ChildBlockList()) {
+      if (child_block == nullptr) {
+        continue;
+      }
+      if (block.stream_id_ != child_block->stream_id_ || !block.same_stream_ || !child_block->same_stream_) {
+        life_list.clear();
+        return;
+      }
+      GetLifeList(*child_block, life_list, child);
+    }
+  }
+}
+
+bool CrossLifeTime(const NodeTypeIndex &left, const NodeTypeIndex &right) {
+  if ((left.node == nullptr) || (right.node == nullptr)) {
+    return true;
+  }
+  auto left_node_op_desc = left.node->GetOpDesc();
+  auto right_node_op_desc = right.node->GetOpDesc();
+  if ((left_node_op_desc != nullptr) && (right_node_op_desc != nullptr)) {
+    if (left.GetLifeBegin() < right.GetLifeBegin()) {
+      if (left.life_time_end >= right.GetLifeBegin()) {
+        return true;
+      }
+    } else if (left.GetLifeBegin() == right.GetLifeBegin()) {
+      return true;
+    } else {
+      if (right.life_time_end >= left.GetLifeBegin()) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+///
+/// When child block's life time are not cross with parent block, they can be reused(only same stream).
+/// |-----------------------------parent block---------------------|
+/// |------child block1--------------||------child block2------|
+/// |--child block1-1-|
+///
+bool CanIntervalLifeReuse(MemoryBlock &parent_block, MemoryBlock &child_block) {
+  // judge by interval life time, only same stream can be judged by interval life time
+  if (parent_block.stream_id_ != child_block.stream_id_ || !parent_block.same_stream_ || !child_block.same_stream_
+      || parent_block.NodeTypeIndexList().empty() || child_block.NodeTypeIndexList().empty()) {
+    return false;
+  }
+
+  // quick judge by front and back node
+  if (CrossLifeTime(parent_block.NodeTypeIndexList().front(), child_block.NodeTypeIndexList().front())) {
+    return false;
+  }
+  if (CrossLifeTime(parent_block.NodeTypeIndexList().back(), child_block.NodeTypeIndexList().back())) {
+    return false;
+  }
+
+  std::vector<NodeTypeIndex> life_list;
+  GetLifeList(parent_block, life_list, false);
+  GetLifeList(child_block, life_list, true);
+  if (life_list.empty()) {
+    return false;
+  }
+  std::sort(life_list.begin(), life_list.end(), CompareLifeTime);
+  size_t pre_life_end = 0;
+  for (auto &node : life_list) {
+    auto node_op_desc = node.node->GetOpDesc();
+    if (node_op_desc != nullptr && pre_life_end >= static_cast<size_t>(node_op_desc->GetId())) {
+      // life time cross
+      return false;
+    }
+    pre_life_end = node.life_time_end;
+  }
+  GELOGI("Block size[%zu, %zu] life time are not cross.", parent_block.Size(), child_block.Size());
+  return true;
+}
+
 void MemoryBlock::SetHeadOffset(size_t offset) {
   head_offset_ = offset;
   size_t child_offset = head_offset_;
@@ -125,20 +213,12 @@ size_t MemoryBlock::AlignSize() const {
   return align_block_size;
 }
 
-bool MemoryBlock::IsSameLabel(std::string &first_batch_label) {
-  if (node_type_index_list_.empty()) {
+bool MemoryBlock::IsSameBatchLabel() {
+  // only same batch label can reuse
+  if (batch_label_.empty() || node_type_index_list_.empty()) {
     return false;
   }
 
-  auto node_op_desc = node_type_index_list_[0].node->GetOpDesc();
-  if (node_op_desc == nullptr) {
-    return false;
-  }
-  // not all op has ATTR_NAME_BATCH_LABEL, no need check return value, only check out parameter
-  (void)ge::AttrUtils::GetStr(node_op_desc, ATTR_NAME_BATCH_LABEL, first_batch_label);
-  if (first_batch_label.empty()) {
-    return false;
-  }
   bool all_same_label = true;
   for (size_t index = 1; index < node_type_index_list_.size(); ++index) {
     if (node_type_index_list_[index].node == nullptr) {
@@ -147,8 +227,9 @@ bool MemoryBlock::IsSameLabel(std::string &first_batch_label) {
     std::string batch_label;
     auto index_op_desc = node_type_index_list_[index].node->GetOpDesc();
     GE_IF_BOOL_EXEC(index_op_desc == nullptr, continue);
+    // not all op has ATTR_NAME_BATCH_LABEL, no need check return value, only check out parameter
     (void)ge::AttrUtils::GetStr(index_op_desc, ATTR_NAME_BATCH_LABEL, batch_label);
-    if (first_batch_label != batch_label) {
+    if (batch_label_ != batch_label) {
       all_same_label = false;
       break;
     }
@@ -197,7 +278,7 @@ void MemoryBlock::AddContinuousLifeReuseBlock(MemoryBlock *block, DependStreamLi
 }
 
 void MemoryBlock::AddLifeReuseBlock(MemoryBlock *block, DependStreamLife &total_node_depend_stream_life) {
-  if (CanNotLifeReuse(this) || CanNotLifeReuse(block)) {
+  if (CanNotLifeReuse(this) || CanNotLifeReuse(block) || (batch_label_ != block->batch_label_)) {
     return;
   }
   if (block->continuous_block_) {
@@ -207,16 +288,27 @@ void MemoryBlock::AddLifeReuseBlock(MemoryBlock *block, DependStreamLife &total_
   MemoryBlock *parent = nullptr;
   MemoryBlock *child = nullptr;
   // merge small block to large block
-  if (block->GetDependLifeBegin(stream_id_, total_node_depend_stream_life) > GetLifeEnd()) {
-    if ((child_offset_ + block->AlignSize()) <= AlignSize()) {
-      parent = this;
-      child = block;
-    } else if ((block->child_offset_ + AlignSize()) <= block->AlignSize()) {
-      parent = block;
-      child = this;
+  // noalign size         802816 + 802816 = 1605632       can reuse
+  // after 32 align size  802848 + 802848 > 1605664       can't reuse
+  // after 512 align size 803328 + 803328 > 1606144       can't reuse
+  // so                   803328 + 803328 = 1606144 + 512 can reuse
+  if ((child_offset_ + block->AlignSize()) <= (AlignSize() + MEM_ALIGN_SIZE)) {
+    parent = this;
+    child = block;
+  } else if ((block->child_offset_ + AlignSize()) <= (block->AlignSize() + MEM_ALIGN_SIZE)) {
+    parent = block;
+    child = this;
+  }
+
+  if ((parent != nullptr) && (child != nullptr)) {
+    // Different streams must use stream dependency to judge the life cycle
+    // In case same stream if it has child block, can judge all the child block's life time in CanIntervalLifeReuse
+    bool can_block_life_reuse = (child->child_blocks_.empty()
+        && (block->GetDependLifeBegin(stream_id_, total_node_depend_stream_life) > GetLifeEnd()));
+    if (!can_block_life_reuse && !CanIntervalLifeReuse(*parent, *child)) {
+      return;
     }
-  }
-  if ((parent != nullptr) && (child != nullptr) && child->child_blocks_.empty()) {
+
     parent->child_blocks_.emplace_back(child);
     parent->child_offset_ += child->AlignSize();
     child->deleted_block_ = true;
@@ -230,12 +322,7 @@ void MemoryBlock::AddLifeReuseBlock(MemoryBlock *block, DependStreamLife &total_
 size_t MemoryBlock::GetLifeBegin() {
   size_t life_time = 0;
   if (!node_type_index_list_.empty()) {
-    if (node_type_index_list_.front().node != nullptr) {
-      auto node_op_desc = node_type_index_list_.front().node->GetOpDesc();
-      if (node_op_desc != nullptr) {
-        life_time = node_op_desc->GetId();
-      }
-    }
+      life_time = node_type_index_list_.front().GetLifeBegin();
   }
   return life_time;
 }
@@ -261,6 +348,7 @@ size_t MemoryBlock::GetDependLifeBegin(int64_t stream_id, DependStreamLife &tota
 void AddDependLife(const ge::NodePtr &org_node, const ge::NodePtr &node, int64_t stream_id,
                    std::map<int64_t, size_t> &depend_stream_life, DependStreamLife &total_node_depend_stream_life) {
   GE_CHECK_NOTNULL_EXEC(node, return);
+  GE_CHECK_NOTNULL_EXEC(org_node, return);
   auto node_desc = node->GetOpDesc();
   GE_CHECK_NOTNULL_EXEC(node_desc, return);
   auto node_id = node_desc->GetId();
@@ -321,7 +409,7 @@ void MemoryBlock::AddDependLifeBegin(DependStreamLife &total_node_depend_stream_
   depend_stream_life_[stream_id_] = GetLifeBegin();
 }
 
-size_t MemoryBlock::GetLifeEnd() {
+size_t MemoryBlock::GetLifeEnd() const {
   if (!node_type_index_list_.empty()) {
     return node_type_index_list_.back().life_time_end;
   }
@@ -415,60 +503,98 @@ BlockMemAssigner::~BlockMemAssigner() {
   }
 }
 
+void GetMaxBatchAllMemorySize(std::map<std::string, vector<int64_t>> &batch_all_memory_size,
+                              std::map<std::string, int64_t> batch_total_size, vector<int64_t> &all_memory_size,
+                              std::string &max_batch_label) {
+  // use max batch all memory size for reuse range
+  int64_t max_batch_size = 0;
+  for (const auto &it : batch_total_size) {
+    GELOGI("Batch[%s] total memory size[%ld]", it.first.c_str(), it.second);
+    // no batch label
+    if (it.first.empty()) {
+      continue;
+    }
+    if (it.second > max_batch_size) {
+      max_batch_size = it.second;
+      max_batch_label = it.first;
+    }
+  }
+  GELOGI("Max batch[%s] total memory size[%ld]", max_batch_label.c_str(), max_batch_size);
+
+  for (const auto &it : batch_all_memory_size) {
+    if (it.first.empty() || (it.first == max_batch_label)) {
+      all_memory_size.insert(all_memory_size.end(), it.second.begin(), it.second.end());
+    }
+  }
+  // all_memory_size can't be empty
+  if (all_memory_size.empty()) {
+    all_memory_size.emplace_back(MEM_ALIGN_SIZE);
+  }
+  sort(all_memory_size.begin(), all_memory_size.end());
+  GELOGD("All memory size: %s", ToString(all_memory_size).c_str());
+
+  for (auto iter = all_memory_size.begin(); iter != all_memory_size.end();) {
+    if (*iter == 0) {
+      iter = all_memory_size.erase(iter);
+    } else {
+      ++iter;
+    }
+  }
+}
+
 void BlockMemAssigner::GetOutAndWorkSpaceMem(vector<int64_t> &all_memory_size) {
   vector<int64_t> temp;
+  std::map<std::string, vector<int64_t>> batch_all_memory_size;
+  std::map<std::string, int64_t> batch_total_size;
   for (const NodePtr &n : compute_graph_->GetAllNodes()) {
     auto node_op_desc = n->GetOpDesc();
     GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue);
 
+    if (CheckIsZeroMemNodeType(node_op_desc->GetType())) {
+      continue;
+    }
+
+    std::string batch_label;
+    (void)ge::AttrUtils::GetStr(node_op_desc, ATTR_NAME_BATCH_LABEL, batch_label);
+
     if (node_op_desc->GetType() == ATOMICADDRCLEAN) {
       atomic_addr_clean_id_ = node_op_desc->GetId();
     }
 
     for (auto &out_anchor : n->GetAllOutDataAnchors()) {
       GeTensorDesc output_desc = node_op_desc->GetOutputDesc(out_anchor->GetIdx());
-      bool reuse_input = false;
-      GE_IF_BOOL_EXEC(ge::TensorUtils::GetReuseInput(output_desc, reuse_input) != SUCCESS,
-                      GELOGI("Get reuse_input failed"));
-
-      if (!reuse_input) {
-        int64_t size = 0;
-        GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(output_desc, size) != SUCCESS, GELOGI("Get size failed"));
-        if (anchor_to_symbol_.empty()) {
-          all_memory_size.emplace_back(size);
-        } else {
-          auto iter1 = anchor_to_symbol_.find(NodeIndexIO(n, out_anchor->GetIdx(), kOut).ToString());
-          if (iter1 == anchor_to_symbol_.end()) {
-            continue;
-          }
-          const std::string &symbol = iter1->second;
-          auto iter2 = symbol_size_.find(symbol);
-          if (iter2 == symbol_size_.end()) {
-            symbol_size_[symbol] = size;
-          } else if (size > static_cast<int64_t>(iter2->second)) {
-            iter2->second = size;
-          }
+      int64_t size = 0;
+      GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(output_desc, size) != SUCCESS, GELOGI("Get size failed"));
+      GE_IF_BOOL_EXEC(size < 0, GELOGE(FAILED, "Node:%s size:%ld is invalid, maybe it is unknown shape node.",
+                                       node_op_desc->GetName().c_str(), size);
+                      return;);
+      batch_all_memory_size[batch_label].emplace_back(size);
+      if (batch_total_size.find(batch_label) == batch_total_size.end()) {
+        batch_total_size[batch_label] = size;
+      } else {
+        batch_total_size[batch_label] += size;
+      }
+
+      if (!anchor_to_symbol_.empty()) {
+        auto iter1 = anchor_to_symbol_.find(NodeIndexIO(n, out_anchor->GetIdx(), kOut).ToString());
+        if (iter1 == anchor_to_symbol_.end()) {
+          continue;
+        }
+        const std::string &symbol = iter1->second;
+        auto iter2 = symbol_size_.find(symbol);
+        if (iter2 == symbol_size_.end()) {
+          symbol_size_[symbol] = size;
+        } else if (size > static_cast<int64_t>(iter2->second)) {
+          iter2->second = size;
         }
       }
     }
     temp.clear();
-    GetNodeWorkSpaceSize(n, temp);
-    all_memory_size.insert(all_memory_size.end(), temp.begin(), temp.end());
-  }
-  for (const auto &pair : symbol_size_) {
-    all_memory_size.emplace_back(pair.second);
-  }
-  sort(all_memory_size.begin(), all_memory_size.end());
-  GELOGD("All memory size: %s", ToString(all_memory_size).c_str());
-
-  for (auto iter = all_memory_size.begin(); iter != all_memory_size.end();) {
-    if (*iter == 0) {
-      iter = all_memory_size.erase(iter);
-    } else {
-      ++iter;
-    }
+    GetNodeWorkSpaceSize(n, temp, batch_total_size[batch_label]);
+    batch_all_memory_size[batch_label].insert(batch_all_memory_size[batch_label].end(), temp.begin(), temp.end());
   }
-
+  GELOGI("The last atomic_addr_clean node id: %ld", atomic_addr_clean_id_);
+  GetMaxBatchAllMemorySize(batch_all_memory_size, batch_total_size, all_memory_size, max_batch_label_);
   InitReuseFlag();
   PrintSymbolMap();
 }
@@ -500,45 +626,17 @@ bool IsDirectOutputNode(const NodePtr &node, int idx) {
   return false;
 }
 
-void AddReusableBlockCount(const MemoryBlock &mem_block, map<string, uint64_t> &reusable_block_counts) {
-  string key = std::to_string(mem_block.Size());
-  key += "_" + std::to_string(mem_block.stream_id_);
-  key += "_" + std::to_string(mem_block.memory_type_);
-  auto it = reusable_block_counts.find(key);
-  if (it != reusable_block_counts.end()) {
-    it->second++;
-  } else {
-    reusable_block_counts[key] = 1;
-  }
-}
-
-void ReduceReusableBlockCount(const MemoryBlock &mem_block, map<string, uint64_t> &reusable_block_counts) {
-  string key = std::to_string(mem_block.Size());
-  key += "_" + std::to_string(mem_block.stream_id_);
-  key += "_" + std::to_string(mem_block.memory_type_);
-  auto it = reusable_block_counts.find(key);
-  if (it != reusable_block_counts.end()) {
-    if (it->second > 0) {
-      it->second--;
-    }
-  }
-}
-
-bool CanReuseBySize(const map<string, uint64_t> &reusable_block_counts, const MemoryBlock &reusable_block,
-                    size_t block_size, size_t real_size, bool continuous) {
+bool CanReuseBlock(size_t continuous_life_begin, const MemoryBlock &reusable_block, size_t block_size) {
   bool can_reuse = false;
   if (reusable_block.Size() == block_size) {
-    can_reuse = true;
-  } else {
-    string key = std::to_string(reusable_block.Size());
-    key += "_" + std::to_string(reusable_block.stream_id_);
-    key += "_" + std::to_string(reusable_block.memory_type_);
-    auto it = reusable_block_counts.find(key);
-    GE_IF_BOOL_EXEC((it != reusable_block_counts.end() && (it->second > kReuseMaxCount)) &&
-                    (reusable_block.Size() > block_size),
-                     can_reuse = true;
-                     GELOGD("Less size mem reuse, reuse block size:%zu, current block size:%zu",
-                            reusable_block.Size(), block_size););
+    // in some continuous input case, continuous first input node's is not same as topo first node.
+    if (continuous_life_begin > 0) {
+      if (continuous_life_begin > reusable_block.GetLifeEnd()) {
+        can_reuse = true;
+      }
+    } else {
+      can_reuse = true;
+    }
   }
   return can_reuse;
 }
@@ -549,6 +647,13 @@ bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t ou
   if (n == nullptr || n->GetAllOutDataAnchors().size() <= 0) {
     return false;
   }
+  auto node_desc = n->GetOpDesc();
+  GE_IF_BOOL_EXEC(node_desc == nullptr, GELOGE(FAILED, "Node[%s] nodedesc is null.", n->GetName().c_str());
+                  return false;);
+  std::vector<int64_t> offsets_for_fusion = {};
+  bool has_lx_fusion_attr =
+      AttrUtils::GetListInt(node_desc, ATTR_NAME_OUTPUT_OFFSET_FOR_BUFFER_FUSION, offsets_for_fusion);
+
   if (static_cast<size_t>(out_index) < n->GetAllOutDataAnchors().size()) {
     auto out_anchor = n->GetOutDataAnchor(out_index);
     GE_IF_BOOL_EXEC(out_anchor == nullptr,
@@ -571,16 +676,17 @@ bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t ou
                       return false;);
 
       // If GetBool fail, is_input_continuous is false.
-      bool is_input_continuous_no_padding = false;
-      (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT,
-                                   is_input_continuous_no_padding);
-      if (is_input_continuous_no_padding) {
+      (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, is_input_continuous);
+      if (is_input_continuous) {
         reset_zero_copy_flag = true;
-        return false;
+        has_lx_fusion_attr = true;
+      } else {
+        (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous);
       }
-      (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous);
 
-      GE_IF_BOOL_EXEC(is_input_continuous && CheckIsZeroMemNodeType(peer_node->GetType()),
+      // lx_fusion memory only assign first input, broadcast's input some are variable some are not, reassign later
+      GE_IF_BOOL_EXEC(is_input_continuous &&
+          (CheckIsZeroMemNodeType(peer_node->GetType()) || (has_lx_fusion_attr && (peer_in_anchor->GetIdx() != 0))),
                       GELOGI("Node[%s] output[%u] no_need_assign_memory.", n->GetName().c_str(), out_index);
                       no_need_assign_memory = true;
                       return false;);
@@ -594,6 +700,10 @@ bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t ou
           // Only set attr one times.
           if (node_continuous_input_blocks_[peer_in_node_desc->GetName()].size() == 0) {
             (void)ge::AttrUtils::SetBool(peer_in_node_desc, ATTR_NAME_CONTINUOUS_INPUT_ALLOC, true);
+            // lx fusion case assign max size for first block, so reuse as none continuous
+            GE_IF_BOOL_EXEC(has_lx_fusion_attr,
+                            is_op_reuse_mem_ = IsContinuousMemoryReuse(n, peer_node, out_index);
+                            return false;);
             node_continuous_input_counts_[peer_in_node_desc->GetName()] = peer_node->GetAllInDataAnchorsSize();
           }
           peer_input_index = peer_in_anchor->GetIdx();
@@ -606,6 +716,95 @@ bool BlockMemAssigner::IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t ou
   return false;
 }
 
+bool IsContinuousInputNodeMaxLife(const NodePtr &n, uint32_t out_index) {
+  if (n == nullptr) {
+    return false;
+  }
+
+  int64_t max_node_life_time = 0;
+  int64_t continuous_input_node_life_time = 0;
+  if (static_cast<size_t>(out_index) < n->GetAllOutDataAnchors().size()) {
+    auto out_anchor = n->GetOutDataAnchor(out_index);
+    if(out_anchor == nullptr) {
+      return false;
+    }
+
+    // continuous input node's life time should be max
+    for (auto const &peer_in_anchor : out_anchor->GetPeerInDataAnchors()) {
+      if ((peer_in_anchor == nullptr) || (peer_in_anchor->GetOwnerNode() == nullptr)){
+        return false;
+      }
+      auto peer_in_node_desc = peer_in_anchor->GetOwnerNode()->GetOpDesc();
+      GE_IF_BOOL_EXEC(peer_in_node_desc == nullptr,
+                      GELOGE(FAILED, "Node[%s] output[%u] peer in node desc is null.", n->GetName().c_str(), out_index);
+      return false;);
+
+      if(peer_in_node_desc->GetId() > max_node_life_time) {
+        max_node_life_time = peer_in_node_desc->GetId();
+      }
+
+      // If GetBool fail, is_input_continuous is false.
+      bool is_input_continuous = false;
+      (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, is_input_continuous);
+      if (!is_input_continuous) {
+        (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous);
+      }
+      if (is_input_continuous) {
+        continuous_input_node_life_time = peer_in_node_desc->GetId();
+      }
+    }
+  }
+  return ((max_node_life_time != 0) && (continuous_input_node_life_time == max_node_life_time)) ;
+}
+
+///
+/// @ingroup GE
+/// @brief Check continuous memory reuseable
+/// @return void
+///
+bool BlockMemAssigner::IsContinuousMemoryReuse(const NodePtr &n, const NodePtr &peer_node, uint32_t out_index) {
+  // n,peer_node_desc have been checked
+  auto node_desc = n->GetOpDesc();
+  auto peer_node_desc = peer_node->GetOpDesc();
+  continuous_life_begin_ = static_cast<size_t>(node_desc->GetId());
+  // lx fusion case check all continuous input node, firt input node's life time should be min
+  for (const auto &in_anchor : peer_node->GetAllInDataAnchors()) {
+    if ((in_anchor == nullptr) || (in_anchor->GetPeerOutAnchor() == nullptr) ||
+        (in_anchor->GetPeerOutAnchor()->GetOwnerNode() == nullptr) ||
+        (in_anchor->GetPeerOutAnchor()->GetOwnerNode()->GetOpDesc() == nullptr)) {
+      GELOGE(FAILED, "Node[%s] output[%u] peer input node desc is null.", n->GetName().c_str(), out_index);
+      return false;
+    }
+    auto peer_out_node_desc = in_anchor->GetPeerOutAnchor()->GetOwnerNode()->GetOpDesc();
+    ///
+    ///  node2 node1  node3
+    ///      |   /   / |
+    ///      node5    node6
+    /// firt input node's life time is not min
+    /// when node5's first input node2's life time is not min(node2 > node1), use node1's life time to reuse
+    ///
+    if (static_cast<size_t>(peer_out_node_desc->GetId()) < continuous_life_begin_) {
+      continuous_life_begin_ = static_cast<size_t>(peer_out_node_desc->GetId());
+      GELOGI(
+        "Node[%s] life[%ld] output[%u] is not continuous input node[%s] life[%ld]'s min life time,"
+        "min is node[%s] life[%zu]",
+        n->GetName().c_str(), node_desc->GetId(), out_index, peer_node_desc->GetName().c_str(),
+        peer_node_desc->GetId(), peer_out_node_desc->GetName().c_str(), continuous_life_begin_);
+    }
+    // when node3's output node5's life time is not max(node6 > node5), not reuse
+    if (!IsContinuousInputNodeMaxLife(in_anchor->GetPeerOutAnchor()->GetOwnerNode(),
+                                      in_anchor->GetPeerOutAnchor()->GetIdx())) {
+      GELOGI(
+        "Node[%s] life[%ld] output[%u]'s continuous input node[%s] life[%ld]'s is not node[%s] output[%d]'s "
+        "max life node",
+        n->GetName().c_str(), node_desc->GetId(), out_index, peer_node_desc->GetName().c_str(),
+        peer_node_desc->GetId(), peer_out_node_desc->GetName().c_str(), in_anchor->GetPeerOutAnchor()->GetIdx());
+      return false;
+    }
+  }
+  return true;
+}
+
 ///
 /// @ingroup GE
 /// @brief Check pre_reuse flag & post_reuse glag for each symbol
@@ -860,38 +1059,40 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size,
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(n == nullptr, return nullptr, "Input parameter n is null.");
   auto node_op_desc = n->GetOpDesc();
   GE_IF_BOOL_EXEC(node_op_desc == nullptr, return nullptr);
+  std::string batch_label;
+  (void)ge::AttrUtils::GetStr(node_op_desc, ATTR_NAME_BATCH_LABEL, batch_label);
+  if (batch_label.empty() || (batch_label == max_batch_label_)) {
+    size_t align_size = real_size;
+    AlignMemOffset(align_size);
+    theory_memory_size_ += align_size;
+    if (theory_memory_size_ > theory_min_memory_size_) {
+      theory_min_memory_size_ = theory_memory_size_;
+    }
+  }
 
   bool is_reuse_memory = false;
-  string ge_disable_reuse_mem_env = "0";
-  (void)ge::GetContext().GetOption(OPTION_EXEC_DISABLE_REUSED_MEMORY, ge_disable_reuse_mem_env);
-  if (ge_disable_reuse_mem_env != "1") {
+  if (ge_disable_reuse_mem_env_ != "1") {
     bool reuse_mem_flag = (mem_type == kOutput) ? IsPreReuse(n, out_index) :
                           !((workspace_reuse_flag.size() > out_index) && !workspace_reuse_flag[out_index]);
     is_reuse_memory = !node_op_desc->HasAttr(kL2FusionDynamicConvergeOp) &&
                       !node_op_desc->HasAttr(kOpNoReuseMem) && reuse_mem_flag && is_op_reuse_mem;
-    auto stream_id = node_op_desc->GetStreamId();
-    if (is_reuse_memory && !continuous && !reusable_blocks_[memory_type].empty()) {
-      for (auto it = reusable_blocks_[memory_type][stream_id].begin();
-           it != reusable_blocks_[memory_type][stream_id].end(); ++it) {
+    bool do_reuse = is_reuse_memory && !continuous && !reusable_blocks_[memory_type].empty();
+    if (do_reuse) {
+      auto stream_id = node_op_desc->GetStreamId();
+      for (auto it = reusable_blocks_[memory_type][stream_id].rbegin();
+           it != reusable_blocks_[memory_type][stream_id].rend(); ++it) {
         MemoryBlock *reusable_block = *it;
         if (!IsPostReuse(reusable_block)) {
           reusable_block->reuse_mem_ = false;
           GELOGI("Unreusable block.");
           continue;
         }
-        std::string batch_label;
-        if (reusable_block->IsSameLabel(batch_label)) {
-          std::string op_label;
-          (void)ge::AttrUtils::GetStr(node_op_desc, ATTR_NAME_BATCH_LABEL, op_label);
-          if (batch_label != op_label) {
-            GELOGI("label diff, op name %s", node_op_desc->GetName().c_str());
-            continue;
-          }
-        }
+        GE_IF_BOOL_EXEC(reusable_block->batch_label_ != batch_label, continue);
 
         // A node can reuse blocks of the same stream and preorder streams
-        if (CanReuseBySize(reusable_block_counts_, *reusable_block, block_size, real_size, continuous)) {
-          reusable_block->AddNodeTypeIndex({n, mem_type, out_index, false}, real_size, no_align_size);
+        if (CanReuseBlock(continuous_life_begin_, *reusable_block, block_size)) {
+          reusable_block->AddNodeTypeIndex({n, mem_type, out_index, false, continuous_life_begin_},
+                                           real_size, no_align_size);
           if (mem_type == kOutput) {
             auto iter = anchor_to_symbol_.find(NodeIndexIO(n, out_index, kOut).ToString());
             if (iter != anchor_to_symbol_.end()) {
@@ -900,8 +1101,7 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size,
           }
           reusable_block->continuous_block_ = continuous;
           reusable_block->ref_count_++;
-          ReduceReusableBlockCount(*reusable_block, reusable_block_counts_);
-          reusable_blocks_[memory_type][stream_id].erase(it);
+          reusable_blocks_[memory_type][stream_id].erase((++it).base());
           return reusable_block;
         }
       }
@@ -913,11 +1113,11 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size,
 
   // Data and netoutput need zero copy block
   block->is_zero_copy_ = IsZeroCopyBlock(n, continuous);
-
-  block->Init(real_size, mem_type, n, out_index, no_align_size);
+  block->AddNodeTypeIndex({n, mem_type, out_index, false, continuous_life_begin_}, real_size, no_align_size);
   block->stream_id_ = node_op_desc->GetStreamId();
   block->ref_count_++;
   block->continuous_block_ = continuous;
+  block->batch_label_ = batch_label;
   if (mem_type == kOutput) {
     auto iter = anchor_to_symbol_.find(NodeIndexIO(n, out_index, kOut).ToString());
     if (iter != anchor_to_symbol_.end()) {
@@ -945,6 +1145,11 @@ MemoryBlock *BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vec
       return nullptr;
     }
 
+    if (CheckIsZeroMemNodeType(n->GetType())) {
+      zero_memory_list_.emplace_back(n, kOutput, index);
+      continue;
+    }
+
     int64_t size = 0;
     if (ge::TensorUtils::GetSize(*output_op_desc, size) != SUCCESS) {
       GELOGI("Get size failed");
@@ -957,9 +1162,7 @@ MemoryBlock *BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vec
     // only apply total size in first block
     if (index != 0) {
       zero_memory_list_.emplace_back(n, kOutput, index);
-    }
-
-    if (index == 0) {
+    } else {
       NodeIndexIO node_index_io(n, index, kOut);
       auto iter = anchor_to_symbol_.find(node_index_io.ToString());
       if (iter != anchor_to_symbol_.end()) {
@@ -972,6 +1175,10 @@ MemoryBlock *BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vec
     }
   }
 
+  if (total_size == 0) {
+    return nullptr;
+  }
+
   auto block_size = GetBlockSize(total_size, ranges);
   GELOGI("Node[%s] continuous out memory size[%ld] block size[%zu]", node_op_desc->GetName().c_str(),
          total_size, block_size);
@@ -1006,8 +1213,23 @@ MemoryBlock *BlockMemAssigner::ApplyOutMemory(const NodePtr &n, uint32_t index,
   std::string symbol;
   if (IsSymbolExist(node_index_io, symbol)) {
     block = symbol_blocks_[symbol];
-    block->AddNodeTypeIndex({n, kOutput, index, true}, size, no_align_size);
+    GE_IF_BOOL_EXEC(block == nullptr, GELOGE(FAILED, "Node %s ref block is nullptr.", node_op_desc->GetName().c_str());
+        return nullptr);
+    // reduce old size
+    size_t align_size = block->Size();
+    AlignMemOffset(align_size);
+    theory_memory_size_ -= align_size;
+
+    auto block_size = GetBlockSize(size, ranges);
+    block->SetSize(block_size);
+    block->SetLifeTimeEnd(life_time_);
+    block->AddNodeTypeIndex({n, kOutput, index, true, continuous_life_begin_}, size, no_align_size);
     block->ref_count_++;
+
+    // add new size
+    align_size = block_size;
+    AlignMemOffset(align_size);
+    theory_memory_size_ += align_size;
   } else {
     int64_t max_size = size;
     int64_t memory_type = RT_MEMORY_HBM;
@@ -1060,7 +1282,6 @@ MemoryBlock *BlockMemAssigner::ApplyOutMemory(const NodePtr &n, uint32_t index,
       GE_IF_BOOL_EXEC(ge::TensorUtils::GetReuseInputIndex(*owner_node_op_desc, dst_reuse_input_index) != SUCCESS,
                       GELOGI("Get dst_reuse_input_index failed"));
       if (dst_reuse_input && (dst_reuse_input_index == static_cast<uint32_t>(in_anchor->GetIdx()))) {
-        block->AddNodeTypeIndex({owner_node, kOutput, i, true}, block->Size(), block->Size());
         out_count_reuse_input += 1;
         reuse_input = true;
       }
@@ -1101,7 +1322,7 @@ bool IsAtomicOutputMemory(const ge::NodePtr &node, uint32_t output_index, bool i
       if (static_cast<uint32_t>(index) == output_index) {
         if (node->GetOwnerComputeGraph() != nullptr) {
           string graph_name = node->GetOwnerComputeGraph()->GetName();
-          GELOGD("[IMAS]Atomic no assign %s name[%s] output[%ld] streamid[%ld].", graph_name.c_str(),
+          GELOGD("Atomic no assign %s name[%s] output[%ld] streamid[%ld].", graph_name.c_str(),
                  op_desc->GetName().c_str(), index, op_desc->GetStreamId());
         }
         return true;
@@ -1119,15 +1340,27 @@ bool IsKnownSubgraphData(const NodePtr &node) {
   return node->GetOpDesc()->HasAttr(ATTR_NAME_PARENT_NODE_INDEX);
 }
 
-void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory) {
+void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory,
+                                     bool same_stream) {
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(to_release == nullptr, return, "Input parameter to_release is null.");
   GE_CHK_TRUE_EXEC_INFO(to_release->ref_count_ <= 0, return, "Release memory");
   GE_CHK_TRUE_EXEC_INFO(!to_release->reuse_mem_, return, "doesn't reuse memory");
   --to_release->ref_count_;
+  if (!same_stream) {
+    to_release->same_stream_ = false;
+  }
   if (to_release->ref_count_ == 0) {
-    to_release->SetLifeTimeEnd(life_time_);
-    reusable_memory.emplace_back(to_release);
-    AddReusableBlockCount(*to_release, reusable_block_counts_);
+    if (to_release->reuse_mem_ && !to_release->RealSizeList().empty()) {
+      if (to_release->batch_label_.empty() || (to_release->batch_label_ == max_batch_label_)) {
+        size_t align_size = to_release->RealSizeList().back();
+        AlignMemOffset(align_size);
+        theory_memory_size_ -= align_size;
+      }
+    }
+    if (to_release->same_stream_) {
+      to_release->SetLifeTimeEnd(life_time_);
+      reusable_memory.emplace_back(to_release);
+    }
   }
 }
 
@@ -1167,10 +1400,9 @@ void BlockMemAssigner::ReleaseInputNodeOutMemory(const unordered_map<string, vec
              node_type_indexs.back().node->GetName().c_str());
 
       if ((node_type_indexs.back().node == in_anchor->GetPeerOutAnchor()->GetOwnerNode()) &&
-          (node_type_indexs.back().index == static_cast<uint32_t>(in_anchor->GetPeerOutAnchor()->GetIdx())) &&
-          (node->GetOpDesc()->GetStreamId() == block->stream_id_)) {
-        ReleaseMemory(block, reusable_memory);
-        if (block->ref_count_ == 0) {
+          (node_type_indexs.back().index == static_cast<uint32_t>(in_anchor->GetPeerOutAnchor()->GetIdx()))) {
+        ReleaseMemory(block, reusable_memory, (node->GetOpDesc()->GetStreamId() == block->stream_id_));
+        if (block->ref_count_ == 0 && block->same_stream_) {
           SetLastUsedInputMemAttr(node, in_anchor->GetIdx());
         }
       }
@@ -1227,6 +1459,7 @@ Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector
   }
 
   is_op_reuse_mem_ = true;
+  continuous_life_begin_ = 0;
   if (op_reuse_env_valid_ == true) {
     vector<string>::iterator it_name =
       std::find(op_no_reuse_mem_vec_.begin(), op_no_reuse_mem_vec_.end(), op_desc->GetName());
@@ -1267,7 +1500,8 @@ Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector
     bool no_need_assign_memory = ((size == 0) || CheckIsZeroMemNodeType(node->GetType()));
     if (!no_need_assign_memory) {
       out_node_set_continuous_input =
-          IsOutNodeSetContinuousInput(node, i, peer_name, peer_input_index, no_need_assign_memory, reset_zero_copy_flag);
+          IsOutNodeSetContinuousInput(node, i, peer_name, peer_input_index,
+                                      no_need_assign_memory, reset_zero_copy_flag);
       GE_IF_BOOL_EXEC(!no_need_assign_memory,
           no_need_assign_memory = IsAtomicOutputMemory(node, i, is_atomic, out_node_set_continuous_input););
     }
@@ -1277,7 +1511,7 @@ Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector
       continue;
     }
     // atomic can't be reused
-    bool need_change = is_op_reuse_mem_ && out_node_set_continuous_input && is_atomic;
+    bool need_change = is_op_reuse_mem_ && is_atomic;
     if (need_change) {
       is_op_reuse_mem_ = false;
     }
@@ -1328,7 +1562,8 @@ void BlockMemAssigner::AssignMemoryWithReuse(vector<int64_t> &ranges) {
       iter->second[stream_id].clear();
     }
     vector<int64_t> temp;
-    GetNodeWorkSpaceSize(n, temp);
+    int64_t tatal_size = 0;
+    GetNodeWorkSpaceSize(n, temp, tatal_size);
     vector<int64_t> workspace_bytes;
     vector<int64_t> tvm_workspace_memory_type;
     bool has_tvm_workspace_mem_type_attr =
@@ -1349,7 +1584,7 @@ void BlockMemAssigner::AssignMemoryWithReuse(vector<int64_t> &ranges) {
       bool workspace_skip_flag = false;
       if (has_tvm_workspace_mem_type_attr && tvm_workspace_memory_type[i] == RT_MEMORY_L1) {
         GELOGI(
-            "fusion: node[%s]workspace index[%zu] is not hbm type, add to zero_memory_list, workspace memory type [%ld]",
+            "fusion:node[%s]workspace index[%zu] is not hbm type, add to zero_memory_list, workspace memory type [%ld]",
             node_op_desc->GetName().c_str(), i, tvm_workspace_memory_type[i]);
         workspace_skip_flag = true;
       }
@@ -1380,9 +1615,7 @@ void BlockMemAssigner::AssignMemoryWithReuse(vector<int64_t> &ranges) {
     (void)mem_block;  // Fix warning
   }
 
-  bool merge_dynamic_batch = false;
-  GE_IF_BOOL_EXEC(!(ge_disable_reuse_mem_env_ == "1"), merge_dynamic_batch = MergeDynamicBatchBlocks());
-  GE_IF_BOOL_EXEC((!(ge_disable_reuse_mem_env_ == "1") && !merge_dynamic_batch), ReuseBlocksByLifeTime(ranges.size()));
+  GE_IF_BOOL_EXEC(!(ge_disable_reuse_mem_env_ == "1"), ReuseBlocksByLifeTime(ranges.size()));
   AssignContinuousBlocks();
   ResizeMemoryBlocks();
 
@@ -1402,92 +1635,19 @@ void BlockMemAssigner::CheckWorkspaceReuse(const vector<bool> &workspace_reuse_f
   }
 }
 
-void BlockMemAssigner::GetNodeWorkSpaceSize(const NodePtr &node, vector<int64_t> &workspace_memory) {
+void BlockMemAssigner::GetNodeWorkSpaceSize(const NodePtr &node, vector<int64_t> &workspace_memory,
+                                            int64_t &total_size) {
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(node->GetOpDesc() == nullptr, return, "Op desc is null.");
   vector<int64_t> workspace_byte_nums = node->GetOpDesc()->GetWorkspaceBytes();
 
   GELOGD("node[%s] size:%zu", node->GetOpDesc()->GetName().c_str(), workspace_byte_nums.size());
   for (int64_t byte_size : workspace_byte_nums) {
     workspace_memory.emplace_back(byte_size);
+    total_size += byte_size;
     GELOGD("push back size:%ld", byte_size);
   }
 }
 
-// descending order
-static bool CompareBlockMaxSize(MemoryBlock *left, MemoryBlock *right) {
-  if (left == nullptr || right == nullptr) {
-    return false;
-  }
-  auto left_max_size = std::max_element(left->RealSizeList().begin(), left->RealSizeList().end());
-  if (left_max_size != left->RealSizeList().end()) {
-    auto right_max_size = std::max_element(right->RealSizeList().begin(), right->RealSizeList().end());
-    if (right_max_size == right->RealSizeList().end() || (*left_max_size > *right_max_size)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-void MergeBlocks(std::vector<MemoryBlock *> &dest, std::vector<MemoryBlock *> &src) {
-  for (size_t i = 0; i < dest.size(); ++i) {
-    if (i >= src.size()) {
-      return;
-    }
-    if (dest[i] != nullptr && src[i] != nullptr) {
-      if (!dest[i]->reuse_mem_ || !src[i]->reuse_mem_) {
-        GELOGD("Diff batch's workspace can't be reused, i: %zu, dest[i]: %s, stream: %ld, src[i]: %s, stream: %ld.",
-               i, dest[i]->String().c_str(), dest[i]->stream_id_, src[i]->String().c_str(), src[i]->stream_id_);
-        continue;
-      }
-      for (auto &symbol : src[i]->SymbolList()) {
-        dest[i]->AddSymbol(symbol);
-      }
-      for (size_t j = 0; j < src[i]->NodeTypeIndexList().size(); ++j) {
-        dest[i]->AddNodeTypeIndex(src[i]->NodeTypeIndexList()[j],
-                                  src[i]->RealSizeList()[j],
-                                  src[i]->NoAlignSizeList()[j]);
-        src[i]->deleted_block_ = true;
-      }
-    }
-  }
-}
-
-bool BlockMemAssigner::MergeDynamicBatchBlocks() {
-  bool merged = false;
-  std::map<std::string, std::vector<MemoryBlock *>> dynamic_batch_blocks;
-  for (auto block : memory_blocks_) {
-    if (block == nullptr) {
-      continue;
-    }
-    std::string batch_label;
-    if (block->IsSameLabel(batch_label)) {
-      dynamic_batch_blocks[batch_label].emplace_back(block);
-    }
-  }
-
-  auto it = dynamic_batch_blocks.begin();
-  auto it_max = it;
-
-  // find max block counts
-  for (; it != dynamic_batch_blocks.end(); ++it) {
-    if (it->second.size() > it_max->second.size()) {
-      it_max = it;
-    }
-    std::sort(it->second.begin(), it->second.end(), CompareBlockMaxSize);
-  }
-  if (it_max != dynamic_batch_blocks.end()) {
-    GELOGD("MergeDynamicBatch %s block counts %zu", it_max->first.c_str(), it_max->second.size());
-  }
-  for (it = dynamic_batch_blocks.begin(); it != dynamic_batch_blocks.end(); ++it) {
-    if (it != it_max) {
-      GELOGD("MergeDynamicBatch from %s to %s", it->first.c_str(), it_max->first.c_str());
-      MergeBlocks(it_max->second, it->second);
-      merged = true;
-    }
-  }
-  return merged;
-}
-
 // asending order
 static bool CompareBlockIndex(MemoryBlock *left, MemoryBlock *right) {
   if (left == nullptr || right == nullptr) {
@@ -1597,38 +1757,93 @@ void BlockMemAssigner::ReuseBlocksByLifeTime(size_t range_size) {
   }
 }
 
+void AddBlockMemOffset(size_t &mem_offset, size_t &p2p_mem_offset, MemoryBlock &block) {
+  if (block.memory_type_ == RT_MEMORY_HBM) {
+    if (block.first_continuous_block_) {
+      mem_offset += MEM_ALIGN_SIZE;
+    }
+    block.Resize();
+    block.SetHeadOffset(mem_offset);
+    mem_offset += block.Size();
+    block.SetTailOffset(mem_offset - 1);
+  } else if (block.memory_type_ == RT_MEMORY_P2P_DDR) {
+    if (block.first_continuous_block_) {
+      p2p_mem_offset += MEM_ALIGN_SIZE;
+    }
+    block.Resize();
+    block.SetHeadOffset(p2p_mem_offset);
+    p2p_mem_offset += block.Size();
+    block.SetTailOffset(p2p_mem_offset - 1);
+  }
+}
+
+bool DynamicBatchBlockReuse(MemoryBlock &block) {
+  return (block.IsSameBatchLabel() && block.reuse_mem_);
+}
+
 ///
 /// @ingroup domi_omg
-/// @brief traverse memory size, resize, calculate offset
+/// @brief get max batch memory size, others reuse this block memory
 /// @param [in&out] memory_blocks_ memory block, after calculating offset
+/// |-dynamic batch block batch1|
+/// |-dynamic batch block batch2----|
+/// |-dynamic batch block batch3--|
 ///
-void BlockMemAssigner::ResizeMemoryBlocks() {
-  for (auto &memory_block : memory_blocks_) {
-    if (memory_block == nullptr || memory_block->deleted_block_ || memory_block->is_zero_copy_) {
+void BlockMemAssigner::ResizeDynamicBatchBlocks() {
+  std::map<std::string, std::vector<MemoryBlock *>> dynamic_batch_blocks;
+  for (auto block : memory_blocks_) {
+    if (block == nullptr) {
       continue;
     }
-    if (memory_block->memory_type_ == RT_MEMORY_HBM) {
-      if (memory_block->first_continuous_block_) {
-        mem_offset_ += MEM_ALIGN_SIZE;
-      }
+    // when memory is not reuseable, it can't be reused by different branch
+    if (DynamicBatchBlockReuse(*block)) {
+      dynamic_batch_blocks[block->batch_label_].emplace_back(block);
+    }
+  }
 
-      memory_block->Resize();
-      memory_block->SetHeadOffset(mem_offset_);
-      mem_offset_ += memory_block->Size();
-      memory_block->SetTailOffset(mem_offset_ - 1);
-    } else if (memory_block->memory_type_ == RT_MEMORY_P2P_DDR) {
-      if (memory_block->first_continuous_block_) {
-        p2p_mem_offset_ += MEM_ALIGN_SIZE;
+  size_t max_mem_offset = mem_offset_;
+  size_t max_p2p_mem_offset = p2p_mem_offset_;
+  for (auto &batch_blocks : dynamic_batch_blocks) {
+    size_t mem_offset = mem_offset_;
+    size_t p2p_mem_offset = p2p_mem_offset_;
+    for (auto block : batch_blocks.second) {
+      if (block == nullptr || block->deleted_block_ || block->is_zero_copy_) {
+        continue;
       }
+      AddBlockMemOffset(mem_offset, p2p_mem_offset, *block);
+    }
+    if (mem_offset > max_mem_offset) {
+      max_mem_offset = mem_offset;
+    }
+    if (p2p_mem_offset > max_p2p_mem_offset) {
+      max_p2p_mem_offset = p2p_mem_offset;
+    }
+    GELOGI("Batch[%s] offset[%zu] p2p_offset[%zu]", batch_blocks.first.c_str(), mem_offset, p2p_mem_offset);
+  }
+  mem_offset_ = max_mem_offset;
+  p2p_mem_offset_ = max_p2p_mem_offset;
+}
 
-      memory_block->Resize();
-      memory_block->SetHeadOffset(p2p_mem_offset_);
-      p2p_mem_offset_ += memory_block->Size();
-      memory_block->SetTailOffset(p2p_mem_offset_ - 1);
+///
+/// @ingroup domi_omg
+/// @brief traverse memory size, resize, calculate offset
+/// @param [in&out] memory_blocks_ memory block, after calculating offset
+/// |-not dynamic batch block-||-dynamic batch block batch1|    |-zero copy block-|
+/// |-not dynamic batch block-||-dynamic batch block batch2----||-zero copy block-|
+/// |-not dynamic batch block-||-dynamic batch block batch3--|  |-zero copy block-|
+///
+void BlockMemAssigner::ResizeMemoryBlocks() {
+  for (auto &memory_block : memory_blocks_) {
+    if (memory_block == nullptr || memory_block->deleted_block_ || memory_block->is_zero_copy_
+        || DynamicBatchBlockReuse(*memory_block)) {
+      continue;
     }
+
+    AddBlockMemOffset(mem_offset_, p2p_mem_offset_, *memory_block);
   }
-  GELOGD("mem_offset_ exclude zero_copy_memory is %zu, p2p_mem_offset_ exclude zero_copy_memory is %zu.",
-         mem_offset_, p2p_mem_offset_);
+  ResizeDynamicBatchBlocks();
+  GELOGI("mem_offset_ exclude zero_copy_memory is %zu, p2p_mem_offset_ exclude zero_copy_memory is %zu,"
+         "theory_min_memory_size %zu", mem_offset_, p2p_mem_offset_, theory_min_memory_size_);
 }
 
 ///
@@ -1641,7 +1856,7 @@ void BlockMemAssigner::ResizeMemoryBlocks() {
 /// @return Status result
 ///
 void SetOffsetSize(const NodeTypeIndex &node_type, const MemoryBlock *block,
-                   size_t real_size, size_t no_align_size, bool child_block) {
+                   size_t real_size, size_t no_align_size, int32_t child_block_level) {
   ge::OpDescPtr op_desc = node_type.node->GetOpDesc();
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(op_desc == nullptr, return, "op_desc is null.");
   string graph_name = node_type.node->GetOwnerComputeGraph()->GetName();
@@ -1689,14 +1904,16 @@ void SetOffsetSize(const NodeTypeIndex &node_type, const MemoryBlock *block,
     }
     op_desc->SetWorkspace(workspace_list);
   }
-  GELOGI("[IMAS]Set %s name[%s] %s[%u] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu]"
-         " noalignsize[%zu] life time begin[%zu] life time end[%zu] child[%d:%d:%d:%d] isref[%d].", graph_name.c_str(),
-         op_desc->GetName().c_str(), node_type.GetMemType().c_str(), node_type.index, offset, op_desc->GetStreamId(),
-         block->Size(), real_size, no_align_size, op_desc->GetId(), end, child_block, block->reuse_mem_,
-         block->continuous_block_, block->deleted_block_, node_type.ref_input);
+  GELOGI("[IMAS]Set %s name[%s] optype[%s] %s[%u] offset to [%ld] streamid[%ld] memtype[%ld] size[%zu] realsize[%zu] "
+         "noalignsize[%zu] life time begin[%s] life time end[%zu] child[%d:%d:%d:%d:%d] isref[%d] batch[%s]",
+         graph_name.c_str(), op_desc->GetName().c_str(), node_type.node->GetType().c_str(),
+         node_type.GetMemType().c_str(), node_type.index, offset, op_desc->GetStreamId(),block->memory_type_,
+         block->Size(), real_size, no_align_size, node_type.GetLifeBeginDesc().c_str(), end, child_block_level,
+         block->reuse_mem_, block->continuous_block_, block->is_zero_copy_, block->same_stream_, node_type.ref_input,
+         block->batch_label_.c_str());
 }
 
-void SetBlockOpMemOffset(MemoryBlock *block, bool child_block) {
+void SetBlockOpMemOffset(MemoryBlock *block, int32_t child_block_level) {
   if (block == nullptr) {
     return;
   }
@@ -1709,9 +1926,14 @@ void SetBlockOpMemOffset(MemoryBlock *block, bool child_block) {
       real_size = block->RealSizeList()[index];
       no_align_size = block->NoAlignSizeList()[index];
     }
-    SetOffsetSize(node_type_index, block, real_size, no_align_size, child_block);
+    SetOffsetSize(node_type_index, block, real_size, no_align_size, child_block_level);
     index++;
   }
+
+  child_block_level++;
+  for (MemoryBlock *child_block : block->ChildBlockList()) {
+      SetBlockOpMemOffset(child_block, child_block_level);
+  }
 }
 
 void BlockMemAssigner::SetOpMemOffset(bool is_zero_copy) {
@@ -1724,16 +1946,13 @@ void BlockMemAssigner::SetOpMemOffset(bool is_zero_copy) {
       continue;
     }
 
-    SetBlockOpMemOffset(memory_block, false);
-    for (MemoryBlock *child_block : memory_block->ChildBlockList()) {
-      SetBlockOpMemOffset(child_block, true);
-    }
+    SetBlockOpMemOffset(memory_block, 0);
   }
 
   if (!is_zero_copy) {
     for (const NodeTypeIndex &node_type_index : zero_memory_list_) {
       MemoryBlock block(0, 0);
-      SetOffsetSize(node_type_index, &block, 0, 0, false);
+      SetOffsetSize(node_type_index, &block, 0, 0, 0);
     }
   }
 }
diff --git a/ge/graph/build/memory/block_mem_assigner.h b/ge/graph/build/memory/block_mem_assigner.h
index f3d26c1d..78584078 100755
--- a/ge/graph/build/memory/block_mem_assigner.h
+++ b/ge/graph/build/memory/block_mem_assigner.h
@@ -39,14 +39,15 @@ using DependStreamLife = std::map<int64_t, std::map<int64_t, size_t>>;
 enum OpMemoryType { kOutput, kWorkspace };
 
 struct NodeTypeIndex {
-  NodeTypeIndex(ge::NodePtr node, OpMemoryType mem_type, uint32_t index, bool ref_input = false)
-      : node(std::move(node)), mem_type(mem_type), index(index), ref_input(ref_input) {}
+  NodeTypeIndex(ge::NodePtr node, OpMemoryType mem_type, uint32_t index, bool ref_input = false, size_t begin = 0)
+      : node(std::move(node)), mem_type(mem_type), index(index), ref_input(ref_input), life_time_begin(begin) {}
 
   ge::NodePtr node = nullptr;
   OpMemoryType mem_type = kOutput;
   uint32_t index = 0;
-  size_t life_time_end = kMaxLifeTime;
   bool ref_input = false;
+  size_t life_time_begin = 0;
+  size_t life_time_end = kMaxLifeTime;
   const string GetMemType() const {
     if (mem_type == kOutput) {
       return "output";
@@ -55,6 +56,34 @@ struct NodeTypeIndex {
     }
     return "unknown";
   }
+
+  size_t GetLifeBegin() const {
+    if ((node == nullptr) || (node->GetOpDesc() == nullptr)) {
+      return 0;
+    }
+
+    if ((life_time_begin > 0) && (life_time_begin < static_cast<size_t>(node->GetOpDesc()->GetId()))) {
+      return life_time_begin;
+    } else {
+      return node->GetOpDesc()->GetId();
+    }
+  }
+
+  std::string GetLifeBeginDesc() const {
+    if (node == nullptr) {
+      return "";
+    }
+    auto node_op_desc = node->GetOpDesc();
+    if (node_op_desc != nullptr) {
+      auto life_begin = GetLifeBegin();
+      if (life_begin != static_cast<size_t>(node_op_desc->GetId())) {
+        return std::to_string(life_begin) + "-" + std::to_string(node_op_desc->GetId());
+      } else {
+        return std::to_string(node_op_desc->GetId());
+      }
+    }
+    return "";
+  }
 };
 
 class MemoryBlock {
@@ -65,6 +94,7 @@ class MemoryBlock {
         stream_id_(stream_id),
         deleted_block_(false),
         reuse_mem_(reuse_mem),
+        same_stream_(true),
         input_index_(0),
         continuous_block_(false),
         first_continuous_block_(false),
@@ -85,13 +115,14 @@ class MemoryBlock {
     symbol_list_.clear();
   }
 
-  void Init(size_t real_size, OpMemoryType type, const ge::NodePtr &node, uint32_t out_index, size_t no_align_size) {
-    real_size_list_.emplace_back(real_size);
-    no_align_size_list_.emplace_back(no_align_size);
-    node_type_index_list_.emplace_back(node, type, out_index, false);
-  }
   size_t Size() const { return block_size_; }
 
+  void SetSize(size_t size) {
+    if (size > block_size_) {
+      block_size_ = size;
+    }
+  }
+
   size_t AlignSize() const;
 
   void SetHeadOffset(size_t offset);
@@ -106,6 +137,12 @@ class MemoryBlock {
     node_type_index_list_.emplace_back(node_type_index);
     real_size_list_.emplace_back(real_size);
     no_align_size_list_.emplace_back(no_align_size);
+    if ((node_type_index.node != nullptr) && (node_type_index.node->GetOpDesc() != nullptr)) {
+      auto stream_id = node_type_index.node->GetOpDesc()->GetStreamId();
+      if (stream_id != stream_id_) {
+        same_stream_ = false;
+      }
+    }
   }
 
   void AddSymbol(const std::string &symbol) {
@@ -122,7 +159,7 @@ class MemoryBlock {
 
   std::string String();
 
-  bool IsSameLabel(std::string &first_batch_label);
+  bool IsSameBatchLabel();
 
   void AddContinuousLifeReuseBlock(MemoryBlock *block, DependStreamLife &total_node_depend_stream_life);
 
@@ -132,7 +169,7 @@ class MemoryBlock {
 
   size_t GetLifeBegin();
 
-  size_t GetLifeEnd();
+  size_t GetLifeEnd() const;
 
   void AddDependLifeBegin(DependStreamLife &node_depend_stream_life);
 
@@ -142,6 +179,7 @@ class MemoryBlock {
   int64_t stream_id_;
   bool deleted_block_;
   bool reuse_mem_;
+  bool same_stream_;
   uint32_t input_index_;
   bool continuous_block_;
   bool first_continuous_block_;
@@ -149,6 +187,7 @@ class MemoryBlock {
   bool is_zero_copy_;
   std::map<int64_t, size_t> depend_stream_life_;
   int64_t memory_type_;
+  std::string batch_label_;
  private:
   size_t block_size_;
   std::vector<size_t> real_size_list_;
@@ -199,6 +238,7 @@ class BlockMemAssigner : public MemAssigner {
 
   void SetOpMemOffset(bool is_zero_copy);
 
+  std::string GetMaxBatchLabel() const { return max_batch_label_; }
  protected:
   ///
   /// @ingroup domi
@@ -209,7 +249,7 @@ class BlockMemAssigner : public MemAssigner {
 
   void GetOutAndWorkSpaceMem(std::vector<int64_t> &all_memory_size);
 
-  void GetNodeWorkSpaceSize(const ge::NodePtr &node, std::vector<int64_t> &workspace_memory);
+  void GetNodeWorkSpaceSize(const ge::NodePtr &node, std::vector<int64_t> &workspace_memory, int64_t &total_size);
 
   ///
   /// @ingroup GE
@@ -353,7 +393,7 @@ class BlockMemAssigner : public MemAssigner {
   /// @return void
   /// @author
   ///
-  void ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory);
+  void ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory, bool same_stream = true);
 
   ///
   /// @ingroup GE
@@ -379,11 +419,11 @@ class BlockMemAssigner : public MemAssigner {
 
   ///
   /// @ingroup GE
-  /// @brief Merge memory blocks between different batchs
+  /// @brief Resize memory blocks for each batchs
   /// @return merge or not
   /// @author
   ///
-  bool MergeDynamicBatchBlocks();
+  void ResizeDynamicBatchBlocks();
 
   void AssignContinuousBlocks();
 
@@ -392,6 +432,7 @@ class BlockMemAssigner : public MemAssigner {
   bool IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t out_index, std::string &peer_name,
                                    uint32_t &peer_input_index, bool &no_need_assign_memory, bool &reset_zero_copy_flag);
 
+  bool IsContinuousMemoryReuse(const NodePtr &n, const NodePtr &peer_node, uint32_t out_index);
   ///
   /// @ingroup GE
   /// @|+++++++++block1++++++++|                               |+++++++++block1++++++++|
@@ -411,8 +452,6 @@ class BlockMemAssigner : public MemAssigner {
 
   std::unordered_map<int64_t, std::unordered_map<int64_t, std::vector<MemoryBlock *>>> reusable_blocks_;
 
-  std::map<std::string, uint64_t> reusable_block_counts_;
-
   std::unordered_map<int64_t, std::unordered_map<int64_t, std::vector<MemoryBlock *>>> stream_workspace_blocks_;
 
   std::unordered_map<std::string, std::vector<MemoryBlock *>> node_out_blocks_;
@@ -436,6 +475,18 @@ class BlockMemAssigner : public MemAssigner {
 
   int64_t atomic_addr_clean_id_ = 0;
 
+  size_t theory_min_memory_size_ = 0;
+
+  size_t theory_memory_size_ = 0;
+
+  std::string max_batch_label_;
+
+  size_t continuous_life_begin_ = 0;
+  ///
+  /// @          [stream1][nodeid]
+  /// @[nodeid]  [stream2][nodeid]
+  /// @          [stream2][nodeid]
+  ///
   DependStreamLife total_node_depend_stream_life_;
 };
 }  // namespace ge
diff --git a/ge/graph/build/memory/graph_mem_assigner.cc b/ge/graph/build/memory/graph_mem_assigner.cc
index ad0235d5..f94eb275 100755
--- a/ge/graph/build/memory/graph_mem_assigner.cc
+++ b/ge/graph/build/memory/graph_mem_assigner.cc
@@ -32,14 +32,12 @@
 #include "graph/utils/type_utils.h"
 
 namespace {
-const int kDataOutputIndex = 0;
 const int kAllInputAddrIsAtomic = -1;
 const int kVirtualInputNodeMemoryReuse = 0;
 const int kVirtualOutputNodeMemoryReuse = 1;
-const size_t kVirtualInputNodeOutputSize = 1;
-const size_t kVirtualOutputNodeInputSize = 1;
-const size_t kVirtualNodeDataIndex = 0;
-const char *const kMbatchNodeNameFlag = "_ascend_mbatch_batch_";
+// One state per bit cannot be repeated
+enum ContinuousType { kTypeInput = 1, kTypeInputNoPadding = 2, kTypeOutput = 4, kTypeOutputNoPadding = 8 };
+
 int64_t GetSymbolOutputOffset(const std::map<std::string, std::string> &anchor_to_symbol,
                               const std::map<std::string, std::list<ge::NodeIndexIO>> &symbol_to_anchors,
                               const ge::NodePtr &node, const uint32_t i) {
@@ -99,7 +97,7 @@ Status GraphMemoryAssigner::AssignMemory() {
   MemoryOffset memory_offset(RT_MEMORY_HBM, mem_assigner->GetMemOffset());
   memory_offset_.emplace(RT_MEMORY_HBM, memory_offset);
 
-  if (mem_assigner->GetP2PMemOffset() > 0) {
+  if (mem_assigner->GetP2PMemOffset() >= 0) {
     MemoryOffset p2p_memory_offset(RT_MEMORY_P2P_DDR, mem_assigner->GetP2PMemOffset());
     memory_offset_.emplace(RT_MEMORY_P2P_DDR, p2p_memory_offset);
   }
@@ -137,7 +135,7 @@ ge::Status GraphMemoryAssigner::AssignVarAttr2Nodes() {
   return ge::SUCCESS;
 }
 
-ge::Status GraphMemoryAssigner::CalculateTensorRealSizeAndOutSize(const ge::ConstGeTensorDescPtr &output_desc,
+ge::Status CalculateTensorRealSizeAndOutSize(const ge::ConstGeTensorDescPtr &output_desc,
                                                                   int64_t dim_index, int64_t &output_mem_size,
                                                                   int64_t &batch_dim_num, int64_t &out_size) {
   graphStatus graph_status = ge::TensorUtils::GetSize(*output_desc, out_size);
@@ -182,68 +180,6 @@ ge::Status GraphMemoryAssigner::CalculateTensorRealSizeAndOutSize(const ge::Cons
   return SUCCESS;
 }
 
-Status GraphMemoryAssigner::GetMaxBatchLabel(const map<string, vector<NodePtr>> &mem_reuse_virtual_nodes_map,
-                                             int32_t mem_reuse_model, string &max_batch_label) {
-  for (auto &i_map : mem_reuse_virtual_nodes_map) {
-    vector<NodePtr> virtual_nodes_list = i_map.second;
-    vector<int64_t> max_shape_dims;
-    size_t max_batch_dim = 0;
-    bool max_batch_dim_find = false;
-    for (size_t i = 0; i < virtual_nodes_list.size(); ++i) {
-      GE_CHECK_NOTNULL(virtual_nodes_list[i]);
-      OpDescPtr op_desc = virtual_nodes_list[i]->GetOpDesc();
-      GE_CHECK_NOTNULL(op_desc);
-
-      ge::ConstGeTensorDescPtr input_output_desc;
-      if (mem_reuse_model == kVirtualInputNodeMemoryReuse) {
-        input_output_desc = op_desc->GetOutputDescPtr(kVirtualNodeDataIndex);
-      } else if (mem_reuse_model == kVirtualOutputNodeMemoryReuse) {
-        input_output_desc = op_desc->GetInputDescPtr(kVirtualNodeDataIndex);
-      } else {
-        std::string error = "Invalid parameter memory reuse model, which is " + FmtToStr(mem_reuse_model);
-        GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-        return FAILED;
-      }
-      GE_CHECK_NOTNULL(input_output_desc);
-
-      if (i == 0) {
-        // All ops must have ATTR_NAME_BATCH_LABEL, no need to check return value.
-        (void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, max_batch_label);
-        max_shape_dims = input_output_desc->GetShape().GetDims();
-      } else {
-        vector<int64_t> current_shape_dims = input_output_desc->GetShape().GetDims();
-        if (current_shape_dims.size() != max_shape_dims.size()) {
-          std::string error = "The shape of several nodes between multiple batches does not match.";
-          GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-          return FAILED;
-        }
-        for (size_t j = 0; j < current_shape_dims.size(); ++j) {
-          if (current_shape_dims[j] == max_shape_dims[j]) {
-            continue;
-          }
-          if (max_batch_dim_find && max_batch_dim != j) {
-            std::string error = "The shape of several nodes between multiple batches does not match.";
-            GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-            return FAILED;
-          }
-          max_batch_dim_find = true;
-          max_batch_dim = j;
-          if (current_shape_dims[j] > max_shape_dims[j]) {
-            max_shape_dims[j] = current_shape_dims[j];
-            // All ops must have ATTR_NAME_BATCH_LABEL, no need to check return value.
-            (void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, max_batch_label);
-          }
-          // Only compare the first different dim in shape.
-          break;
-        }
-      }
-    }
-    // In every element of virtual_input_nodes_map, the label of the max batch node is the same.
-    break;
-  }
-  return SUCCESS;
-}
-
 Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, map<int64_t, size_t> &mem_type_to_offset) {
   if (memory_offset_.empty()) {
     GELOGE(FAILED, "memory_offset_ is empty.");
@@ -251,13 +187,6 @@ Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, map<int64_t, size
   }
 
   GE_CHK_STATUS_RET(ReAssignContinuousMemory(is_loop_graph), "ReAssignContinuousMemory Failed!");
-
-  GE_CHK_STATUS_RET(ReAssignReuseAndNoPaddingContinuousInputMemory(),
-                    "ReAssignReuseAndNoPaddingContinuousInputMemory Failed!");
-
-  GE_CHK_STATUS_RET(ReAssignReuseAndNoPaddingContinuousOutputMemory(),
-                    "ReAssignReuseAndNoPaddingContinuousOutputMemory Failed!");
-
   GE_CHK_STATUS_RET(ReAssignAtomicMemory(is_loop_graph), "ReAssignAtomicMemory Failed!");
 
   size_t total_mem_offset = 0;
@@ -274,6 +203,8 @@ Status GraphMemoryAssigner::ReAssignMemory(bool is_loop_graph, map<int64_t, size
       ErrorManager::GetInstance().ATCReportErrMessage("E19022", {"memType", "size", "item", "maxsize"},
         {std::to_string(iter.first), std::to_string(iter.second), "featuremap",
          std::to_string(VarManager::Instance(session_id)->GetGraphMemoryMaxSize())});
+      GEEVENT("[IMAS]AfterAssignMemory : %s memoffset[%zu], memtype[%ld]", compute_graph_->GetName().c_str(),
+              iter.second, iter.first);
     }
     return ge::FAILED;
   }
@@ -314,22 +245,137 @@ Status GraphMemoryAssigner::AssignZeroCopyMemory(map<int64_t, size_t> &mem_offse
   return SUCCESS;
 }
 
+uint32_t GetContinuousMemoryType(const OpDescPtr &op_desc) {
+  if (op_desc == nullptr) {
+    return 0;
+  };
+
+  bool is_continuous = false;
+  uint32_t continuous_type = 0;
+  // If GetBool fail, is_continuous is false.
+  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_CONTINUOUS_INPUT, is_continuous);
+  if (is_continuous) {
+    continuous_type |= kTypeInput;
+  } else {
+    (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, is_continuous);
+    if (is_continuous) {
+      bool attr_reuse = false;
+      (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse);
+      if (attr_reuse) {
+        continuous_type |= kTypeInputNoPadding;
+      }
+    }
+  }
+
+  is_continuous = false;
+  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_CONTINUOUS_OUTPUT, is_continuous);
+  if (is_continuous) {
+    continuous_type |= kTypeOutput;
+  } else {
+    (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOPADDING_CONTINUOUS_OUTPUT, is_continuous);
+    if (is_continuous) {
+      bool attr_reuse = false;
+      (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse);
+      if (attr_reuse) {
+        continuous_type |= kTypeOutputNoPadding;
+      }
+    }
+  }
+
+  if (continuous_type != 0) {
+    GELOGI("Current node %s continuous type %d.", op_desc->GetName().c_str(), continuous_type);
+  }
+  return continuous_type;
+}
+
+Status GetMemorySize(const OpDescPtr &op_desc, const ge::ConstGeTensorDescPtr &output_desc, uint32_t continuous_type,
+                     int64_t &tensor_size, int64_t &nopadding_size) {
+  if ((op_desc == nullptr) || (output_desc == nullptr)) {
+    GELOGE(FAILED, "Input para is nullptr.");
+    return FAILED;
+  }
+  tensor_size = 0;
+  nopadding_size = 0;
+  bool is_nopadding = ((continuous_type & kTypeInputNoPadding) != 0) || ((continuous_type & kTypeOutputNoPadding) != 0);
+  if (is_nopadding) {
+    int64_t attr_dim_index;
+    bool get_attr_dim_flag = ge::AttrUtils::GetInt(op_desc, ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX, attr_dim_index);
+    if (!get_attr_dim_flag) {
+      GELOGE(FAILED, "Get attr _reuse_input_on_dim_index failed.");
+      return FAILED;
+    }
+
+    // Calculate tensor real size of each piece of data and out size of complete data
+    int64_t batch_dim_num = 1;
+    if (CalculateTensorRealSizeAndOutSize(output_desc, attr_dim_index, nopadding_size, batch_dim_num, tensor_size) !=
+        SUCCESS) {
+      GELOGE(FAILED, "CalculateTensorRealSizeAndOutSize failed for node %s.", op_desc->GetName().c_str());
+      return FAILED;
+    }
+  } else {
+    if (ge::TensorUtils::GetSize(*output_desc, tensor_size) != ge::SUCCESS) {
+      GELOGE(FAILED, "GetSize failed.");
+      return FAILED;
+    }
+  }
+  if ((tensor_size < 0) || (nopadding_size < 0)) {
+    GELOGE(FAILED, "GetMemorySize for node %s failed.", op_desc->GetName().c_str());
+    return FAILED;
+  }
+  return SUCCESS;
+}
+
+void AlignMemOffset(int64_t &mem_align_size) {
+  if (mem_align_size <= 0) {
+    return;
+  }
+  mem_align_size = (mem_align_size + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE;
+}
+
+bool IsContinuousInputConflict(const ge::NodePtr &node, const OpDescPtr &peer_op_desc) {
+  bool is_peer_output_continuous = false;
+  // If GetBool fail, is_peer_output_continuous is false.
+  (void) ge::AttrUtils::GetBool(peer_op_desc, ATTR_NAME_CONTINUOUS_OUTPUT, is_peer_output_continuous);
+
+  // Get peer node output size, if size == 1(peer node has only one output), continuous input of the node and
+  // continuous output of the previous node is the same, we can support it. If size != 1, there may be
+  // conflict between the two, we can not support it.
+  auto peer_output_size = peer_op_desc->GetOutputsSize();
+  GE_IF_BOOL_EXEC(is_peer_output_continuous && (peer_output_size != 1),
+                  std::string error = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) +
+                      " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) +
+                      " requires continuous output. There may be conflict between the two." +
+                      "This node is not supported now.";
+                  GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+                  return true;);
+
+  bool is_peer_reference = false;
+  // If GetBool fail, is_peer_reference is false.
+  (void) AttrUtils::GetBool(peer_op_desc, ATTR_NAME_REFERENCE, is_peer_reference);
+  GE_IF_BOOL_EXEC(is_peer_reference,
+                  std::string error = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) +
+                      " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) +
+                      " requires continuous output. There may be conflict between the two." +
+                      "This node is not supported now.";
+                  GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+                  return true;);
+  return false;
+}
+
 Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) {
   Status ret;
   for (auto &node : compute_graph_->GetAllNodes()) {
-    // Get the continuous input type of the node, default is false
-    bool is_input_continuous = false;
-    GE_CHECK_NOTNULL(node->GetOpDesc());
-    // If GetBool fail, is_input_continuous is false.
-    (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous);
+    GE_CHECK_NOTNULL(node);
+    auto continuous_type = GetContinuousMemoryType(node->GetOpDesc());
 
     // Assign continuous input memory
-    if (is_input_continuous) {
-      int64_t memory_type = RT_MEMORY_HBM;
-      GE_CHK_STATUS_RET(GetNodeMemoryType(node, memory_type, "input"), "Get node memory type failed.");
+    bool continuous_input = ((continuous_type & kTypeInput) != 0) || ((continuous_type & kTypeInputNoPadding) != 0);
+    int64_t memory_type = RT_MEMORY_HBM;
+    GE_CHK_STATUS_RET(GetNodeMemoryType(node, memory_type, "input"), "Get node memory type failed.");
+    if (continuous_input) {
       int64_t mem_clean_start = 0;
       int64_t mem_clean_size = 0;
-      ret = AssignContinuousInputMemory(node, mem_clean_start, mem_clean_size, memory_type);
+      ret = AssignContinuousInputMemory(node, mem_clean_start, mem_clean_size, memory_type, continuous_type);
       if (ret != ge::SUCCESS) {
         GELOGE(ret, "Assign continuous input memory failed!");
         return ret;
@@ -339,7 +385,6 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) {
       vector<int32_t> input_indexes;
       // If GetListInt fail, input_indexes is empty.
       (void) ge::AttrUtils::GetListInt(node->GetOpDesc(), ATOMIC_ATTR_INPUT_INDEX, input_indexes);
-
       if (!input_indexes.empty() && input_indexes[0] == kAllInputAddrIsAtomic) {
         // check whether there is an atomic conflict between the current node and the peer out node
         if (!CheckInputIsSupportAtomic(node)) {
@@ -351,9 +396,10 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) {
         const auto &in_control_anchor = node->GetInControlAnchor();
         GE_CHECK_NOTNULL(in_control_anchor);
         for (const auto &peer_out_control_anchor : in_control_anchor->GetPeerOutControlAnchors()) {
+          GE_CHECK_NOTNULL(peer_out_control_anchor);
           auto peer_out_node = peer_out_control_anchor->GetOwnerNode();
           if (peer_out_node->GetType() == ATOMICADDRCLEAN) {
-            ret = SetAtomicCleanAttr(peer_out_node, {mem_clean_start}, {mem_clean_size});
+            ret = SetAtomicCleanAttr(peer_out_node, {mem_clean_start}, {mem_clean_size}, memory_type);
             if (ret != SUCCESS) {
               GELOGE(ret, "Failed to set attr for atomic addr clean node %s.", peer_out_node->GetName().c_str());
               return ret;
@@ -363,23 +409,12 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) {
       }
     }
 
-    // Get the reference type of the node, default is false
-    bool is_ref = false;
-    // If GetBool fail, is_ref is false.
-    (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_REFERENCE, is_ref);
-
-    // Get the continuous output type of the node, default is false
-    bool is_output_continuous = false;
-    // If GetBool fail, is_output_continuous is false.
-    (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_CONTINUOUS_OUTPUT, is_output_continuous);
-
-    // If the output is ref type and refers to the ref of an input, the name of the output
-    // and the input are the same. Ge encounters ref type, finds matching relationship according
-    // to the names of input and output, and allocates the same memory address, eg: HCOMBroadcast
-    if (!is_ref && is_output_continuous) {  // Assign continuous output memory
-      ret = AssignContinuousOutputMemory(node);
+    // Assign continuous output memory
+    bool continuous_output = ((continuous_type & kTypeOutput) != 0) || ((continuous_type & kTypeOutputNoPadding) != 0);
+    if (continuous_output) {
+      ret = AssignContinuousOutputMemory(node, memory_type, continuous_type);
       if (ret != ge::SUCCESS) {
-        GELOGE(ret, "Assign reference memory failed!");
+        GELOGE(ret, "Assign continuous output memory failed!");
         return ret;
       }
     }
@@ -392,522 +427,187 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) {
 }
 
 Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node, int64_t &continuous_mem_start,
-                                                        int64_t &continuous_mem_size, int64_t memory_type) {
+    int64_t &continuous_mem_size, int64_t memory_type, uint32_t continuous_type) {
   GELOGI("Current node %s needs continuous input.", node->GetName().c_str());
-  bool continuous_input_alloc = false;
-  (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_CONTINUOUS_INPUT_ALLOC, continuous_input_alloc);
   auto iter = memory_offset_.find(memory_type);
   if (iter == memory_offset_.end()) {
     std::string error = "Memory offset does not have memory type" + FmtToStr(memory_type);
     GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
     return FAILED;
   }
+  // The head and tail of hcom continuous input should be added 512
+  iter->second.mem_offset_ += MEM_ALIGN_SIZE;
+  continuous_mem_start = iter->second.mem_offset_;
+  int64_t mem_offset = iter->second.mem_offset_;
+  int64_t extra_memory_size = 0;
+  bool is_continuous_input_allocated = false;
+  (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_CONTINUOUS_INPUT_ALLOC, is_continuous_input_allocated);
   for (auto &in_data_anchor : node->GetAllInDataAnchors()) {
+    GE_IF_BOOL_EXEC(in_data_anchor == nullptr, continue);
     auto peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor();
     GE_IF_BOOL_EXEC(peer_out_data_anchor == nullptr, continue);
-
     auto peer_op_desc = peer_out_data_anchor->GetOwnerNode()->GetOpDesc();
     GE_IF_BOOL_EXEC(peer_op_desc == nullptr, continue);
-    bool is_peer_output_continuous = false;
-    // If GetBool fail, is_peer_output_continuous is false.
-    (void) ge::AttrUtils::GetBool(peer_op_desc, ATTR_NAME_CONTINUOUS_OUTPUT, is_peer_output_continuous);
-
-    // Get peer node output size, if size == 1(peer node has only one output), continuous input of the node and
-    // continuous output of the previous node is the same, we can support it. If size != 1, there may be
-    // conflict between the two, we can not support it.
-    auto peer_output_size = peer_op_desc->GetOutputsSize();
-    GE_IF_BOOL_EXEC(is_peer_output_continuous && (peer_output_size != 1),
-                    std::string error = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) +
-                        " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) +
-                        " requires continuous output. There may be conflict between the two. This node is not supported now.";
-                    GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-                    return PARAM_INVALID;);
-
-    bool is_peer_reference = false;
-    // If GetBool fail, is_peer_reference is false.
-    (void) AttrUtils::GetBool(peer_op_desc, ATTR_NAME_REFERENCE, is_peer_reference);
-    GE_IF_BOOL_EXEC(is_peer_reference,
-                    std::string error = "Current op" + FmtToStr(node->GetOpDesc()->GetName()) +
-                        " requires continuous input, while the previous op" + FmtToStr(peer_op_desc->GetName()) +
-                        " requires continuous output. There may be conflict between the two. This node is not supported now.";
-                    GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-                    return PARAM_INVALID;);
-
-    vector<int64_t> output_list = peer_op_desc->GetOutputOffset();
-    std::vector<int64_t> offsets_for_fusion = {};
-    bool has_offset_attr =
-        AttrUtils::GetListInt(peer_op_desc, ATTR_NAME_OUTPUT_OFFSET_FOR_BUFFER_FUSION, offsets_for_fusion);
-    if (peer_out_data_anchor->GetIdx() < static_cast<int>(output_list.size())) {
-      if (continuous_input_alloc && !has_offset_attr) {
-        if (in_data_anchor->GetIdx() == 0) {
-          continuous_mem_start = output_list.at(peer_out_data_anchor->GetIdx());
-        }
-        // can not use else if, incase only one input
-        if (in_data_anchor->GetIdx() == static_cast<int>(node->GetAllInDataAnchors().size()) - 1) {
-          int64_t tensor_desc_size = 0;
-          Status ret = ge::TensorUtils::GetSize(*(peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx())),
-                                                tensor_desc_size);
-          GE_IF_BOOL_EXEC(ret != ge::SUCCESS, GELOGE(FAILED, "GetSize failed."); return FAILED;);
-
-          tensor_desc_size = (tensor_desc_size + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE;
-          continuous_mem_size =
-              output_list.at(peer_out_data_anchor->GetIdx()) - continuous_mem_start + tensor_desc_size + MEM_ALIGN_SIZE;
-        }
-        GELOGI(
-            "[IMAS]Check Continuous input : Set %s name[%s] output[%d] offset to [%ld] stream_id[%ld] size[%u] "
-            "real_size[%u].",
-            node->GetOwnerComputeGraph()->GetName().c_str(), peer_op_desc->GetName().c_str(),
-            peer_out_data_anchor->GetIdx(), output_list.at(peer_out_data_anchor->GetIdx()), peer_op_desc->GetStreamId(),
-            0, 0);
-        continue;
-      }
-
-      output_list.at(peer_out_data_anchor->GetIdx()) = iter->second.mem_offset_;
-    } else {
-      std::string error = "index" + FmtToStr(peer_out_data_anchor->GetIdx()) + " is out of range.";
-      GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-      GELOGE(FAILED, "index : %d is out of range.", peer_out_data_anchor->GetIdx());
-      return FAILED;
-    }
-    peer_op_desc->SetOutputOffset(output_list);
-    size_t pre_mem_offset = iter->second.mem_offset_;
+    GE_IF_BOOL_EXEC(IsContinuousInputConflict(node, peer_op_desc), return PARAM_INVALID;);
 
     int64_t tensor_desc_size = 0;
-    if (has_offset_attr) {
-      if (peer_out_data_anchor->GetIdx() < static_cast<int>(offsets_for_fusion.size())) {
-        auto offset_for_fusion = offsets_for_fusion[peer_out_data_anchor->GetIdx()];
-        iter->second.mem_offset_ += offset_for_fusion;
-      } else {
+    int64_t nopadding_size = 0;
+    int64_t real_size = 0;
+    std::vector<int64_t> offsets_of_fusion = {};
+    bool lx_fusion = AttrUtils::GetListInt(peer_op_desc, ATTR_NAME_OUTPUT_OFFSET_FOR_BUFFER_FUSION, offsets_of_fusion);
+    lx_fusion = lx_fusion && !offsets_of_fusion.empty();
+    if (lx_fusion) {
+      if (peer_out_data_anchor->GetIdx() >= static_cast<int>(offsets_of_fusion.size())) {
         std::string error = "fusion: peer node" + FmtToStr(peer_op_desc->GetName()) +
             " index" + FmtToStr(peer_out_data_anchor->GetIdx()) + " is out of range.";
         GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
         return FAILED;
       }
+      nopadding_size = offsets_of_fusion[peer_out_data_anchor->GetIdx()];
+      tensor_desc_size = nopadding_size;
     } else {
-      Status ret =
-          TensorUtils::GetSize(*(peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx())), tensor_desc_size);
-      GE_IF_BOOL_EXEC(ret != ge::SUCCESS, GELOGE(FAILED, "GetSize failed."); return FAILED;);
-
-      iter->second.mem_offset_ += tensor_desc_size;
-    }
-
-    // If set tensor_actual_size, Memory alignment is not required.
-    int32_t is_tensor_actual_size = 0;
-    ge::AttrUtils::GetInt(peer_op_desc, ATTR_NAME_GET_TENSOR_ACTUAL_SIZE, is_tensor_actual_size);
-    if (is_tensor_actual_size == 0) {
-      AlignMemOffset(MEM_ALIGN_SIZE, memory_type);
+      if (GetMemorySize(node->GetOpDesc(), peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx()),
+                        continuous_type, tensor_desc_size, nopadding_size) != ge::SUCCESS) {
+        return FAILED;
+      }
     }
-    GELOGI(
-        "[IMAS]Continuous input : Set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%zu] "
-        "real_size[%ld].", node->GetOwnerComputeGraph()->GetName().c_str(), peer_op_desc->GetName().c_str(),
-        peer_out_data_anchor->GetIdx(), pre_mem_offset, peer_op_desc->GetStreamId(),
-        (iter->second.mem_offset_ - pre_mem_offset), tensor_desc_size);
-  }
 
-  iter->second.mem_offset_ += MEM_ALIGN_SIZE;
-  if (!continuous_input_alloc) {
-    continuous_mem_size = iter->second.mem_offset_ - continuous_mem_start;
-  }
-  return SUCCESS;
-}
-
-Status GraphMemoryAssigner::AssignContinuousOutputMemory(const ge::NodePtr &node) {
-  GELOGI("Current node %s needs continuous output.", node->GetName().c_str());
-  auto out_op_desc = node->GetOpDesc();
-  GE_IF_BOOL_EXEC(out_op_desc == nullptr, GELOGE(ge::FAILED, "out_op_desc is null."); return ge::FAILED);
-  vector<int64_t> output_list = out_op_desc->GetOutputOffset();
-
-  if ((out_op_desc->GetOutputsSize() > output_list.size()) || (output_list.size() == 0)) {
-    GELOGE(ge::FAILED, "The size %zu of node output desc is more than output_list's size %zu.",
-           out_op_desc->GetOutputsSize(), output_list.size());
-    return ge::FAILED;
-  }
-
-  size_t mem_offset = output_list[0];
-  for (auto &out_data_anchor : node->GetAllOutDataAnchors()) {
-    output_list[out_data_anchor->GetIdx()] = mem_offset;
-    int64_t tensor_desc_size = 0;
-    if (ge::TensorUtils::GetSize(*(out_op_desc->GetOutputDescPtr(out_data_anchor->GetIdx())), tensor_desc_size) !=
-        ge::SUCCESS) {
-      GELOGE(FAILED, "GetSize failed.");
-      return FAILED;
-    }
-    mem_offset += tensor_desc_size;
-    if (mem_offset <= 0) {
+    bool is_nopadding = ((continuous_type & kTypeInputNoPadding) != 0) || lx_fusion;
+    vector<int64_t> output_list = peer_op_desc->GetOutputOffset();
+    if (peer_out_data_anchor->GetIdx() >= static_cast<int>(output_list.size())) {
+      std::string error = "index" + FmtToStr(peer_out_data_anchor->GetIdx()) + " is out of range.";
+      GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
       return FAILED;
     }
-    mem_offset = (mem_offset + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE;
-    GELOGI(
-        "[IMAS]Continuous output : Set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%ld] "
-        "real_size[%ld].",
-        node->GetOwnerComputeGraph()->GetName().c_str(), out_op_desc->GetName().c_str(), out_data_anchor->GetIdx(),
-        output_list[out_data_anchor->GetIdx()], out_op_desc->GetStreamId(), tensor_desc_size, tensor_desc_size);
-  }
-  out_op_desc->SetOutputOffset(output_list);
-  return ge::SUCCESS;
-}
-
-Status GraphMemoryAssigner::ReAssignVirtualInputNodeMemory(NodePtr node, size_t &mem_offset_reuse) {
-  OpDescPtr op_desc = node->GetOpDesc();
-  vector<int64_t> output_list = op_desc->GetOutputOffset();
-  if (output_list.empty()) {
-    GELOGE(FAILED, "Outputoffset is empty node name:%s", node->GetName().c_str());
-    return FAILED;
-  }
-  output_list.at(0) = mem_offset_reuse;
-  op_desc->SetOutputOffset(output_list);
-  GELOGI("Set virtual input node %s output offset to %zu.", op_desc->GetName().c_str(), mem_offset_reuse);
 
-  int64_t attr_dim_index;
-  bool get_attr_dim_flag = ge::AttrUtils::GetInt(op_desc, ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX, attr_dim_index);
-  if (!get_attr_dim_flag) {
-    GELOGE(FAILED, "Get attr _reuse_input_on_dim_index failed.");
-    return FAILED;
-  }
-
-  size_t extra_memory_size = 0;
-  for (const auto &in_data_anchor : node->GetAllInDataAnchors()) {
-    auto peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor();
-    GE_CHECK_NOTNULL(peer_out_data_anchor);
-    auto peer_op_desc = peer_out_data_anchor->GetOwnerNode()->GetOpDesc();
-    GE_CHECK_NOTNULL(peer_op_desc);
-    vector<int64_t> output_offsets = peer_op_desc->GetOutputOffset();
-    if (peer_out_data_anchor->GetIdx() >= static_cast<int>(output_offsets.size())) {
-      GELOGE(ge::FAILED, "Index : %d is out of range.", peer_out_data_anchor->GetIdx());
-      return ge::FAILED;
+    // when continuous input has been allocated first input is beginning offset
+    bool is_allocated_first_input = is_continuous_input_allocated && (in_data_anchor->GetIdx() == 0);
+    if (is_allocated_first_input) {
+      mem_offset = output_list.at(peer_out_data_anchor->GetIdx());
+      continuous_mem_start = output_list.at(peer_out_data_anchor->GetIdx());
+    } else {
+      // set offset for input
+      output_list.at(peer_out_data_anchor->GetIdx()) = mem_offset;
+      peer_op_desc->SetOutputOffset(output_list);
     }
-    output_offsets.at(peer_out_data_anchor->GetIdx()) = mem_offset_reuse;
-    peer_op_desc->SetOutputOffset(output_offsets);
-    size_t pre_mem_offset = mem_offset_reuse;
 
-    // Calculate tensor real size of each piece of data and out size of complete data
-    ge::ConstGeTensorDescPtr output_desc = peer_op_desc->GetOutputDescPtr(peer_out_data_anchor->GetIdx());
-    GE_CHECK_NOTNULL(output_desc);
-    int64_t output_mem_size;
-    int64_t batch_dim_num = 1;
-    int64_t out_size;
-    if (CalculateTensorRealSizeAndOutSize(output_desc, attr_dim_index, output_mem_size, batch_dim_num, out_size) !=
-        SUCCESS) {
-      GELOGE(FAILED, "CalculateTensorRealSizeAndOutSize failed for node %s output [%d].",
-             peer_op_desc->GetName().c_str(), peer_out_data_anchor->GetIdx());
-      return FAILED;
+    int64_t align_size = tensor_desc_size;
+    if (is_nopadding) {
+      mem_offset += nopadding_size;
+      extra_memory_size += (tensor_desc_size - nopadding_size);
+      real_size = nopadding_size;
+    } else {
+      ge::AlignMemOffset(align_size);
+      mem_offset += align_size;
+      // The head and tail of hcom continuous input should be added 512
+      extra_memory_size = MEM_ALIGN_SIZE;
+      real_size = tensor_desc_size;
     }
 
-    mem_offset_reuse += output_mem_size;
-    extra_memory_size = extra_memory_size + out_size - output_mem_size;
-
-    GELOGI("[IMAS]Virtual node optimize: set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%ld] "
-           "real_size[%ld].",
-           node->GetOwnerComputeGraph()->GetName().c_str(), peer_op_desc->GetName().c_str(),
-           peer_out_data_anchor->GetIdx(), pre_mem_offset, peer_op_desc->GetStreamId(), out_size,
-           output_mem_size);
-  }
-  mem_offset_reuse += extra_memory_size;
-  size_t after_mem_offset = mem_offset_reuse;
-  GELOGI("After reassign virtual input node[name: %s, type: %s] memory, memory offset = %zu.",
-         op_desc->GetName().c_str(), op_desc->GetType().c_str(), after_mem_offset);
-  return SUCCESS;
-}
-
-Status GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousInputMemory() {
-  map<string, vector<NodePtr>> mem_reuse_virtual_input_nodes_map;
-  int64_t memory_type = RT_MEMORY_HBM;
-  for (const auto &n : compute_graph_->GetAllNodes()) {
-    OpDescPtr op_desc = n->GetOpDesc();
-    GE_CHECK_NOTNULL(op_desc);
-    bool attr_continuous = false;
-    bool get_continuous_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOPADDING_CONTINUOUS_INPUT, attr_continuous);
-    GE_IF_BOOL_EXEC(!get_continuous_flag, continue);
-    bool attr_reuse = false;
-    bool get_reuse_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse);
-    GE_IF_BOOL_EXEC(!get_reuse_flag, continue);
-    if (attr_reuse && attr_continuous) {
-      if (op_desc->GetOutputsSize() != kVirtualInputNodeOutputSize) {
-        // When current virtual node has several outputs, can't directly determine which input is the tensor for reuse.
-        std::string error = "Only one output is supported, current virtual node" + FmtToStr(n->GetName()) +
-            " has " + FmtToStr(op_desc->GetOutputsSize()) + " outputs.";
-        GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-        return FAILED;
-      }
-      GE_CHK_STATUS_RET(GetNodeMemoryType(n, memory_type, "input"), "Get node memory type failed.");
-      auto iter = memory_offset_.find(memory_type);
-      if (iter == memory_offset_.end()) {
-        std::string error = "Memory offset does not have memory type" + FmtToStr(memory_type);
-        GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-        return FAILED;
-      }
-      GELOGD("Start to reassign memory for virtual input node, memory offset = %zu, memory type = %ld.",
-             iter->second.mem_offset_, memory_type);
-      string batch_label_string;
-      // Not all ops have ATTR_NAME_BATCH_LABEL, no need to check return value, only check out parameter
-      (void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label_string);
-      if (batch_label_string.empty()) {
-        size_t node_mem_offset = iter->second.mem_offset_;
-        // No ATTR_NAME_BATCH_LABEL, no need to reuse memory.
-        Status status = ReAssignVirtualInputNodeMemory(n, node_mem_offset);
-        if (status != SUCCESS) {
-          GELOGE(FAILED, "Reassign memory of virtual input node failed, node name: %s.", n->GetName().c_str());
-          return FAILED;
-        }
-
-        iter->second.mem_offset_ = node_mem_offset;
-        AlignMemOffset(MEM_ALIGN_SIZE, memory_type);
-        GELOGD("After reassign memory for virtual input node, align memory = %zu, memory type = %ld.",
-               iter->second.mem_offset_, memory_type);
-      } else {
-        // Has ATTR_NAME_BATCH_LABEL, for dynamic multi-batch node, need to reuse memory.
-        string current_node_full_name = op_desc->GetName();
-        size_t pos = current_node_full_name.find(kMbatchNodeNameFlag);
-        if (pos == string::npos) {
-          GELOGE(FAILED, "Cannot find key string [%s] of multi-batch in name of virtual input node, node name: %s.",
-                 kMbatchNodeNameFlag, n->GetName().c_str());
-          return FAILED;
-        }
-        string fixed_name = current_node_full_name.substr(0, pos);
-        vector<NodePtr> parallel_virtual_input_nodes;
-        if (mem_reuse_virtual_input_nodes_map.count(fixed_name) != 0) {
-          parallel_virtual_input_nodes = mem_reuse_virtual_input_nodes_map[fixed_name];
-        }
-        parallel_virtual_input_nodes.emplace_back(n);
-        mem_reuse_virtual_input_nodes_map[fixed_name] = parallel_virtual_input_nodes;
-      }
-    }
+    GELOGI("[IMAS]Continuous input : Set %s name[%s] optype[%s] output[%d] offset to [%zu] stream_id[%ld] memtype[%ld] "
+        "size[%zu] realsize[%ld] nopadding[%d].", node->GetOwnerComputeGraph()->GetName().c_str(),
+        node->GetType().c_str(), peer_op_desc->GetName().c_str(),peer_out_data_anchor->GetIdx(),
+        output_list.at(peer_out_data_anchor->GetIdx()), peer_op_desc->GetStreamId(), memory_type,
+        is_continuous_input_allocated ? 0UL : align_size, real_size, is_nopadding);
   }
 
-  int32_t mem_reuse_model = 0;
-  if (ReAssignVirtualNodesMemory(mem_reuse_virtual_input_nodes_map, mem_reuse_model) != SUCCESS) {
-    GELOGE(FAILED, "Reassign memory of virtual input nodes failed.");
-    return FAILED;
+  mem_offset += extra_memory_size;
+  ge::AlignMemOffset(mem_offset);
+  continuous_mem_size = mem_offset - continuous_mem_start;
+  if (is_continuous_input_allocated) {
+    // not allocate memory here, so no need add 512 in header
+    iter->second.mem_offset_ -= MEM_ALIGN_SIZE;
+  } else {
+    iter->second.mem_offset_ = mem_offset;
   }
   return SUCCESS;
 }
 
-Status GraphMemoryAssigner::ReAssignVirtualOutputNodeMemory(NodePtr node, size_t &mem_offset_reuse) {
-  OpDescPtr op_desc = node->GetOpDesc();
-
-  // 1. set memory of to be reused input tensor
+Status GetFirstInputPeerOutOutputOffset(const ge::NodePtr &node, int64_t &mem_offset) {
   auto in_data_anchor_list = node->GetAllInDataAnchors();
+  if (in_data_anchor_list.empty()) {
+    GELOGE(FAILED, "Node %s's in data anchor is empty.", node->GetName().c_str());
+    return FAILED;
+  }
   auto peer_out_data_anchor = in_data_anchor_list.at(0)->GetPeerOutAnchor();
-  GE_CHECK_NOTNULL(peer_out_data_anchor);
+  GE_IF_BOOL_EXEC(peer_out_data_anchor == nullptr, GELOGE(ge::FAILED, "peer_out_data_anchor is null.");
+                  return ge::FAILED);
   auto peer_op_desc = peer_out_data_anchor->GetOwnerNode()->GetOpDesc();
-  GE_CHECK_NOTNULL(peer_op_desc);
+  GE_IF_BOOL_EXEC(peer_op_desc == nullptr, GELOGE(ge::FAILED, "peer_op_desc is null."); return ge::FAILED);
   vector<int64_t> in_node_output_offsets = peer_op_desc->GetOutputOffset();
   if (peer_out_data_anchor->GetIdx() >= static_cast<int>(in_node_output_offsets.size())) {
     GELOGE(FAILED, "Index : %d is out of range.", peer_out_data_anchor->GetIdx());
     return FAILED;
   }
-  in_node_output_offsets.at(peer_out_data_anchor->GetIdx()) = mem_offset_reuse;
-  peer_op_desc->SetOutputOffset(in_node_output_offsets);
-  GELOGI("Set virtual output node %s input data offset to %zu.", op_desc->GetName().c_str(), mem_offset_reuse);
+  mem_offset = in_node_output_offsets.at(peer_out_data_anchor->GetIdx());
+  return SUCCESS;
+}
 
-  // 2. set memory of output tensor
-  vector<int64_t> output_list = op_desc->GetOutputOffset();
-  if (output_list.empty()) {
-    GELOGE(FAILED, "Outputoffset is empty, node name: %s", node->GetName().c_str());
-    return FAILED;
-  }
-  if (op_desc->GetOutputsSize() > output_list.size()) {
-    GELOGE(FAILED, "The size %zu of op_desc is more than output_list's size %zu.", op_desc->GetOutputsSize(),
-           output_list.size());
-    return FAILED;
-  }
-  int64_t attr_dim_index;
-  bool get_attr_dim_flag = ge::AttrUtils::GetInt(op_desc, ATTR_NAME_REUSE_INPUT_ON_DIM_INDEX, attr_dim_index);
-  if (!get_attr_dim_flag) {
-    GELOGE(FAILED, "Get attr _reuse_input_on_dim_index failed.");
-    return FAILED;
+Status GraphMemoryAssigner::AssignContinuousOutputMemory(const ge::NodePtr &node, int64_t memory_type,
+                                                         uint32_t continuous_type) {
+  GELOGI("Current node %s needs continuous output.", node->GetName().c_str());
+  auto out_op_desc = node->GetOpDesc();
+  GE_IF_BOOL_EXEC(out_op_desc == nullptr, GELOGE(ge::FAILED, "out_op_desc is null."); return ge::FAILED);
+  vector<int64_t> output_list = out_op_desc->GetOutputOffset();
+  if ((out_op_desc->GetOutputsSize() > output_list.size()) || (output_list.size() == 0)) {
+    GELOGE(ge::FAILED, "The size %zu of node output desc is more than output_list's size %zu.",
+           out_op_desc->GetOutputsSize(), output_list.size());
+    return ge::FAILED;
   }
 
-  size_t extra_memory_size = 0;
-  for (auto &out_data_anchor : node->GetAllOutDataAnchors()) {
-    output_list[out_data_anchor->GetIdx()] = mem_offset_reuse;
-    size_t pre_mem_offset = mem_offset_reuse;
-
-    // calculate tensor real size of each piece of data and out size of complete data
-    ge::ConstGeTensorDescPtr output_desc = op_desc->GetOutputDescPtr(out_data_anchor->GetIdx());
-    GE_CHECK_NOTNULL(output_desc);
-    int64_t output_mem_size;
-    int64_t batch_dim_num = 1;
-    int64_t out_size;
-    if (CalculateTensorRealSizeAndOutSize(output_desc, attr_dim_index, output_mem_size, batch_dim_num, out_size) !=
-        SUCCESS) {
-      GELOGE(FAILED, "CalculateTensorRealSizeAndOutSize failed for node %s output [%d].",
-             op_desc->GetName().c_str(), out_data_anchor->GetIdx());
-      return FAILED;
+  int64_t mem_offset = 0;
+  bool is_nopadding = ((continuous_type & kTypeOutputNoPadding) != 0);
+  if (is_nopadding) {
+    // out tensor memory must be reused input tensor memory
+    if (GetFirstInputPeerOutOutputOffset(node, mem_offset) != SUCCESS) {
+      return ge::FAILED;
     }
+  } else {
+    // Get the reference type of the node, default is false
+    bool is_ref = false;
+    // If GetBool fail, is_ref is false.
+    (void) ge::AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_REFERENCE, is_ref);
 
-    mem_offset_reuse += output_mem_size;
-    extra_memory_size = extra_memory_size + out_size - output_mem_size;
-
-    GELOGI("[IMAS]Virtual node optimize: set %s name[%s] output[%d] offset to [%zu], size[%ld], real_size[%ld].",
-           node->GetOwnerComputeGraph()->GetName().c_str(), op_desc->GetName().c_str(), out_data_anchor->GetIdx(),
-           pre_mem_offset, out_size, output_mem_size);
-  }
-  op_desc->SetOutputOffset(output_list);
-  mem_offset_reuse += extra_memory_size;
-  size_t after_mem_offset = mem_offset_reuse;
-  GELOGI("After reassign virtual output node[name: %s, type: %s] memory, memory offset = %zu.",
-         op_desc->GetName().c_str(), op_desc->GetType().c_str(), after_mem_offset);
-  return SUCCESS;
-}
-
-Status GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousOutputMemory() {
-  map<string, vector<NodePtr>> mem_reuse_virtual_output_nodes_map;
-  int64_t memory_type = RT_MEMORY_HBM;
-  for (const auto &n : compute_graph_->GetAllNodes()) {
-    OpDescPtr op_desc = n->GetOpDesc();
-    GE_CHECK_NOTNULL(op_desc);
-    bool attr_continuous = false;
-    bool get_continuous_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOPADDING_CONTINUOUS_OUTPUT, attr_continuous);
-    GE_IF_BOOL_EXEC(!get_continuous_flag, continue);
-    bool attr_reuse = false;
-    bool get_reuse_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_OUTPUT_REUSE_INPUT, attr_reuse);
-    GE_IF_BOOL_EXEC(!get_reuse_flag, continue);
-
-    if (attr_reuse && attr_continuous) {
-      auto in_data_anchor_list = n->GetAllInDataAnchors();
-      if (in_data_anchor_list.size() != kVirtualOutputNodeInputSize) {
-        // When current virtual node has several inputs, can't directly determine which input is the tensor for reuse.
-        std::string error = "Only one input is supported, current virtual node" + FmtToStr(n->GetName()) +
-            " has " + FmtToStr(in_data_anchor_list.size()) + " inputs.";
-        GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-        return FAILED;
-      }
-      GE_CHK_STATUS_RET(GetNodeMemoryType(n, memory_type, "output"), "Get node memory type failed.");
-      auto iter = memory_offset_.find(memory_type);
-      if (iter == memory_offset_.end()) {
-        std::string error = "Memory offset does not have memory type" + FmtToStr(RT_MEMORY_HBM);
-        GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-        return FAILED;
-      }
-      GELOGD("Start to reassign memory for virtual output node, memory offset = %zu, memory type = %ld.",
-             iter->second.mem_offset_, memory_type);
-      string batch_label_string;
-      // Not all ops have ATTR_NAME_BATCH_LABEL, no need to check return value, only check out parameter
-      (void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label_string);
-      if (batch_label_string.empty()) {
-        size_t node_mem_offset = iter->second.mem_offset_;
-        // No ATTR_NAME_BATCH_LABEL, no need to reuse memory.
-        Status status = ReAssignVirtualOutputNodeMemory(n, node_mem_offset);
-        if (status != SUCCESS) {
-          GELOGE(FAILED, "Reassign memory of virtual output node failed, node name: %s.", n->GetName().c_str());
-          return FAILED;
-        }
-        iter->second.mem_offset_ = node_mem_offset;
-        AlignMemOffset(MEM_ALIGN_SIZE, memory_type);
-        GELOGD("After reassign memory for virtual output node, align memory = %zu, memory type = %ld.",
-               iter->second.mem_offset_, memory_type);
-      } else {
-        // Has ATTR_NAME_BATCH_LABEL, for dynamic multi-batch node, need to reuse memory.
-        string current_node_full_name = op_desc->GetName();
-        size_t pos = current_node_full_name.find(kMbatchNodeNameFlag);
-        if (pos == string::npos) {
-          std::string error = "Cannot find key string" + FmtToStr(kMbatchNodeNameFlag) +
-          " of multi-batch in name of virtual output node, the node name is " + FmtToStr(n->GetName());
-          GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-          return FAILED;
-        }
-        string fixed_name = current_node_full_name.substr(0, pos);
-        vector<NodePtr> parallel_virtual_output_nodes;
-        if (mem_reuse_virtual_output_nodes_map.count(fixed_name) != 0) {
-          parallel_virtual_output_nodes = mem_reuse_virtual_output_nodes_map[fixed_name];
-        }
-        parallel_virtual_output_nodes.emplace_back(n);
-        mem_reuse_virtual_output_nodes_map[fixed_name] = parallel_virtual_output_nodes;
-      }
+    // If the output is ref type and refers to the ref of an input, the name of the output
+    // and the input are the same. Ge encounters ref type, finds matching relationship according
+    // to the names of input and output, and allocates the same memory address, eg: HCOMBroadcast
+    if (is_ref) {
+      GELOGI("Current node %s no needs assign continuous output because reference input by name.",
+             node->GetName().c_str());
+      return SUCCESS;
     }
+    mem_offset = output_list[0];
   }
 
-  int32_t mem_reuse_model = 1;
-  if (ReAssignVirtualNodesMemory(mem_reuse_virtual_output_nodes_map, mem_reuse_model) != SUCCESS) {
-    GELOGE(FAILED, "Reassign memory of virtual output nodes failed.");
-    return FAILED;
-  }
-  return SUCCESS;
-}
-
-Status GraphMemoryAssigner::ReAssignVirtualNodesMemory(map<string, vector<NodePtr>> &mem_reuse_nodes_map,
-                                                       int32_t mem_reuse_model) {
-  // Find max batch label value
-  string max_batch_label;
-  GE_CHK_STATUS_RET(GetMaxBatchLabel(mem_reuse_nodes_map, mem_reuse_model, max_batch_label),
-                    "Get max batch label failed.");
-  PrintMemoryOffset();
-  vector<size_t> nodes_mem_offset_list;
-  for (auto &i_map : mem_reuse_nodes_map) {
-    vector<NodePtr> virtual_nodes_list = i_map.second;
-    int64_t memory_type = RT_MEMORY_HBM;
-    GE_CHK_STATUS_RET(GetNodeListMemoryType(virtual_nodes_list, mem_reuse_model, memory_type),
-                      "Get node list memory type failed.");
-    auto iter = memory_offset_.find(memory_type);
-    if (iter == memory_offset_.end()) {
-      std::string error = "Memory offset does not have memory type" + FmtToStr(RT_MEMORY_HBM);
-      GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
+  for (auto &out_data_anchor : node->GetAllOutDataAnchors()) {
+    output_list[out_data_anchor->GetIdx()] = mem_offset;
+    int64_t tensor_desc_size = 0;
+    int64_t nopadding_size = 0;
+    if (GetMemorySize(out_op_desc, out_op_desc->GetOutputDescPtr(out_data_anchor->GetIdx()), continuous_type,
+                      tensor_desc_size, nopadding_size) != ge::SUCCESS) {
       return FAILED;
     }
-    size_t max_batch_node_mem_offset = iter->second.mem_offset_;
-    nodes_mem_offset_list.emplace_back(max_batch_node_mem_offset);
-    for (auto &i_node : virtual_nodes_list) {
-      // Op_desc is not nullptr, it has been checked.
-      OpDescPtr op_desc = i_node->GetOpDesc();
-      string batch_label_string;
-      // All ops must have ATTR_NAME_BATCH_LABEL, no need to check return value.
-      (void) ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label_string);
-      if (batch_label_string == max_batch_label) {
-        Status status = SUCCESS;
-        if (mem_reuse_model == kVirtualInputNodeMemoryReuse) {
-          status = ReAssignVirtualInputNodeMemory(i_node, max_batch_node_mem_offset);
-        } else if (mem_reuse_model == kVirtualOutputNodeMemoryReuse) {
-          status = ReAssignVirtualOutputNodeMemory(i_node, max_batch_node_mem_offset);
-        } else {
-          std::string error = "Invalid parameter memory reuse model, which is " + FmtToStr(mem_reuse_model);
-          GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-          return FAILED;
-        }
-
-        if (status != SUCCESS) {
-          GELOGE(FAILED, "Reassign memory of virtual node failed, node name: %s.", i_node->GetName().c_str());
-          return FAILED;
-        }
-        iter->second.mem_offset_ = max_batch_node_mem_offset;
-        AlignMemOffset(MEM_ALIGN_SIZE, memory_type);
-        GELOGD("After reassign memory for virtual node, align memory = %zu, memory type = %ld.",
-               iter->second.mem_offset_, memory_type);
-        // Only assign memory of max batch nodes.
-        break;
-      }
-    }
-  }
-  PrintMemoryOffset();
-  size_t memory_reuse_index = 0;
-  for (auto &i_map : mem_reuse_nodes_map) {
-    vector<NodePtr> virtual_nodes_list = i_map.second;
-    for (auto &i_node : virtual_nodes_list) {
-      size_t remaining_batch_node_mem_offset = nodes_mem_offset_list[memory_reuse_index];
-      Status status = SUCCESS;
-      if (mem_reuse_model == kVirtualInputNodeMemoryReuse) {
-        status = ReAssignVirtualInputNodeMemory(i_node, remaining_batch_node_mem_offset);
-      } else if (mem_reuse_model == kVirtualOutputNodeMemoryReuse) {
-        status = ReAssignVirtualOutputNodeMemory(i_node, remaining_batch_node_mem_offset);
-      } else {
-        std::string error = "Invalid parameter memory reuse model, which is " + FmtToStr(mem_reuse_model);
-        GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
-        return FAILED;
-      }
 
-      if (status != SUCCESS) {
-        GELOGE(FAILED, "Reassign memory of virtual node failed, node name: %s.", i_node->GetName().c_str());
-        return FAILED;
-      }
+    if (is_nopadding) {
+      mem_offset += nopadding_size;
+    } else {
+      mem_offset += tensor_desc_size;
+      ge::AlignMemOffset(mem_offset);
     }
-    memory_reuse_index++;
+    GELOGI("[IMAS]Continuous output : Set %s name[%s] optype[%s] output[%d] offset to [%zu] stream_id[%ld] memtype[%ld]"
+           " size[%zu] realsize[%ld] nopadding[%d].", node->GetOwnerComputeGraph()->GetName().c_str(),
+           node->GetType().c_str(), out_op_desc->GetName().c_str(), out_data_anchor->GetIdx(),
+           output_list[out_data_anchor->GetIdx()], out_op_desc->GetStreamId(), memory_type, 0UL,
+           is_nopadding ? nopadding_size : tensor_desc_size, is_nopadding);
   }
-  return SUCCESS;
+  out_op_desc->SetOutputOffset(output_list);
+  return ge::SUCCESS;
 }
 
 Status GraphMemoryAssigner::ReAssignAtomicMemory(bool is_loop_graph) {
-  map<NodePtr, vector<NodePtr>> normal_atomic_and_clean_nodes_map;
-  vector<NodePtr> connecting_output_atomic_nodes;
+  // key:dynamic batch, batch name
+  map<string, map<NodePtr, vector<NodePtr>>> normal_atomic_and_clean_nodes_map;
+  map<string, vector<NodePtr>> connecting_output_atomic_nodes;
   Status status = FilterAtomicNodesForMemoryAssign(normal_atomic_and_clean_nodes_map, connecting_output_atomic_nodes);
   if (status != SUCCESS) {
     GELOGE(status, "Failed to filter atomic nodes for memory assignment.");
@@ -917,45 +617,60 @@ Status GraphMemoryAssigner::ReAssignAtomicMemory(bool is_loop_graph) {
   auto mem_iter = memory_offset_.find(RT_MEMORY_HBM);
   if (mem_iter == memory_offset_.end()) {
     std::string error = "Memory offset does not have memory type" + FmtToStr(RT_MEMORY_HBM);
-    GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str()); 
+    GE_ERRORLOG_AND_ERRORMSG(FAILED, error.c_str());
     return FAILED;
   }
 
-  for (auto &iter : normal_atomic_and_clean_nodes_map) {
-    int64_t atomic_mem_start = static_cast<int64_t>(mem_iter->second.mem_offset_);
-    GELOGD("Begin to reAssign atomic memory, atomic address memory start = %ld", atomic_mem_start);
+  int64_t batch_atomic_mem_start = static_cast<int64_t>(mem_iter->second.mem_offset_);
+  int64_t batch_max_mem_offset = batch_atomic_mem_start;
+  for (auto &iter_batch : normal_atomic_and_clean_nodes_map) {
+    mem_iter->second.mem_offset_ = batch_atomic_mem_start;
+    for (auto &iter : iter_batch.second) {
+      int64_t atomic_mem_start = static_cast<int64_t>(mem_iter->second.mem_offset_);
+      GELOGD("Begin to reAssign atomic memory, atomic address memory start = %ld", atomic_mem_start);
 
-    for (auto &atomic_node : iter.second) {
-      vector<int64_t> mem_offset_end;
-      status = AssignAtomicOutputAndWorkspaceMemory(atomic_node, mem_offset_end);
-      if (status != SUCCESS) {
-        GELOGE(status, "Assign atomic output and workspace memory failed, node name is %s.",
-               atomic_node->GetName().c_str());
-        return status;
+      for (auto &atomic_node : iter.second) {
+        vector<int64_t> mem_offset_end;
+        status = AssignAtomicOutputAndWorkspaceMemory(atomic_node, mem_offset_end);
+        if (status != SUCCESS) {
+          GELOGE(status, "Assign atomic output and workspace memory failed, node name is %s.",
+                 atomic_node->GetName().c_str());
+          return status;
+        }
       }
-    }
 
-    int64_t atomic_mem_size = static_cast<int64_t>(mem_iter->second.mem_offset_) - atomic_mem_start;
-    if (atomic_mem_size != 0) {
-      GE_CHK_STATUS_RET(SetAtomicCleanAttr(iter.first, {atomic_mem_start}, {atomic_mem_size}),
-                        "Failed to set attr for atomic addr clean node %s.", iter.first->GetName().c_str());
+      int64_t atomic_mem_size = static_cast<int64_t>(mem_iter->second.mem_offset_) - atomic_mem_start;
+      GE_CHECK_NOTNULL(mem_assigner_);
+      GE_CHECK_NOTNULL(mem_assigner_->GetPriorityAssinger());
+      if ((atomic_mem_size != 0) && (iter_batch.first == mem_assigner_->GetPriorityAssinger()->GetMaxBatchLabel())) {
+        GE_CHK_STATUS_RET(SetAtomicCleanAttr(iter.first, {atomic_mem_start}, {atomic_mem_size}, RT_MEMORY_HBM),
+                          "Failed to set attr for atomic addr clean node %s.", iter.first->GetName().c_str());
+      }
     }
+    batch_max_mem_offset = std::max(batch_max_mem_offset, static_cast<int64_t>(mem_iter->second.mem_offset_));
   }
 
-  if (AssignConnectNetOutputAtomicMemory(connecting_output_atomic_nodes) != SUCCESS) {
-    GELOGE(FAILED, "Failed to assign memory of nodes that connect to netoutput.");
-    return FAILED;
+  mem_iter->second.mem_offset_ = static_cast<size_t>(batch_max_mem_offset);
+  batch_atomic_mem_start = batch_max_mem_offset;
+  for (auto &iter_batch : connecting_output_atomic_nodes) {
+    mem_iter->second.mem_offset_ = batch_atomic_mem_start;
+    if (AssignConnectNetOutputAtomicMemory(iter_batch.second) != SUCCESS) {
+      GELOGE(FAILED, "Failed to assign memory of nodes that connect to netoutput.");
+      return FAILED;
+    }
+    batch_max_mem_offset = std::max(batch_max_mem_offset, static_cast<int64_t>(mem_iter->second.mem_offset_));
   }
-
+  mem_iter->second.mem_offset_ = static_cast<size_t>(batch_max_mem_offset);
   return SUCCESS;
 }
 
-Status GraphMemoryAssigner::FilterAtomicNodesForMemoryAssign(map<NodePtr, vector<NodePtr>> &normal_atomic_nodes_map,
-                                                             vector<NodePtr> &connecting_output_atomic_nodes) {
+Status GraphMemoryAssigner::FilterAtomicNodesForMemoryAssign(
+    map<string, map<NodePtr, vector<NodePtr>>> &normal_atomic_nodes_map,
+    map<string, vector<NodePtr>> &connecting_output_atomic_nodes) {
   GE_CHECK_NOTNULL(compute_graph_);
   for (const auto &node : compute_graph_->GetAllNodes()) {
     if (node->GetType() == ATOMICADDRCLEAN) {
-      vector<NodePtr> tmp_normal_atomic_nodes;
+      map<string, vector<NodePtr>> tmp_normal_atomic_nodes;
       const auto &out_control_anchor = node->GetOutControlAnchor();
       GE_CHECK_NOTNULL(out_control_anchor);
       for (const auto &peer_in_control_anchor : out_control_anchor->GetPeerInControlAnchors()) {
@@ -977,23 +692,28 @@ Status GraphMemoryAssigner::FilterAtomicNodesForMemoryAssign(map<NodePtr, vector
                 return ge::PARAM_INVALID;
               }
 
+              std::string batch_label;
+              (void)ge::AttrUtils::GetStr(peer_in_node_desc, ATTR_NAME_BATCH_LABEL, batch_label);
+
               vector<int> is_connecting_output;
               // If GetBool fail, attr is_connecting_output is an empty vector.
               (void) ge::AttrUtils::GetListInt(peer_in_node_desc, ATTR_NAME_NODE_CONNECT_OUTPUT, is_connecting_output);
               if (is_connecting_output.empty()) {
-                tmp_normal_atomic_nodes.emplace_back(peer_in_node);
+                tmp_normal_atomic_nodes[batch_label].emplace_back(peer_in_node);
                 continue;
               }
-              connecting_output_atomic_nodes.emplace_back(peer_in_node);
-              tmp_normal_atomic_nodes.clear();
+              connecting_output_atomic_nodes[batch_label].emplace_back(peer_in_node);
+              tmp_normal_atomic_nodes[batch_label].clear();
               break;
             }
           }
         }
       }
 
-      if (!tmp_normal_atomic_nodes.empty()) {
-        normal_atomic_nodes_map[node] = tmp_normal_atomic_nodes;
+      for (auto &it_atomic_node : tmp_normal_atomic_nodes) {
+        if (!it_atomic_node.second.empty()) {
+          normal_atomic_nodes_map[it_atomic_node.first][node] = it_atomic_node.second;
+        }
       }
     }
   }
@@ -1061,7 +781,7 @@ Status GraphMemoryAssigner::AssignConnectNetOutputAtomicMemory(vector<NodePtr> &
     }
 
     // All atomic nodes use atomic_addr_clean op independently, so we need to set the attr separately.
-    if (SetIndependentAtomicAttr(node, original_atomic_mem_start, mem_offset_end) != SUCCESS) {
+    if (SetIndependentAtomicAttr(node, original_atomic_mem_start, mem_offset_end, RT_MEMORY_HBM) != SUCCESS) {
       GELOGE(FAILED, "Failed to set atomic attr separately.");
       return FAILED;
     }
@@ -1206,9 +926,12 @@ Status GraphMemoryAssigner::AssignAtomicOutputMemory(const ge::NodePtr &node, ve
     }
 
     output_list[output_index] = iter->second.mem_offset_;
-    GELOGI("[IMAS]Atomic output : Set %s name[%s] output[%ld] offset to [%zu] stream_id[%ld] size[%ld] real_size[%ld].",
-           compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), output_index,
-           iter->second.mem_offset_, op_desc->GetStreamId(), size, size);
+    std::string batch_label;
+    (void)ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label);
+    GELOGI("[IMAS]Atomic output : Set %s name[%s] optype[%s] output[%ld] offset to [%zu] stream_id[%ld] memtype[%ld] "
+           "size[%ld] real_size[%ld] batch[%s].", compute_graph_->GetName().c_str(), op_desc->GetName().c_str(),
+           node->GetType().c_str(), output_index, iter->second.mem_offset_, op_desc->GetStreamId(), RT_MEMORY_HBM,
+           size, size, batch_label.c_str());
 
     iter->second.mem_offset_ += size;
     AlignMemOffset(MEM_ALIGN_SIZE, RT_MEMORY_HBM);
@@ -1281,11 +1004,14 @@ Status GraphMemoryAssigner::AssignOrdinaryAtomicWorkspaceMemory(const ge::OpDesc
       }
 
       workspace_vector[workspace_index] = mem_type_iter->second.mem_offset_;
+      std::string batch_label;
+      (void)ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label);
       GELOGI(
-          "[IMAS]Atomic ordinary workspace : Set %s name[%s] workspace[%lu] offset to [%zu] stream_id[%ld] "
-          "size[%ld] real_size[%ld].",
-          compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), workspace_index,
-          mem_type_iter->second.mem_offset_, op_desc->GetStreamId(), workspace_size, workspace_size);
+          "[IMAS]Atomic ordinary workspace : Set %s name[%s] optype[%s] workspace[%lu] offset to [%zu] stream_id[%ld] "
+          "memtype[%ld] size[%ld] real_size[%ld] batch[%s].",
+          compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), op_desc->GetType().c_str(), workspace_index,
+          mem_type_iter->second.mem_offset_, op_desc->GetStreamId(), RT_MEMORY_HBM, workspace_size, workspace_size,
+          batch_label.c_str());
 
       mem_type_iter->second.mem_offset_ += workspace_size;
       mem_offset_end.emplace_back(mem_type_iter->second.mem_offset_);
@@ -1319,10 +1045,13 @@ Status GraphMemoryAssigner::AssignFusionAtomicWorkspaceMemory(const ge::OpDescPt
       auto workspace_size = info_iter.second;
 
       size_t workspace_offset = mem_type_iter->second.mem_offset_;
+      std::string batch_label;
+      (void)ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label);
       GELOGI(
-          "[IMAS]Atomic fusion workspace : Set %s name[%s] workspace[%lu] offset to [%zu] stream_id[%ld] size[%ld] "
-          "real_size[%ld].", compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), workspace_index,
-          mem_type_iter->second.mem_offset_, op_desc->GetStreamId(), workspace_size, workspace_size);
+          "[IMAS]Atomic fusion workspace : Set %s name[%s] optype[%s] workspace[%lu] offset to [%zu] stream_id[%ld] "
+          "memtype[%ld] ssize[%ld] real_size[%ld] batch[%s].", compute_graph_->GetName().c_str(),
+          op_desc->GetName().c_str(), op_desc->GetType().c_str(), workspace_index, mem_type_iter->second.mem_offset_,
+          op_desc->GetStreamId(), RT_MEMORY_HBM, workspace_size, workspace_size, batch_label.c_str());
 
       mem_type_iter->second.mem_offset_ += workspace_size;
       mem_offset_end.emplace_back(mem_type_iter->second.mem_offset_);
@@ -1398,7 +1127,7 @@ ge::Status GraphMemoryAssigner::SetInputOffset() {
     return FAILED;
   }
   for (auto pair : memory_offset_) {
-    GEEVENT("[IMAS]AfterAssignMemory : %s memoffset[%zu], memory type[%ld]", compute_graph_->GetName().c_str(),
+    GEEVENT("[IMAS]AfterAssignMemory : %s memoffset[%zu], memtype[%ld]", compute_graph_->GetName().c_str(),
             pair.second.mem_offset_, pair.first);
   }
 
@@ -1567,7 +1296,7 @@ ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node) const {
 }
 
 Status GraphMemoryAssigner::SetIndependentAtomicAttr(const ge::NodePtr &node, int64_t atomic_mem_start,
-                                                     const vector<int64_t> &mem_offset_end) {
+                                                     const vector<int64_t> &mem_offset_end, int64_t memory_type) {
   GELOGD("Start to set independent atomic attr, atomic_addr_clean memory offset start is %ld", atomic_mem_start);
 
   // Parsing offset and size vectors
@@ -1596,7 +1325,7 @@ Status GraphMemoryAssigner::SetIndependentAtomicAttr(const ge::NodePtr &node, in
       GELOGD("Current node memory_offset vector size is %zu, node name %s, node type is %s.", memory_offset_size.size(),
              peer_out_node_desc->GetName().c_str(), peer_out_node_desc->GetType().c_str());
       if (peer_out_node_desc->GetType() == ATOMICADDRCLEAN) {
-        if (SetAtomicCleanAttr(peer_out_node, memory_offset_start, memory_offset_size) != SUCCESS) {
+        if (SetAtomicCleanAttr(peer_out_node, memory_offset_start, memory_offset_size, memory_type) != SUCCESS) {
           GELOGE(FAILED, "Set atomic clean attr failed.");
           return FAILED;
         }
@@ -1607,7 +1336,7 @@ Status GraphMemoryAssigner::SetIndependentAtomicAttr(const ge::NodePtr &node, in
 }
 
 ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &node, const vector<int64_t> &atomic_mem_start,
-                                                   const vector<int64_t> &atomic_mem_size) {
+                                                   const vector<int64_t> &atomic_mem_size, int64_t memory_type) {
   auto node_op_desc = node->GetOpDesc();
   if (node_op_desc != nullptr) {
     GELOGD("Node %s, set atomic clean attr start.", node->GetName().c_str());
@@ -1646,9 +1375,10 @@ ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &node, const ve
     }
     string atomic_mem_size_str = ss.str();
 
-    GELOGI("[IMAS]SetAtomicCleanAttr : Set graph[%s] atomic_node[%s] output offset [%s] size[%s] streamid[%ld]",
-           node->GetOwnerComputeGraph()->GetName().c_str(), node_op_desc->GetName().c_str(),
-           atomic_mem_start_str.c_str(), atomic_mem_size_str.c_str(), node->GetOpDesc()->GetStreamId());
+    GELOGI("[IMAS]SetAtomicCleanAttr : Set %s atomic_node name[%s] optype[%s] output[0] offset to [%s] streamid[%ld]"
+           " memtype[%ld] size[%s]",node->GetOwnerComputeGraph()->GetName().c_str(), node_op_desc->GetName().c_str(),
+           node->GetType().c_str(), atomic_mem_start_str.c_str(), node->GetOpDesc()->GetStreamId(), memory_type,
+           atomic_mem_size_str.c_str());
   }
   return SUCCESS;
 }
diff --git a/ge/graph/build/memory/graph_mem_assigner.h b/ge/graph/build/memory/graph_mem_assigner.h
index 8ac166fe..a380e594 100755
--- a/ge/graph/build/memory/graph_mem_assigner.h
+++ b/ge/graph/build/memory/graph_mem_assigner.h
@@ -119,31 +119,15 @@ class GraphMemoryAssigner {
   ///
   ge::Status ReAssignContinuousMemory(bool is_loop_graph);
 
-  ge::Status ReAssignReuseAndNoPaddingContinuousInputMemory();
-
-  ge::Status ReAssignReuseAndNoPaddingContinuousOutputMemory();
-
-  ge::Status ReAssignVirtualInputNodeMemory(NodePtr node, size_t &mem_offset_reuse);
-
-  ge::Status ReAssignVirtualOutputNodeMemory(NodePtr node, size_t &mem_offset_reuse);
-
-  ge::Status ReAssignVirtualNodesMemory(map<string, vector<NodePtr>> &mem_reuse_nodes_map, int32_t mem_reuse_model);
-
-  ge::Status GetMaxBatchLabel(const map<string, vector<NodePtr>> &mem_reuse_virtual_nodes_map,
-                              int32_t mem_reuse_model, string &max_batch_label);
-
-  ge::Status CalculateTensorRealSizeAndOutSize(const ge::ConstGeTensorDescPtr &output_desc, int64_t dim_index,
-                                               int64_t &output_mem_size, int64_t &batch_dim_num, int64_t &out_size);
-
   ge::Status ReAssignAtomicMemory(bool is_loop_graph);
-  
-  ge::Status FilterAtomicNodesForMemoryAssign(std::map<NodePtr, vector<NodePtr>> &normal_atomic_nodes_map,
-                                              std::vector<NodePtr> &connecting_output_atomic_nodes);
+
+  ge::Status FilterAtomicNodesForMemoryAssign(map<string, map<NodePtr, vector<NodePtr>>> &normal_atomic_nodes_map,
+                                              map<string, vector<NodePtr>> &connecting_output_atomic_nodes);
 
   ge::Status AssignContinuousInputMemory(const ge::NodePtr &node, int64_t &continuous_mem_start,
-                                         int64_t &continuous_mem_size, int64_t memory_type);
+                                         int64_t &continuous_mem_size, int64_t memory_type, uint32_t continuous_type);
 
-  ge::Status AssignContinuousOutputMemory(const ge::NodePtr &node);
+  ge::Status AssignContinuousOutputMemory(const ge::NodePtr &node, int64_t memory_type, uint32_t continuous_type);
 
   ///
   /// @brief check the input of node whether support atomic attr
@@ -169,10 +153,10 @@ class GraphMemoryAssigner {
   ge::Status AssignConnectNetOutputAtomicMemory(vector<NodePtr> &connect_netoutput_nodes);
 
   ge::Status SetIndependentAtomicAttr(const ge::NodePtr &node, int64_t atomic_mem_start,
-                                      const std::vector<int64_t> &mem_offset_end);
+                                      const std::vector<int64_t> &mem_offset_end, int64_t memory_type);
 
   ge::Status SetAtomicCleanAttr(const ge::NodePtr &node, const std::vector<int64_t> &atomic_mem_start,
-                                const std::vector<int64_t> &atomic_mem_size);
+                                const std::vector<int64_t> &atomic_mem_size, int64_t memory_type);
 
   ge::Status IsIndependentAtomicClean(const ge::NodePtr &node, bool &is_independent_atomic_clean_node);
 
diff --git a/ge/graph/build/model_builder.cc b/ge/graph/build/model_builder.cc
index d7039cfb..ec891f70 100755
--- a/ge/graph/build/model_builder.cc
+++ b/ge/graph/build/model_builder.cc
@@ -55,15 +55,8 @@ using std::vector;
 namespace {
 const uint32_t kWeightsStartOffset = 512;
 const int32_t kWrongIndex = -2;
-
-const float kImgRatioYUV420SP_U8 = 1.5;
-const int kImgRatioRGB888_U8 = 3;
-const int kImgRatioNC1HWC0DI_FP16 = 12;
 const int kInvalidIndexNum = -1;
 
-const uint32_t kInputDimensions2D = 2;
-const uint32_t kInputDimensions3D = 3;
-
 const char *const kVectorCore = "VectorCore";
 const char *const kCoreType = "ge.engineType";
 const std::string kEnableL1Fusion = "ge.l1Fusion";
@@ -224,6 +217,7 @@ Status ModelBuilder::AdjustConstWeightSize(const ge::NodePtr &node, size_t &mem_
     GeTensorDesc &tensor_desc = weight->MutableTensorDesc();
     size_t output_size = weight->GetData().size();
     TensorUtils::SetDataOffset(tensor_desc, mem_offset);
+    GELOGD("Node: %s, weight size: %zu.", node->GetName().c_str(), output_size);
     mem_offset += output_size;
   }
   return SUCCESS;
@@ -282,7 +276,7 @@ Status ModelBuilder::SetInputOutputDesc() {
 void ModelBuilder::AddNodeInputProperty() {
   for (const ge::NodePtr &node : compute_graph_->GetNodes(compute_graph_->GetGraphUnknownFlag())) {
     auto node_op_desc = node->GetOpDesc();
-    GE_IF_BOOL_EXEC(node_op_desc == nullptr, GELOGW("node_op_desc is nullptr!"); return );
+    GE_IF_BOOL_EXEC(node_op_desc == nullptr, GELOGW("node_op_desc is nullptr!"); return);
     vector<string> src_name_list;
     vector<int64_t> src_index_list;
     for (const auto &in_data_anchor : node->GetAllInDataAnchors()) {
@@ -309,10 +303,10 @@ void ModelBuilder::AddNodeInputProperty() {
 
   for (const ge::NodePtr &node : compute_graph_->GetNodes(compute_graph_->GetGraphUnknownFlag())) {
     auto node_op_desc = node->GetOpDesc();
-    GE_IF_BOOL_EXEC(node_op_desc == nullptr, GELOGW("node_op_desc is nullptr!"); return );
+    GE_IF_BOOL_EXEC(node_op_desc == nullptr, GELOGW("node_op_desc is nullptr!"); return);
     GE_IF_BOOL_EXEC(node_op_desc->GetType() == NETOUTPUT, continue);
     auto out_control_anchor = node->GetOutControlAnchor();
-    GE_IF_BOOL_EXEC(out_control_anchor == nullptr, GELOGW("out_control_anchor is nullptr"); return );
+    GE_IF_BOOL_EXEC(out_control_anchor == nullptr, GELOGW("out_control_anchor is nullptr"); return);
     vector<string> dst_name_list;
     vector<int64_t> dst_index_list;
     string dst_name_temp;
@@ -330,7 +324,7 @@ void ModelBuilder::AddNodeInputProperty() {
       dst_name_temp = "";
       int64_t dst_index = kWrongIndex;  // assign an impossible value to dst_index.
       for (const auto &in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) {
-        GE_IF_BOOL_EXEC(in_data_anchor == nullptr, GELOGW("in_data_anchor is nullptr"); return );
+        GE_IF_BOOL_EXEC(in_data_anchor == nullptr, GELOGW("in_data_anchor is nullptr"); return);
         ge::NodePtr dst_node = in_data_anchor->GetOwnerNode();
         dst_name_temp = dst_name_temp.empty() ? dst_node->GetName() : dst_name_temp + ":" + dst_node->GetName();
         dst_index = in_data_anchor->GetIdx();
@@ -568,7 +562,7 @@ Status ModelBuilder::MergeWeights() {
         return FAILED;
       }
     }
-    weight_data.clear();
+    weight->ClearData();
   }
 
   return SUCCESS;
@@ -581,9 +575,13 @@ Status ModelBuilder::SaveDataToModel(ge::Model &model, ge::GeModel &ge_model) {
   // Add TBE Kernels and custom aicpu op bin
   std::set<std::string> tbe_name_set;
   std::set<std::string> aicpu_name_set;
+  std::set<std::string> aicpu_op_types;
+  std::set<std::string> aicpu_tf_op_types;
   for (const ge::NodePtr &n : compute_graph_->GetNodes(compute_graph_->GetGraphUnknownFlag())) {
     auto node_op_desc = n->GetOpDesc();
     GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue);
+    // check aicpu op type
+    CollectCheckAicpuAttr(node_op_desc, aicpu_op_types, aicpu_tf_op_types);
     TBEKernelPtr tbe_kernel = node_op_desc->TryGetExtAttr(ge::OP_EXTATTR_NAME_TBE_KERNEL, TBEKernelPtr());
     if (tbe_kernel == nullptr) {
       std::string kernel_name;
@@ -605,6 +603,8 @@ Status ModelBuilder::SaveDataToModel(ge::Model &model, ge::GeModel &ge_model) {
     tbe_kernel_store_.AddTBEKernel(tbe_kernel);
   }
 
+  SetModelCheckAicpuAttr(model, aicpu_op_types, aicpu_tf_op_types);
+
   for (const ge::NodePtr &n : compute_graph_->GetNodes(compute_graph_->GetGraphUnknownFlag())) {
     auto node_op_desc = n->GetOpDesc();
     GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue);
@@ -796,4 +796,51 @@ Status ModelBuilder::CompileSingleOp() {
   GE_TIMESTAMP_CALLNUM_END(BatchCompileOp, "GraphBuild::CompileOp");
   return ge::SUCCESS;
 }
+
+void ModelBuilder::CollectCheckAicpuAttr(const OpDescPtr &op_desc, std::set<std::string> &aicpu_op_types,
+                                         std::set<std::string> &aicpu_tf_op_types) {
+  std::string aicpu_optype;
+  bool has_attr_check_cpu = ge::AttrUtils::GetStr(op_desc, "needCheckCpu", aicpu_optype);
+  std::vector<std::string> tf_optypes;
+  bool has_attr_check_tf = ge::AttrUtils::GetListStr(op_desc, "needCheckTf", tf_optypes);
+  if (has_attr_check_cpu && !aicpu_optype.empty()) {
+    aicpu_op_types.insert(aicpu_optype);
+  }
+
+  if (has_attr_check_tf && !tf_optypes.empty()) {
+    aicpu_tf_op_types.insert(tf_optypes.begin(), tf_optypes.end());
+  }
+
+  return;
+}
+
+void ModelBuilder::SetModelCheckAicpuAttr(ge::Model &model, std::set<std::string> &aicpu_op_types,
+                                          std::set<std::string> &aicpu_tf_op_types) {
+  std::vector<std::string> aicpu_optype_list;
+  std::vector<std::string> aicpu_tf_optype_list;
+  if (ge::AttrUtils::GetListStr(&model, "needCheckCpu", aicpu_optype_list)) {
+    GELOGI("Already have aicpu optype size: %zu", aicpu_optype_list.size());
+    aicpu_op_types.insert(aicpu_optype_list.begin(), aicpu_optype_list.end());
+  }
+
+  if (ge::AttrUtils::GetListStr(&model, "needCheckTf", aicpu_tf_optype_list)) {
+    GELOGI("Already have aicpu tf optype size: %zu", aicpu_tf_optype_list.size());
+    aicpu_tf_op_types.insert(aicpu_tf_optype_list.begin(), aicpu_tf_optype_list.end());
+  }
+
+  // reset list with set
+  aicpu_optype_list.assign(aicpu_op_types.begin(), aicpu_op_types.end());
+  aicpu_tf_optype_list.assign(aicpu_tf_op_types.begin(), aicpu_tf_op_types.end());
+  GELOGI(
+    "Check Aicpu op types ComputeGraph: %s aicpu_op_types: %zu, aicpu_optype_list: %zu, aicpu_tf_op_types: %zu, "
+    "aicpu_tf_optype_list:%zu.",
+    compute_graph_->GetName().c_str(), aicpu_op_types.size(), aicpu_optype_list.size(), aicpu_tf_op_types.size(),
+    aicpu_tf_optype_list.size());
+  GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(&model, "needCheckCpu", aicpu_optype_list), return,
+                   "Set attr needCheckCpu fail.");
+
+  GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(&model, "needCheckTf", aicpu_tf_optype_list), return,
+                   "Set attr needCheckTf fail.");
+  return;
+}
 }  // namespace ge
diff --git a/ge/graph/build/model_builder.h b/ge/graph/build/model_builder.h
index e75521c7..12420614 100644
--- a/ge/graph/build/model_builder.h
+++ b/ge/graph/build/model_builder.h
@@ -55,13 +55,13 @@ class ModelBuilder {
 
   ge::Buffer GetWeightBuffer() const;
 
+  Status MergeWeights();
+
  protected:
   void AddNodeInputProperty();
 
   void ClearOriginalFormat();
 
-  Status MergeWeights();
-
  private:
   bool SetInputConst(const OpDescPtr &op_desc, const NodePtr &src_node, size_t index, vector<bool> &is_input_const);
 
@@ -83,6 +83,12 @@ class ModelBuilder {
 
   Status CompileSingleOp();
 
+  void CollectCheckAicpuAttr(const OpDescPtr &op_desc, std::set<std::string> &aicpu_op_types,
+                               std::set<std::string> &aicpu_tf_op_types);
+
+  void SetModelCheckAicpuAttr(ge::Model &model, std::set<std::string> &aicpu_op_types,
+                                std::set<std::string> &aicpu_tf_op_types);
+
   uint64_t session_id_;
 
   map<int64_t, size_t> mem_type_to_mem_offset_;
diff --git a/ge/graph/build/stream_allocator.cc b/ge/graph/build/stream_allocator.cc
index 4378f71b..63112ea8 100644
--- a/ge/graph/build/stream_allocator.cc
+++ b/ge/graph/build/stream_allocator.cc
@@ -34,7 +34,6 @@ using std::string;
 using std::vector;
 
 namespace {
-const uint32_t kMaxSwitchStreamNum = 1;
 const int64_t kTaskNumPerNormalNode = 3;
 const int64_t kTaskNumPerHcclNode = 200;
 const char *const kTrueStr = "true";
@@ -49,7 +48,8 @@ inline bool HasContinuousStreamLabel(const ge::OpDescPtr &op_desc, std::string &
 }
 
 bool IsHcclOp(const string &op_type) {
-  const set<string> hccl_op_types({ge::HCOMBROADCAST, ge::HCOMALLGATHER, ge::HCOMALLREDUCE, ge::HCOMREDUCESCATTER, ge::HCOMREDUCE});
+  const set<string> hccl_op_types({ge::HCOMBROADCAST, ge::HCOMALLGATHER,
+                                   ge::HCOMALLREDUCE, ge::HCOMREDUCESCATTER, ge::HCOMREDUCE});
   return hccl_op_types.find(op_type) != hccl_op_types.end();
 }
 }  // namespace
diff --git a/ge/graph/build/stream_graph_optimizer.cc b/ge/graph/build/stream_graph_optimizer.cc
index 582c080b..05049818 100644
--- a/ge/graph/build/stream_graph_optimizer.cc
+++ b/ge/graph/build/stream_graph_optimizer.cc
@@ -38,7 +38,7 @@ void StreamGraphOptimizer::RefreshNodeId(const ComputeGraphPtr &comp_graph, Grap
         continue;
       }
       for (ge::NodePtr &node : subgraph->GetDirectNode()) {
-        GE_CHECK_NOTNULL_EXEC(node->GetOpDesc(), return );
+        GE_CHECK_NOTNULL_EXEC(node->GetOpDesc(), return);
         if ((node->GetType() == END) || (node->GetType() == PLACEHOLDER)) {
           node->GetOpDesc()->SetId(static_cast<int64_t>(node_size));
           node_size++;
@@ -48,26 +48,41 @@ void StreamGraphOptimizer::RefreshNodeId(const ComputeGraphPtr &comp_graph, Grap
   }
 }
 
-bool StreamGraphOptimizer::IsSameStreamId(const ComputeGraphPtr &comp_graph) {
+bool StreamGraphOptimizer::IsSameStreamIdOrBatchLabel(const ComputeGraphPtr &comp_graph) {
   if (comp_graph == nullptr) {
     return false;
   }
   std::set<int64_t> stream_set;
+  std::set<std::string> label_set;
   for (const ge::NodePtr &cur_node : comp_graph->GetDirectNode()) {
     GE_IF_BOOL_EXEC(cur_node->GetOpDesc() == nullptr, continue);
     int64_t stream_id = cur_node->GetOpDesc()->GetStreamId();
     if (stream_id == kInvalidStream) {
       continue;
     }
-    GELOGD("Node %s in subgraph %s stream id is: %ld, node num: %zu", cur_node->GetName().c_str(),
-           comp_graph->GetName().c_str(), stream_id, comp_graph->GetDirectNodesSize());
     stream_set.insert(stream_id);
+
+    std::string batch_label;
+    if (AttrUtils::GetStr(cur_node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, batch_label)) {
+      label_set.insert(batch_label);
+    } else {
+      GELOGD("Node %s[%s] has no batch label, subgraph %s, stream id: %ld", cur_node->GetName().c_str(),
+             cur_node->GetType().c_str(), comp_graph->GetName().c_str(), stream_id);
+      continue;
+    }
+
+    GELOGD("Node %s in subgraph %s stream id: %ld, node num: %zu", cur_node->GetName().c_str(),
+           comp_graph->GetName().c_str(), stream_id, comp_graph->GetDirectNodesSize());
   }
-  if (stream_set.size() > 1) {
-    GELOGI("Nodes of graph: %s have different stream id, node num: %zu, different stream num: %zu.",
+  if (stream_set.size() > 1 || label_set.size() > 1) {
+    GELOGI("Nodes of graph: %s have different stream id or batch_label, node num: %zu, different stream num: %zu.",
            comp_graph->GetName().c_str(), comp_graph->GetDirectNodesSize(), stream_set.size());
     return false;
   }
+
+  if (!label_set.empty()) {
+    (void)AttrUtils::SetStr(comp_graph, ATTR_NAME_BATCH_LABEL, *label_set.begin());
+  }
   return true;
 }
 
@@ -99,8 +114,8 @@ Status StreamGraphOptimizer::OptimizeStreamedSubGraph(const ComputeGraphPtr &com
           continue;
         }
 
-        if (!IsSameStreamId(subgraph)) {
-          GELOGI("There are more than one stream in subgraph %s", subgraph->GetName().c_str());
+        if (!IsSameStreamIdOrBatchLabel(subgraph)) {
+          GELOGI("There are more than one stream or batch_label in subgraph %s", subgraph->GetName().c_str());
           continue;
         }
         OpDescPtr op_desc = nodes.at(0)->GetOpDesc();
@@ -112,9 +127,11 @@ Status StreamGraphOptimizer::OptimizeStreamedSubGraph(const ComputeGraphPtr &com
           return FAILED;
         }
         run_context.stream = run_context.graphStreamList[stream_id];
-        GELOGD("Subgraph has same stream id, subgraph: %s, engine_name: %s, stream_id: %ld, rtstream: %lu.",
-               subgraph->GetName().c_str(), engine_name.c_str(), stream_id,
-               static_cast<uint64_t>(reinterpret_cast<uintptr_t>(run_context.stream)));
+	std::string batch_label;
+	(void)AttrUtils::GetStr(subgraph, ATTR_NAME_BATCH_LABEL, batch_label);
+        GELOGD("Subgraph has same stream id, subgraph: %s, engine_name: %s, stream_id: %ld, rtstream: %lu, "
+	       "batch_label: %s", subgraph->GetName().c_str(), engine_name.c_str(), stream_id,
+               static_cast<uint64_t>(reinterpret_cast<uintptr_t>(run_context.stream)), batch_label.c_str());
         for (auto iter = graph_optimizers.begin(); iter != graph_optimizers.end(); ++iter) {
           GE_CHECK_NOTNULL(*iter);
           Status ret = (*iter)->OptimizeStreamGraph(*subgraph, run_context);
diff --git a/ge/graph/build/stream_graph_optimizer.h b/ge/graph/build/stream_graph_optimizer.h
index b0eea135..d69fa7ba 100644
--- a/ge/graph/build/stream_graph_optimizer.h
+++ b/ge/graph/build/stream_graph_optimizer.h
@@ -41,7 +41,7 @@ class StreamGraphOptimizer {
  private:
   void RefreshNodeId(const ComputeGraphPtr &comp_graph, Graph2SubGraphInfoList &subgraph_map);
 
-  bool IsSameStreamId(const ComputeGraphPtr &comp_graph);
+  bool IsSameStreamIdOrBatchLabel(const ComputeGraphPtr &comp_graph);
 };
 }  // namespace ge
 #endif  // GE_GRAPH_BUILD_OPTIMIZE_STREAM_GRAPH_H_
diff --git a/ge/graph/build/task_generator.cc b/ge/graph/build/task_generator.cc
index 41607f1f..bb72fa8a 100755
--- a/ge/graph/build/task_generator.cc
+++ b/ge/graph/build/task_generator.cc
@@ -49,14 +49,12 @@ const char *const kIsLastNode = "is_last_node";
 const char *const kIsInputVar = "INPUT_IS_VAR";
 const char *const kIsOutputVar = "OUTPUT_IS_VAR";
 const char *const kProfilingMode = "PROFILING_MODE";
-const char *const kProfilingFpPoint = "FP_POINT";
-const char *const kProfilingBpPoint = "BP_POINT";
 const uint32_t kProfilingArStep = 2;
 const uint64_t kProfilingFpStartLogid = 1;
 const uint64_t kProfilingBpEndLogid = 2;
 const uint64_t kProfilingArStartLogid = 3;
 const uint64_t kProfilingArEndLogid = 4;
-const uint64_t kProfilingIterEndLogid = 255;
+const uint64_t kProfilingIterEndLogid = 65535;
 const int64_t kHashFactor = 100000;
 const int64_t kInvalidGroupId = -1;
 }  // namespace
@@ -276,6 +274,7 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra
   };
   GE_MAKE_GUARD(release, callback);
 
+  uint64_t all_reduce_node_idx = 0;
   for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) {
     OpDescPtr op_desc = node->GetOpDesc();
     GE_CHECK_NOTNULL(op_desc);
@@ -294,7 +293,7 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra
     // Part2: Call
     auto fusion_task_info =
         FusionTaskInfo{run_context,        graph,         node,        op_desc,         node_index,      ge_lib,
-                       ops_kernel_manager, task_def_list, op_name_map, profiling_point, all_reduce_nodes};
+                       ops_kernel_manager, task_def_list, op_name_map, profiling_point, all_reduce_nodes, all_reduce_node_idx};
     GE_CHK_STATUS_RET(GenerateTaskForFusionNode(fusion_task_info, fusion_nodes, fusion_nodes_seen),
                       "Call GenerateTaskForFusionNode node:%s(%s) failed", name.c_str(), type.c_str());
     // continue directly
@@ -318,7 +317,8 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra
                       type.c_str());
     // Profiling task
     size_t task_list_size_before = task_def_list.size();
-    GE_CHK_STATUS_RET(InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list));
+    GE_CHK_STATUS_RET(InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes,
+                                                node_index, task_def_list, all_reduce_node_idx));
     int64_t op_id = op_desc->GetId();
     // Compatible with dynamic shape scenes, the default is 0
     int64_t stream_id = 0;
@@ -338,8 +338,8 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra
       return ret;
     }
     // Profiling task
-    GE_CHK_STATUS_RET(InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list));
-
+    GE_CHK_STATUS_RET(InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes,
+                                               node_index, task_def_list, all_reduce_node_idx));
     size_t task_list_size_after = task_def_list.size();
     // If tasks is reduced
     if (task_list_size_after < task_list_size_before) {
@@ -382,6 +382,7 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info
   auto &op_name_map = fusion_task_info.op_name_map;
   auto &profiling_point = fusion_task_info.profiling_point;
   auto &all_reduce_nodes = fusion_task_info.all_reduce_nodes;
+  auto &all_reduce_idx = fusion_task_info.all_reduce_node_idx;
   // If op_desc have this attr, call nodes with same group key in a stream together
   if (ge::AttrUtils::GetInt(fusion_op_desc, ATTR_NAME_FUSION_GROUP_KEY, group_key) &&
       (fusion_nodes_seen.count(node.get()) == 0)) {
@@ -428,7 +429,8 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info
         return INTERNAL_ERROR;
       }
       // profiling task
-      (void)InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list);
+      (void)InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes,
+                                      node_index, task_def_list, all_reduce_idx);
       run_context.stream = run_context.graphStreamList[stream_id];
       GELOGI("Fusion: Call %s to generate fusion_node:[fusion_node_name:%s(%s), id:%ld, stream_id:%ld] task.",
              op_kernel_lib_name.c_str(), fusion_node_name.c_str(), fusion_node_type.c_str(), op_id, stream_id);
@@ -441,7 +443,8 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info
         return ret;
       }
       // profiling task
-      (void)InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list);
+      (void)InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes,
+                                     node_index, task_def_list, all_reduce_idx);
       size_t task_list_size_after = task_def_list.size();
       // if tasks is reduced
       if (task_list_size_after < task_list_size_before) {
@@ -569,7 +572,7 @@ Status TaskGenerator::MarkFirstAndLastOps(const vector<OpDescPtr> &ops, bool is_
       continue;
     }
     string op_type = op_desc->GetType();
-    if (!is_single_stream && (!op_desc->GetSubgraphInstanceNames().empty() || separator_types.count(op_type) != 0)) {
+    if ((!is_single_stream && !op_desc->GetSubgraphInstanceNames().empty()) || separator_types.count(op_type) != 0) {
       continuous_op_lists.emplace_back(vector<OpDescPtr>());
     } else {
       continuous_op_lists.back().emplace_back(op_desc);
@@ -810,40 +813,33 @@ Status TaskGenerator::GetFpBpIndex(const ComputeGraphPtr &graph, ProfilingPoint
                                    vector<uint32_t> &all_reduce_nodes, std::string &fp_point_str,
                                    std::string &bp_point_str) const {
 
-  if (ge::GetContext().GetOption(OPTION_EXEC_PROFILING_FPPONIT_OPTIONS, fp_point_str) == SUCCESS &&
-      ge::GetContext().GetOption(OPTION_EXEC_PROFILING_BPPONIT_OPTIONS, bp_point_str) == SUCCESS &&
-      !fp_point_str.empty() && !bp_point_str.empty()) {
-      return SUCCESS;
-  }
+  ProfilingManager::Instance().GetFpBpPoint(fp_point_str, bp_point_str);
 
   Status ret = SUCCESS;
-  const char *fp_point = std::getenv(kProfilingFpPoint);
-  if (fp_point == nullptr) {
+  if (fp_point_str.empty()) {
     ret = AutoFindFpOpIndex(graph, profiling_point);
     if (ret != SUCCESS) {
       GELOGW("First forward profiling op_index not set and FindFpOpIndex failed.");
       return FAILED;
     }
-  } else {
-    fp_point_str = string(fp_point);
-    GELOGI("Get fp_point_str from env %s", fp_point_str.c_str());
   }
 
-  const char *bp_point = std::getenv(kProfilingBpPoint);
-  if (bp_point == nullptr) {
+  if (bp_point_str.empty()) {
     ret = AutoFindBpOpIndex(graph, profiling_point, all_reduce_nodes);
     if (ret != SUCCESS) {
       GELOGW("Last backward profiling op_index not set and FindBpOpIndex failed.");
       return FAILED;
     }
-  } else {
-    bp_point_str = string(bp_point);
-    GELOGI("Get bp_point_str from env %s", bp_point_str.c_str());
   }
 
   return SUCCESS;
 }
 
+Status TaskGenerator::FindProfilingNodeIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point,
+                                             std::vector<uint32_t> &all_reduce_nodes) {
+  return FindProfilingTaskIndex(graph, profiling_point, all_reduce_nodes);
+}
+
 Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point,
                                              vector<uint32_t> &all_reduce_nodes) const {
   GE_CHECK_NOTNULL(graph);
@@ -854,7 +850,6 @@ Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, Profi
     GELOGD("Profiling is not open.");
     return SUCCESS;
   }
-
   GELOGI("Start get FP/BP index.");
   std::string fp_point_str;
   std::string bp_point_str;
@@ -892,18 +887,27 @@ Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, Profi
   return SUCCESS;
 }
 
-
 Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point,
                                                 vector<uint32_t> &all_reduce_nodes, uint32_t node_index,
-                                                vector<domi::TaskDef> &task_def_list) {
+                                                vector<domi::TaskDef> &task_def_list, uint64_t &all_reduce_node_idx) {
   const char *profiling_mode = std::getenv(kProfilingMode);
   bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() ||
                       ProfilingManager::Instance().ProfilingTrainingTraceOn();
-  if (!is_profiling || (profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) ||
-      (profiling_point.end_index.empty())) {
+  bool is_insert_fp_profiling_task = false;
+  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_FP_PROFILILNG_TASK, is_insert_fp_profiling_task);
+  bool is_insert_bp_profiling_task = false;
+  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, is_insert_bp_profiling_task);
+  bool no_insert_profiling_task = ((profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) ||
+                                   (profiling_point.end_index.empty())) &&
+                                  (!(is_insert_fp_profiling_task || is_insert_bp_profiling_task));
+  if (!is_profiling || no_insert_profiling_task) {
     return SUCCESS;
   }
-  if (profiling_point.fp_index == node_index) {
+  GELOGD("Insert fp profiling task: %d, insert bp profiling task: %d, fp index: %u, bp index: %u, end index size: %zu",
+         is_insert_fp_profiling_task, is_insert_bp_profiling_task, profiling_point.fp_index, profiling_point.bp_index,
+         profiling_point.end_index.size());
+
+  if ((profiling_point.fp_index == node_index) || is_insert_fp_profiling_task) {
     uint64_t jobid_log_id = ge::GetContext().TraceId();
     GELOGI("The first FP operator is %s, idx %u, job_id %lu", op_desc->GetName().c_str(), node_index, jobid_log_id);
 
@@ -927,22 +931,40 @@ Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const
     task_def_list.emplace_back(fp_task_def);
   }
 
-  for (size_t i = 0; i < all_reduce_nodes.size(); i++) {
-    if (all_reduce_nodes[i] != node_index) {
-      continue;
+  bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE);
+  uint64_t all_reduce_task_idx = 0;
+  bool is_insert_all_reduce_task = false;
+  if (is_all_reduce && is_insert_bp_profiling_task) {
+    all_reduce_task_idx = all_reduce_node_idx;
+    is_insert_all_reduce_task = true;
+  }
+  if (is_all_reduce) {
+    all_reduce_node_idx++;
+  }
+  if (!is_insert_all_reduce_task) {
+    for (size_t i = 0; i < all_reduce_nodes.size(); i++) {
+      if (all_reduce_nodes[i] == node_index) {
+        all_reduce_task_idx = i;
+        is_insert_all_reduce_task = true;
+        break;
+      }
     }
+  }
+
+  if (is_insert_all_reduce_task) {
     GELOGI("The start allreduce operator is %s, idx %u", op_desc->GetName().c_str(), node_index);
     TaskDef ar_task_def;
     ar_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE);
     ar_task_def.set_stream_id(op_desc->GetStreamId());
     LogTimeStampDef *ar_log_def = ar_task_def.mutable_log_timestamp();
     if (ar_log_def != nullptr) {
-      GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(i, kProfilingArStep),
+      GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(all_reduce_task_idx, kProfilingArStep),
                       GELOGE(FAILED, "Multiply result is out of range.");
                       return FAILED);
-      auto log_id = i * kProfilingArStep + kProfilingArStartLogid;
+      auto log_id = all_reduce_task_idx * kProfilingArStep + kProfilingArStartLogid;
       ar_log_def->set_logid(log_id);
       ar_log_def->set_notify(false);
+      (void)ge::AttrUtils::SetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, log_id);
     }
     task_def_list.push_back(ar_task_def);
   }
@@ -951,16 +973,27 @@ Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const
 
 Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point,
                                                vector<uint32_t> &all_reduce_nodes, uint32_t node_index,
-                                               vector<domi::TaskDef> &task_def_list) {
+                                               vector<domi::TaskDef> &task_def_list, uint64_t all_reduce_node_idx) {
   GE_CHECK_NOTNULL(op_desc);
   const char *profiling_mode = std::getenv(kProfilingMode);
   bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() ||
                       ProfilingManager::Instance().ProfilingTrainingTraceOn();
-  if (!is_profiling || (profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) ||
-      (profiling_point.end_index.empty())) {
+  bool is_insert_bp_profiling_task = false;
+  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, is_insert_bp_profiling_task);
+  bool is_insert_end_profiling_task = false;
+  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_END_PROFILILNG_TASK, is_insert_end_profiling_task);
+  bool no_insert_profiling_task = ((profiling_point.fp_index == 0) || (profiling_point.bp_index == 0) ||
+                                   (profiling_point.end_index.empty())) &&
+                                  (!(is_insert_bp_profiling_task || is_insert_end_profiling_task));
+  if (!is_profiling || no_insert_profiling_task) {
     return SUCCESS;
   }
-  if (profiling_point.bp_index == node_index) {
+  GELOGD("Insert bp profiling task: %d, insert end profiling task: %d, fp index: %u, bp index: %u, end index size: %zu",
+         is_insert_bp_profiling_task, is_insert_end_profiling_task, profiling_point.fp_index, profiling_point.bp_index,
+         profiling_point.end_index.size() );
+
+  bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE);
+  if ((profiling_point.bp_index == node_index) || (!is_all_reduce && is_insert_bp_profiling_task)) {
     GELOGI("The last BP operator is %s, idx %u", op_desc->GetName().c_str(), node_index);
     TaskDef bp_task_def;
     bp_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE);
@@ -971,7 +1004,9 @@ Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const P
     bp_log_def->set_notify(false);
     task_def_list.emplace_back(bp_task_def);
   }
-  if (profiling_point.end_index.find(node_index) != profiling_point.end_index.end()) {
+
+  if (profiling_point.end_index.find(node_index) != profiling_point.end_index.end() ||
+      is_insert_end_profiling_task) {
     GELOGI("The iteration end operator is %s, idx %u", op_desc->GetName().c_str(), node_index);
     TaskDef end_task_def;
     end_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE);
@@ -983,20 +1018,32 @@ Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const P
     task_def_list.emplace_back(end_task_def);
   }
 
+  uint32_t all_reduce_task_idx = 0;
+  bool is_insert_all_reduce_task = false;
+  if (is_all_reduce && is_insert_bp_profiling_task) {
+    all_reduce_task_idx = all_reduce_node_idx;
+    is_insert_all_reduce_task = true;
+  }
+
   for (size_t i = 0; i < all_reduce_nodes.size(); i++) {
-    if (all_reduce_nodes[i] != node_index) {
-      continue;
+    if (all_reduce_nodes[i] == node_index) {
+      all_reduce_task_idx = i;
+      is_insert_all_reduce_task = true;
+      break;
     }
+  }
+
+  if (is_insert_all_reduce_task) {
     GELOGI("The end allreduce operator is %s, idx %u", op_desc->GetName().c_str(), node_index);
     TaskDef ar_task_def;
     ar_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE);
     ar_task_def.set_stream_id(op_desc->GetStreamId());
     LogTimeStampDef *ar_log_def = ar_task_def.mutable_log_timestamp();
     GE_CHECK_NOTNULL(ar_log_def);
-    GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(i, kProfilingArStep),
+    GE_IF_BOOL_EXEC(TypeUtils::CheckUint64MulOverflow(all_reduce_task_idx, kProfilingArStep),
                     GELOGE(FAILED, "Multiply result is out of range.");
                     return FAILED);
-    auto log_id = i * kProfilingArStep + kProfilingArEndLogid;
+    auto log_id = all_reduce_task_idx * kProfilingArStep + kProfilingArEndLogid;
     ar_log_def->set_logid(log_id);
     ar_log_def->set_notify(false);
     task_def_list.emplace_back(ar_task_def);
diff --git a/ge/graph/build/task_generator.h b/ge/graph/build/task_generator.h
index c93b2007..5970954c 100755
--- a/ge/graph/build/task_generator.h
+++ b/ge/graph/build/task_generator.h
@@ -51,6 +51,7 @@ struct FusionTaskInfo {
   std::map<uint32_t, string> &op_name_map;
   ProfilingPoint &profiling_point;
   vector<uint32_t> all_reduce_nodes;
+  uint64_t all_reduce_node_idx;
 };
 
 class TaskGenerator {
@@ -76,6 +77,8 @@ class TaskGenerator {
   ///
   Status GetTaskInfo(Model &model, ComputeGraphPtr &graph, uint64_t session_id, RunContext &run_context);
 
+  Status FindProfilingNodeIndex(const ComputeGraphPtr &graph, ProfilingPoint &profiling_point,
+                                std::vector<uint32_t> &all_reduce_nodes);
  private:
   Status UpdateAnchorStatus(const NodePtr &node);
 
@@ -126,10 +129,10 @@ class TaskGenerator {
                                 std::vector<uint32_t> &all_reduce_nodes) const;
   Status InsertProfilingTaskBefore(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point,
                                    std::vector<uint32_t> &all_reduce_nodes, uint32_t node_index,
-                                   std::vector<domi::TaskDef> &task_def_list);
+                                   std::vector<domi::TaskDef> &task_def_list, uint64_t &all_reduce_node_idx);
   Status InsertProfilingTaskAfter(const OpDescPtr &op_desc, const ProfilingPoint &profiling_point,
                                   std::vector<uint32_t> &all_reduce_nodes, uint32_t node_index,
-                                  std::vector<domi::TaskDef> &task_def_list);
+                                  std::vector<domi::TaskDef> &task_def_list, uint64_t all_reduce_node_idx);
 
   static bool IsProfPoint(const OpDescPtr &op, const std::string &name);
 
diff --git a/ge/graph/common/transop_util.cc b/ge/graph/common/transop_util.cc
index 9b513fe6..62b4c4e4 100755
--- a/ge/graph/common/transop_util.cc
+++ b/ge/graph/common/transop_util.cc
@@ -23,7 +23,10 @@
 namespace {
 const int kInvalidTransopDataIndex = -1;
 const int kTransOpOutIndex = 0;
-std::map<ge::DataType, ge::DataType> precision_loss_transfer_map = {{ge::DT_FLOAT, ge::DT_BOOL}};
+std::map<ge::DataType, ge::DataType> precision_loss_transfer_map = {
+  {ge::DT_FLOAT, ge::DT_BOOL},
+  {ge::DT_INT64, ge::DT_BOOL}
+};
 }  // namespace
 
 namespace ge {
diff --git a/ge/graph/execute/graph_execute.cc b/ge/graph/execute/graph_execute.cc
index 97e2fd1b..3c5618e8 100755
--- a/ge/graph/execute/graph_execute.cc
+++ b/ge/graph/execute/graph_execute.cc
@@ -560,34 +560,10 @@ Status GraphExecutor::GetModelAttr(uint32_t model_id, std::vector<string> &dynam
   return SUCCESS;
 }
 
-Status GraphExecutor::GetInputOutputDescInfoForZeroCopy(uint32_t model_id, vector<InputOutputDescInfo> &input_desc,
-                                                        vector<InputOutputDescInfo> &output_desc,
-                                                        std::vector<uint32_t> &input_formats,
-                                                        std::vector<uint32_t> &out_formats) {
-  try {
-    auto model_manager = ge::ModelManager::GetInstance();
-    GE_CHECK_NOTNULL(model_manager);
-    Status ret =
-        model_manager->GetInputOutputDescInfoForZeroCopy(model_id, input_desc, output_desc, input_formats, out_formats);
-    if (ret != SUCCESS) {
-      GELOGE(ret, "GetInputOutputDescInfoForZeroCopy failed.");
-      return ret;
-    }
-  } catch (std::bad_alloc &) {
-    GELOGE(MEMALLOC_FAILED, "GetInputOutputDescInfoForZeroCopy failed, bad memory allocation occur !");
-    return MEMALLOC_FAILED;
-  } catch (...) {
-    GELOGE(FAILED, "GetInputOutputDescInfoForZeroCopy failed, some exceptions occur !");
-    return FAILED;
-  }
-
-  return SUCCESS;
-}
-
-Status GraphExecutor::GetAIPPInfo(uint32_t model_id, uint32_t index, AippConfigInfo &aipp_info) {
+Status GraphExecutor::GetAippInfo(uint32_t model_id, uint32_t index, AippConfigInfo &aipp_info) {
   auto model_manager = ge::ModelManager::GetInstance();
   GE_CHECK_NOTNULL(model_manager);
-  Status ret = model_manager->GetAIPPInfo(model_id, index, aipp_info);
+  Status ret = model_manager->GetAippInfo(model_id, index, aipp_info);
   if (ret != SUCCESS) {
     GELOGW("GetAIPPInfo is not success.");
     return ret;
diff --git a/ge/graph/execute/graph_execute.h b/ge/graph/execute/graph_execute.h
index efc30743..d2a92e47 100755
--- a/ge/graph/execute/graph_execute.h
+++ b/ge/graph/execute/graph_execute.h
@@ -73,7 +73,7 @@ class GraphExecutor {
                                        vector<InputOutputDescInfo> &output_desc, std::vector<uint32_t> &input_formats,
                                        std::vector<uint32_t> &output_formats, bool new_model_desc = false);
 
-  static Status GetAIPPInfo(uint32_t model_id, uint32_t index, AippConfigInfo &aipp_info);
+  static Status GetAippInfo(uint32_t model_id, uint32_t index, AippConfigInfo &aipp_info);
 
   static Status GetAippType(uint32_t model_id, uint32_t index, InputAippType &type, size_t &aipp_index);
 
@@ -110,10 +110,6 @@ class GraphExecutor {
 
   static Status GetModelAttr(uint32_t model_id, std::vector<string> &dynamic_output_shape_info);
 
-  static Status GetInputOutputDescInfoForZeroCopy(uint32_t model_id, vector<InputOutputDescInfo> &input_desc,
-                                                  vector<InputOutputDescInfo> &output_desc,
-                                                  std::vector<uint32_t> &input_formats,
-                                                  std::vector<uint32_t> &output_formats);
   static Status GetOrigInputInfo(uint32_t model_id, uint32_t index, OriginInputInfo &orig_input_info);
   static Status GetAllAippInputOutputDims(uint32_t model_id, uint32_t index, std::vector<InputOutputDims> &input_dims,
                                           std::vector<InputOutputDims> &output_dims);
diff --git a/ge/graph/label/case_label_maker.h b/ge/graph/label/case_label_maker.h
index 1078a906..3dbfb2bc 100644
--- a/ge/graph/label/case_label_maker.h
+++ b/ge/graph/label/case_label_maker.h
@@ -86,7 +86,6 @@
                                                                 |    Node    |
                                                                 +------------+
 *******************************************************************************/
-
 namespace ge {
 class CaseOpLabelMaker : public LabelMaker {
  public:
diff --git a/ge/graph/label/if_label_maker.h b/ge/graph/label/if_label_maker.h
index 0807f549..8b07eb96 100644
--- a/ge/graph/label/if_label_maker.h
+++ b/ge/graph/label/if_label_maker.h
@@ -70,7 +70,6 @@
                                                                 |    Node    |
                                                                 +------------+
 *******************************************************************************/
-
 namespace ge {
 class IfOpLabelMaker : public LabelMaker {
  public:
diff --git a/ge/graph/label/partitioned_call_label_maker.h b/ge/graph/label/partitioned_call_label_maker.h
index b89cb94c..3944aabd 100644
--- a/ge/graph/label/partitioned_call_label_maker.h
+++ b/ge/graph/label/partitioned_call_label_maker.h
@@ -54,7 +54,6 @@
         |       c       |
         +---------------+
 *******************************************************************************/
-
 namespace ge {
 class PartitionedCallLabelMaker : public LabelMaker {
  public:
diff --git a/ge/graph/label/while_label_maker.h b/ge/graph/label/while_label_maker.h
index 0eb0deee..6c30475b 100644
--- a/ge/graph/label/while_label_maker.h
+++ b/ge/graph/label/while_label_maker.h
@@ -70,7 +70,6 @@
                                                                 |    Node    |
                                                                 +------------+
 *******************************************************************************/
-
 namespace ge {
 class WhileOpLabelMaker : public LabelMaker {
  public:
diff --git a/ge/graph/load/graph_loader.cc b/ge/graph/load/graph_loader.cc
index aa825a5d..6272e581 100755
--- a/ge/graph/load/graph_loader.cc
+++ b/ge/graph/load/graph_loader.cc
@@ -122,14 +122,14 @@ Status GraphLoader::LoadDataFromFile(const std::string &path, const std::string
                                      ModelData &model_data) {
   Status ret;
   if (!CheckInputPathValid(path)) {
-    GELOGE(GE_EXEC_MODEL_PATH_INVALID, "model path is invalid: %s", path.c_str());
-    return GE_EXEC_MODEL_PATH_INVALID;
+    GELOGE(ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID, "model path is invalid: %s", path.c_str());
+    return ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID;
   }
 
   GELOGI("Load model begin, model path is: %s", path.c_str());
   if (!key_path.empty() && !CheckInputPathValid(key_path)) {
-    GELOGE(GE_EXEC_MODEL_KEY_PATH_INVALID, "decrypt_key path is invalid: %s", key_path.c_str());
-    return GE_EXEC_MODEL_KEY_PATH_INVALID;
+    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "decrypt_key path is invalid: %s", key_path.c_str());
+    return ACL_ERROR_GE_PARAM_INVALID;
   }
 
   ret = DavinciModelParser::LoadFromFile(path.c_str(), key_path.c_str(), priority, model_data);
@@ -144,63 +144,6 @@ Status GraphLoader::LoadDataFromFile(const std::string &path, const std::string
     return SUCCESS;
 }
 
-Status GraphLoader::LoadModelFromFile(const std::string &path, const std::string &key_path, int32_t priority,
-                                      const std::shared_ptr<ModelListener> &listener, uint32_t &model_id) {
-  Status ret;
-  ModelData model_data;
-  ret = LoadDataFromFile(path, key_path, priority, model_data);
-  if (ret != SUCCESS) {
-    GELOGE(ret, "LoadModelFromFile: Load failed. ret = %u", ret);
-    if (model_data.model_data != nullptr) {
-      delete[] static_cast<char *>(model_data.model_data);
-      model_data.model_data = nullptr;
-    }
-    return ret;
-  }
-
-  ret = LoadModel(model_data, listener, model_id);
-  if (ret != SUCCESS) {
-    GELOGE(ret, "LoadModel: Load failed. ret = %u", ret);
-    if (model_data.model_data != nullptr) {
-      delete[] static_cast<char *>(model_data.model_data);
-      model_data.model_data = nullptr;
-    }
-  }
-
-  if (model_data.model_data != nullptr) {
-    delete[] static_cast<char *>(model_data.model_data);
-    model_data.model_data = nullptr;
-  }
-
-  return ret;
-}
-
-Status GraphLoader::LoadModel(const ModelData &model_data, const std::shared_ptr<ModelListener> &listener,
-                              uint32_t &model_id) {
-  GELOGI("Load model begin, model_id:%u.", model_id);
-
-  // For GeOp, Open Device 0 here.
-  GE_CHK_RT_RET(rtSetDevice(0));
-  auto model_manager = ModelManager::GetInstance();
-  GE_CHECK_NOTNULL(model_manager);
-  Status ret = model_manager->LoadModelOffline(model_id, model_data, listener);
-  if (ret != SUCCESS) {
-    GE_CHK_RT(rtDeviceReset(0));
-    GELOGE(ret, "LoadModel: Load failed.");
-    return ret;
-  }
-  ret = model_manager->Start(model_id);
-  if (ret != SUCCESS) {
-    if (model_manager->Unload(model_id) != SUCCESS) {
-      GELOGE(FAILED, "LoadModel: Unload failed while trying to unload after a failed start.");
-    }
-    GELOGE(ret, "LoadModel: Start failed.");
-    return ret;
-  }
-  GELOGI("LoadModel: Start model success, model_id:%u.", model_id);
-  return SUCCESS;
-}
-
 Status GraphLoader::CommandHandle(const Command &command) {
   try {
     auto model_manager = ModelManager::GetInstance();
@@ -225,13 +168,13 @@ Status GraphLoader::CommandHandle(const Command &command) {
 }
 
 Status GraphLoader::LoadModelFromData(uint32_t &model_id, const ModelData &model_data, void *dev_ptr,
-                                      size_t memsize, void *weight_ptr, size_t weightsize) {
+                                      size_t mem_size, void *weight_ptr, size_t weight_size) {
   GELOGI("Load model begin, model_id:%u.", model_id);
   // For ACL, Open Device from App.
   auto model_manager = ModelManager::GetInstance();
   GE_CHECK_NOTNULL(model_manager);
   Status ret = model_manager->LoadModelOffline(
-      model_id, model_data, nullptr, dev_ptr, memsize, weight_ptr, weightsize);
+      model_id, model_data, nullptr, dev_ptr, mem_size, weight_ptr, weight_size);
   if (ret != SUCCESS) {
     GELOGE(ret, "Load model failed, model_id:%u.", model_id);
     return ret;
@@ -283,7 +226,8 @@ Status GraphLoader::ExecuteModel(uint32_t model_id, rtStream_t stream, bool asyn
                                  std::vector<GeTensorDesc> &output_desc) {
   auto model_manager = ModelManager::GetInstance();
   GE_CHECK_NOTNULL(model_manager);
-  Status ret = model_manager->ExecuteModel(model_id, stream, async_mode, input_data, input_desc, output_data, output_desc);
+  Status ret = model_manager->ExecuteModel(model_id, stream, async_mode,
+                                           input_data, input_desc, output_data, output_desc);
   if (ret != SUCCESS) {
     GELOGE(ret, "Execute model failed, model_id:%u.", model_id);
     return ret;
@@ -319,10 +263,10 @@ Status GraphLoader::GetMemoryInfo(int64_t &free) {
   return SUCCESS;
 }
 
-Status GraphLoader::DestroyAicpuKernel(uint64_t session_id, uint32_t model_id) {
+Status GraphLoader::DestroyAicpuKernel(uint64_t session_id, uint32_t model_id, uint32_t sub_model_id) {
   auto model_manager = ModelManager::GetInstance();
   GE_CHECK_NOTNULL(model_manager);
-  Status ret = model_manager->DestroyAicpuKernel(session_id, model_id);
+  Status ret = model_manager->DestroyAicpuKernel(session_id, model_id, sub_model_id);
   if (ret != SUCCESS) {
     GELOGE(ret, "Destroy aicpu kernel failed.");
     return ret;
diff --git a/ge/graph/load/graph_loader.h b/ge/graph/load/graph_loader.h
index 974af5c1..3632a10a 100755
--- a/ge/graph/load/graph_loader.h
+++ b/ge/graph/load/graph_loader.h
@@ -44,12 +44,6 @@ class GraphLoader {
 
   static Status GetMaxUsedMemory(uint32_t model_id, uint64_t &max_size);
 
-  static Status LoadModel(const ModelData &model_data, const std::shared_ptr<ModelListener> &listener,
-                          uint32_t &model_id);
-
-  static Status LoadModelFromFile(const std::string &path, const std::string &key_path, int32_t priority,
-                                  const std::shared_ptr<ModelListener> &listener, uint32_t &model_id);
-
   static Status CommandHandle(const Command &command);
 
   static Status GetMemoryInfo(int64_t &free);
@@ -68,7 +62,7 @@ class GraphLoader {
                              const std::vector<GeTensorDesc> &input_desc, OutputData &output_data,
                              std::vector<GeTensorDesc> &output_desc);
 
-  static Status DestroyAicpuKernel(uint64_t session_id, uint32_t model_id);
+  static Status DestroyAicpuKernel(uint64_t session_id, uint32_t model_id, uint32_t sub_model_id);
 
   static Status DestroyAicpuSessionForInfer(uint32_t model_id);
 
diff --git a/ge/graph/load/new_model_manager/data_dumper.cc b/ge/graph/load/new_model_manager/data_dumper.cc
index 4534fe73..a12a2b2a 100644
--- a/ge/graph/load/new_model_manager/data_dumper.cc
+++ b/ge/graph/load/new_model_manager/data_dumper.cc
@@ -120,6 +120,7 @@ static int32_t GetIrDataType(ge::DataType data_type) {
       {ge::DT_RESOURCE, ge::proto::DT_RESOURCE},
       {ge::DT_STRING_REF, ge::proto::DT_STRING_REF},
       {ge::DT_STRING, ge::proto::DT_STRING},
+      {ge::DT_VARIANT, ge::proto::DT_VARIANT},
   };
 
   auto iter = data_type_map.find(data_type);
@@ -319,6 +320,9 @@ Status DataDumper::GenerateOutput(aicpu::dump::Output &output, const OpDesc::Vis
   for (auto dim : tensor_descs.at(index).GetShape().GetDims()) {
     output.mutable_shape()->add_dim(dim);
   }
+  for (auto dim : tensor_descs.at(index).GetOriginShape().GetDims()) {
+    output.mutable_origin_shape()->add_dim(dim);
+  }
   int64_t output_size = 0;
   if (TensorUtils::GetTensorSizeInBytes(tensor_descs.at(index), output_size) != SUCCESS) {
     GELOGE(PARAM_INVALID, "Get output size filed");
@@ -476,6 +480,9 @@ Status DataDumper::GenerateInput(aicpu::dump::Input &input, const OpDesc::Vistor
   for (auto dim : tensor_descs.at(index).GetShape().GetDims()) {
     input.mutable_shape()->add_dim(dim);
   }
+  for (auto dim : tensor_descs.at(index).GetOriginShape().GetDims()) {
+    input.mutable_origin_shape()->add_dim(dim);
+  }
   int64_t input_size = 0;
   if (AttrUtils::GetInt(tensor_descs.at(index), ATTR_NAME_INPUT_ORIGIN_SIZE, input_size)) {
     GELOGI("Get aipp input size according to attr is %ld", input_size);
@@ -823,6 +830,13 @@ Status DataDumper::UnloadDumpInfo() {
   return SUCCESS;
 }
 
+void DataDumper::DumpShrink() {
+  compute_graph_.reset();
+  input_map_.clear();
+  ref_info_.clear();
+  op_list_.clear();
+}
+
 void DataDumper::PrintCheckLog(string &dump_list_key) {
   std::set<std::string> model_list = dump_properties_.GetAllDumpModel();
   if (model_list.empty()) {
@@ -891,6 +905,7 @@ Status DataDumper::DumpExceptionInfo(const std::vector<rtExceptionInfo> exceptio
       toolkit::dumpdata::DumpData dump_data;
       dump_data.set_version("2.0");
       dump_data.set_dump_time(GetNowTime());
+      dump_data.set_op_name(op_desc_info.op_name);
       for (size_t i = 0; i < op_desc_info.input_format.size(); ++i) {
         toolkit::dumpdata::OpInput input;
         input.set_data_type(toolkit::dumpdata::OutputDataType(GetIrDataType(op_desc_info.input_data_type[i])));
@@ -919,11 +934,11 @@ Status DataDumper::DumpExceptionInfo(const std::vector<rtExceptionInfo> exceptio
       ReplaceStringElem(op_name);
       ReplaceStringElem(op_type);
       string dump_file_path =
-          "./" + op_type + "." + op_name + "." + to_string(op_desc_info.task_id) + "." + to_string(now_time);
+          "./" + op_type + "." + op_name + "." + std::to_string(op_desc_info.task_id) + "." + std::to_string(now_time);
       GELOGI("The exception dump file path is %s", dump_file_path.c_str());
 
       uint64_t proto_size = dump_data.ByteSizeLong();
-      unique_ptr<char[]> proto_msg(new (std::nothrow) char[proto_size]);
+      std::unique_ptr<char[]> proto_msg(new (std::nothrow) char[proto_size]);
       bool ret = dump_data.SerializeToArray(proto_msg.get(), proto_size);
       if (!ret || proto_size == 0) {
         GELOGE(PARAM_INVALID, "Dump data proto serialize failed");
diff --git a/ge/graph/load/new_model_manager/data_dumper.h b/ge/graph/load/new_model_manager/data_dumper.h
index 46ead310..8e612688 100755
--- a/ge/graph/load/new_model_manager/data_dumper.h
+++ b/ge/graph/load/new_model_manager/data_dumper.h
@@ -83,6 +83,8 @@ class DataDumper {
 
   Status UnloadDumpInfo();
 
+  void DumpShrink();
+
   void SetDumpProperties(const DumpProperties &dump_properties) { dump_properties_ = dump_properties; }
   const DumpProperties &GetDumpProperties() const { return dump_properties_; }
   bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const;
@@ -112,18 +114,18 @@ class DataDumper {
   struct InnerInputMapping;
 
   std::vector<OpDescInfo> op_desc_info_;
-  std::vector<InnerDumpInfo> op_list_;
+  std::vector<InnerDumpInfo> op_list_;  // release after DavinciModel::Init
   uint32_t end_graph_task_id_ = 0;
   uint32_t end_graph_stream_id_ = 0;
   bool is_end_graph_ = false;
-  std::multimap<std::string, InnerInputMapping> input_map_;
+  std::multimap<std::string, InnerInputMapping> input_map_;  // release after DavinciModel::Init
   bool load_flag_;
   uint32_t device_id_;
   uintptr_t global_step_;
   uintptr_t loop_per_iter_;
   uintptr_t loop_cond_;
-  ComputeGraphPtr compute_graph_;
-  std::map<OpDescPtr, void *> ref_info_;
+  ComputeGraphPtr compute_graph_;  // release after DavinciModel::Init
+  std::map<OpDescPtr, void *> ref_info_;  // release after DavinciModel::Init
   void *l1_fusion_addr_ = nullptr;
 
 
diff --git a/ge/graph/load/new_model_manager/davinci_model.cc b/ge/graph/load/new_model_manager/davinci_model.cc
index 81d47b3b..2afbdf30 100755
--- a/ge/graph/load/new_model_manager/davinci_model.cc
+++ b/ge/graph/load/new_model_manager/davinci_model.cc
@@ -16,7 +16,6 @@
 
 #include "graph/load/new_model_manager/davinci_model.h"
 
-#include <cce/dnn.h>
 #include <graph/utils/node_utils.h>
 #include <algorithm>
 #include <map>
@@ -76,7 +75,6 @@
 namespace ge {
 namespace {
 const uint32_t kDataIndex = 0;
-const uint32_t kOutputNum = 1;
 const uint32_t kTrueBranchStreamNum = 1;
 const uint32_t kGetDynamicDimsCount = 1;
 const uint32_t kThreadNum = 16;
@@ -84,10 +82,11 @@ const uint32_t kAddrLen = sizeof(void *);
 const int kDecimal = 10;
 const int kBytes = 8;
 const uint32_t kDataMemAlignSizeCompare = 64;
-const uint32_t kDumpL1FusionOpMByteSize = 2 * 1024 * 1024;
+const uint32_t kDumpL1FusionOpMByteSize = 2097152;   // 2 * 1024 * 1024
 const uint32_t kDumpFlagOfL1Fusion = 0;
 const char *const kDefaultBatchLable = "Batch_default";
 const char *const kGetDynamicDimsName = "ascend_mbatch_get_dynamic_dims_node";
+const char *const kMultiBatchNodePostfix = "_ascend_mbatch_batch_";
 const int32_t kInvalidStream = -1;
 const uint32_t kEndOfSequence = 0x0704000a;
 const uint32_t kEndOfSequenceNew = 507005;
@@ -97,6 +96,29 @@ const int32_t kModelAbortNormalNew = 507024;
 inline bool IsDataOp(const std::string &node_type) {
   return node_type == DATA_TYPE || node_type == AIPP_DATA_TYPE || node_type == ANN_DATA_TYPE;
 }
+
+inline bool IsTbeTask(const OpDescPtr &op_desc) {
+  uint32_t run_mode = static_cast<uint32_t>(domi::ImplyType::INVALID);
+  if (!AttrUtils::GetInt(op_desc, ATTR_NAME_IMPLY_TYPE, run_mode)) {
+    return false;
+  }
+
+  if (run_mode != static_cast<uint32_t>(domi::ImplyType::TVM)) {
+    return false;
+  }
+
+  // Skip no_task operator, such as concat and split.
+  bool attr_no_task = false;
+  bool get_attr_no_task_flag = AttrUtils::GetBool(op_desc, ATTR_NAME_NOTASK, attr_no_task);
+  if (get_attr_no_task_flag && attr_no_task) {
+    GELOGI("Node[name:%s, type:%s] does not generate task, skip initialization.",
+           op_desc->GetName().c_str(), op_desc->GetType().c_str());
+    return false;
+  }
+
+  return true;
+}
+
 inline bool IsNoTaskAndDumpNeeded(const OpDescPtr &op_desc) {
   bool save_dump_info = false;
   (void)ge::AttrUtils::GetBool(op_desc, ATTR_NO_TASK_AND_DUMP_NEEDED, save_dump_info);
@@ -109,6 +131,7 @@ std::mutex DavinciModel::tvm_bin_mutex_;
 DavinciModel::DavinciModel(int32_t priority, const std::shared_ptr<ModelListener> &listener)
     : weights_mem_base_(nullptr),
       var_mem_base_(nullptr),
+      fixed_mem_base_(0),
       mem_base_(nullptr),
       is_inner_mem_base_(false),
       is_inner_weight_base_(false),
@@ -140,6 +163,7 @@ DavinciModel::DavinciModel(int32_t priority, const std::shared_ptr<ModelListener
       is_l1_fusion_enable_(false),
       is_first_execute_(true) {
   op_list_.clear();
+  skt_info_ = {0, 0, 0, 0, nullptr, nullptr, {}, {}, {}, {}, {}, RT_KERNEL_DEFAULT, -1, 0, nullptr};
 }
 
 DavinciModel::~DavinciModel() {
@@ -149,20 +173,11 @@ DavinciModel::~DavinciModel() {
       GELOGW("UnloadDumpInfo failed, ret: %u.", ret);
     }
 
-    for (const auto &op_and_addr : saved_task_addrs_) {
-      auto addr = op_and_addr.second;
-      if (addr != nullptr) {
-        GE_CHK_RT(rtFree(addr));
-      }
-      addr = nullptr;
-    }
-    saved_task_addrs_.clear();
+    ClearTaskAddrs();
 
     GE_CHK_STATUS(ModelRunStop());
 
     op_list_.clear();
-    data_op_list_.clear();
-    output_op_list_.clear();
     tensor_name_to_fixed_addr_size_.clear();
     tensor_name_to_peer_output_index_.clear();
     GE_DELETE_NEW_SINGLE(data_inputer_);
@@ -221,6 +236,17 @@ DavinciModel::~DavinciModel() {
   }
 }
 
+void DavinciModel::ClearTaskAddrs() {
+  for (const auto &op_and_addr : saved_task_addrs_) {
+    auto addr = op_and_addr.second;
+    if (addr != nullptr) {
+      GE_CHK_RT(rtFree(addr));
+    }
+    addr = nullptr;
+  }
+  saved_task_addrs_.clear();
+}
+
 void DavinciModel::UnbindHcomStream() {
   if (!all_hccl_stream_list_.empty()) {
     for (size_t i = 0; i < all_hccl_stream_list_.size(); i++) {
@@ -262,7 +288,11 @@ Status DavinciModel::Assign(const GeModelPtr &ge_model) {
 /// @return: void
 ///
 void DavinciModel::Shrink() {
+  skt_info_ = {0, 0, 0, 0, nullptr, nullptr, {}, {}, {}, {}, {}, RT_KERNEL_DEFAULT, -1, 0, nullptr};
+  DumperShrink();
   ge_model_.reset();  // delete object.
+  op_list_.clear();
+  ClearTaskAddrs();
 }
 
 Status DavinciModel::InitWeightMem(void *dev_ptr, void *weight_ptr, size_t weight_size) {
@@ -290,8 +320,8 @@ Status DavinciModel::InitWeightMem(void *dev_ptr, void *weight_ptr, size_t weigh
     if (weight_ptr == nullptr) {
       weights_mem_base_ = MallocWeightsMem(weights_size);
       if (weights_mem_base_ == nullptr) {
-        GELOGE(GE_EXEC_ALLOC_WEIGHT_MEM_FAILED, "Alloc weight memory failed. size: %zu", weights_size);
-        return GE_EXEC_ALLOC_WEIGHT_MEM_FAILED;
+        GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Alloc weight memory failed. size: %zu", weights_size);
+        return ACL_ERROR_GE_MEMORY_ALLOCATION;
       }
       is_inner_weight_base_ = true;
     }
@@ -308,8 +338,8 @@ Status DavinciModel::InitWeightMem(void *dev_ptr, void *weight_ptr, size_t weigh
 
 Status DavinciModel::InitFeatureMapAndP2PMem(void *dev_ptr, size_t mem_size) {
   if (is_feature_map_mem_has_inited_) {
-    GELOGE(FAILED, "call InitFeatureMapMem more than once .");
-    return FAILED;
+    GELOGE(PARAM_INVALID, "call InitFeatureMapMem more than once.");
+    return PARAM_INVALID;
   }
   is_feature_map_mem_has_inited_ = true;
 
@@ -317,8 +347,8 @@ Status DavinciModel::InitFeatureMapAndP2PMem(void *dev_ptr, size_t mem_size) {
   std::size_t p2p_data_size = P2PMemInfos().at(RT_MEMORY_P2P_DDR).memory_size;
 
   if ((dev_ptr != nullptr) && (mem_size < TotalMemSize())) {
-    GELOGE(FAILED, "Invalid mem param: mem_size=%zu totalsize=%zu.", mem_size, TotalMemSize());
-    return FAILED;
+    GELOGE(PARAM_INVALID, "Invalid mem param: mem_size=%zu totalsize=%zu.", mem_size, TotalMemSize());
+    return PARAM_INVALID;
   }
 
   mem_base_ = static_cast<uint8_t *>(dev_ptr);
@@ -328,11 +358,11 @@ Status DavinciModel::InitFeatureMapAndP2PMem(void *dev_ptr, size_t mem_size) {
   if (TotalMemSize() && mem_base_ == nullptr) {
     mem_base_ = MallocFeatureMapMem(data_size);
     if (mem_base_ == nullptr) {
-      GELOGE(GE_EXEC_ALLOC_FEATURE_MAP_MEM_FAILED, "Alloc feature map memory failed. size: %zu", data_size);
-      return GE_EXEC_ALLOC_FEATURE_MAP_MEM_FAILED;
+      GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Alloc feature map memory failed. size: %zu", data_size);
+      return ACL_ERROR_GE_MEMORY_ALLOCATION;
     }
-    GEEVENT("[IMAS]InitFeatureMapAndP2PMem graph_%u MallocMemory type[F] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id,
-            mem_base_, data_size);
+    GEEVENT("[IMAS]InitFeatureMapAndP2PMem graph_%u MallocMemory type[F] memaddr[%p] mem_size[%zu]",
+            runtime_param_.graph_id, mem_base_, data_size);
 
     if (!is_inner_weight_base_) {
       weights_mem_base_ = mem_base_;
@@ -344,8 +374,8 @@ Status DavinciModel::InitFeatureMapAndP2PMem(void *dev_ptr, size_t mem_size) {
   if (p2p_data_size != 0) {
     p2p_mem_base_ = MallocP2PMem(p2p_data_size);
     if (p2p_mem_base_ == nullptr) {
-      GELOGE(GE_EXEC_ALLOC_P2P_MEM_FAILED, "Alloc p2p memory failed,size: %zu", p2p_data_size);
-      return GE_EXEC_ALLOC_P2P_MEM_FAILED;
+      GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Alloc p2p memory failed,size: %zu", p2p_data_size);
+      return ACL_ERROR_GE_MEMORY_ALLOCATION;
     }
     GELOGI("InitFeatureMapAndP2PMem graph_%u MallocMemory type[F] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id,
            p2p_mem_base_, p2p_data_size);
@@ -486,8 +516,12 @@ Status DavinciModel::DoTaskSink() {
 
   GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "Launch cust aicpu so failed.");
 
+  GE_CHK_STATUS_RET(ModelManager::GetInstance()->CheckAicpuOpList(ge_model_), "Check aicpu op type failed.");
+
   GE_CHK_STATUS_RET(InitEntryTask(), "InitEntryTask failed.");
 
+  GE_CHK_STATUS_RET(InitL1DataDumperArgs(), "InitL1DataDumperArgs failed.");
+
   GE_CHK_STATUS_RET(DistributeTask(), "Distribute failed.");
 
   GE_CHK_RT_RET(rtModelLoadComplete(rt_model_handle_));
@@ -513,7 +547,7 @@ Status DavinciModel::SetTSDevice() {
 Status DavinciModel::OpDebugRegister() {
   bool is_op_debug = false;
   (void)ge::AttrUtils::GetBool(ge_model_, ATTR_OP_DEBUG_FLAG, is_op_debug);
-  GELOGD("The value of op_debug in ge_model_ is %d.", is_op_debug);
+  GELOGD("The value of op debug in ge_model is %d.", is_op_debug);
   if (is_op_debug) {
     debug_reg_mutex_.lock();
     rtError_t rt_ret = rtMalloc(&op_debug_addr_, kOpDebugMemorySize, RT_MEMORY_DDR);
@@ -606,7 +640,7 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size
   version_ = ge_model_->GetVersion();
   name_ = ge_model_->GetName();
   (void)ge::AttrUtils::GetBool(ge_model_, ATTR_NAME_SWITCH_FOR_L1_FUSION, is_l1_fusion_enable_);
-  GELOGD("The value of ge.l1Fusion in ge_model_ is %d.", is_l1_fusion_enable_);
+  GELOGD("The value of ge.l1Fusion in ge_model is %d.", is_l1_fusion_enable_);
   CheckHasHcomOp();
 
   vector<int64_t> huge_stream_list;
@@ -667,6 +701,7 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size
     data_inputer_ = new (std::nothrow) DataInputer();
     GE_CHK_BOOL_RET_STATUS(data_inputer_ != nullptr, MEMALLOC_FAILED, "data_inputer_ is nullptr.");
   }
+  fixed_mem_base_ = reinterpret_cast<uintptr_t>(mem_base_);
   GE_TIMESTAMP_END(InitModelMem, "GraphLoader::InitModelMem");
 
   for (const ge::NodePtr &node : compute_graph->GetDirectNode()) {
@@ -679,24 +714,10 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size
 
   GE_CHK_STATUS_RET(InitNodes(compute_graph), "Init nodes failed");
 
-  SetDataDumperArgs(compute_graph);
   GE_TIMESTAMP_START(DoTaskSink);
-  auto ret = DoTaskSink();
+  GE_CHK_STATUS_RET(DoTaskSink(), "Task sink failed");
   GE_TIMESTAMP_END(DoTaskSink, "GraphLoader::DoTaskSink");
 
-  auto all_dump_model = GetDumpProperties().GetAllDumpModel();
-  bool findByOmName = all_dump_model.find(om_name_) != all_dump_model.end();
-  bool findByModelName = all_dump_model.find(name_) != all_dump_model.end();
-  bool dump_l1fusion_op = (all_dump_model.find(ge::DUMP_ALL_MODEL) != all_dump_model.end()) ||
-                          findByOmName || findByModelName;
-  if (dump_l1fusion_op) {
-    // malloc 2M for dump l1fusion op
-    GE_CHK_RT_RET(rtMalloc(&l1_fusion_addr_, kDumpL1FusionOpMByteSize, RT_MEMORY_DDR));
-
-    // send l1fusion dump addr to rts
-    GE_CHK_RT_RET(rtDumpAddrSet(rt_model_handle_, l1_fusion_addr_, kDumpL1FusionOpMByteSize, kDumpFlagOfL1Fusion));
-  }
-
   /// In zero copy model, if a aicpu operator is connected to the first or last layer, before model execution,
   /// the aicpu opertor needs to destroy history record, and update operator memory address.
   /// The model with specified aicpu operators is only marked here, and destruction is in ModelManager::ExecuteModel().
@@ -711,9 +732,10 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size
   }
 
   // collect profiling for ge
+  GE_CHK_STATUS_RET(InitModelProfile(), "Init model profile failed");
   auto &profiling_manager = ProfilingManager::Instance();
   if (profiling_manager.ProfilingModelLoadOn()) {
-    Status p_ret = ReportProfilingData(!profiling_manager.IsAclApiMode());
+    Status p_ret = ReportProfilingData();
     if (p_ret != SUCCESS) {
       GELOGE(p_ret, "Report profiling data failed.");
       return p_ret;
@@ -721,19 +743,18 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size
   }
 
   Shrink();
-  return ret;
+  return SUCCESS;
 }
 
-Status DavinciModel::ReportProfilingData(bool check_device) {
+Status DavinciModel::ReportProfilingData() {
   std::vector<ComputeGraphDescInfo> compute_graph_desc_info;
   Status ret = GetComputeGraphInfo(compute_graph_desc_info);
   if (ret != SUCCESS) {
     GELOGE(ret, "GetComputeGraphInfo failed.");
     return ret;
   }
-  ProfilingManager::Instance().ReportProfilingData(model_id_, GetTaskDescInfo(), compute_graph_desc_info, check_device);
+  ProfilingManager::Instance().ReportProfilingData(model_id_, GetTaskDescInfo(), compute_graph_desc_info);
   GE_CHK_STATUS(SinkModelProfile(), "Sink model profiler failed.");
-  op_list_.clear();
 
   return SUCCESS;
 }
@@ -815,7 +836,6 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) {
 
   typedef Status (DavinciModel::*OpDescCall)(const OpDescPtr &);
   static std::map<std::string, OpDescCall> op_desc_handle = {
-      {VARIABLE, &DavinciModel::InitVariable},
       {CONSTANTOP, &DavinciModel::InitConstant},
       {STREAMACTIVE, &DavinciModel::InitStreamActive},
       {STREAMSWITCH, &DavinciModel::InitStreamSwitch},
@@ -824,18 +844,15 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) {
       {CASE, &DavinciModel::InitCase},
   };
 
-  GE_CHK_STATUS_RET(InitInputOutputForDynamic(compute_graph), "InitInputOutputForDynamic failed.");
-
+  vector<OpDescPtr> output_op_list;
   map<uint32_t, OpDescPtr> data_by_index;
+  map<string, OpDescPtr> variable_by_name;
   auto nodes = compute_graph->GetAllNodes();
   const CustAICPUKernelStore &aicpu_kernel_store = ge_model_->GetCustAICPUKernelStore();
-  for (size_t i = 0; i < nodes.size(); i++) {
-    auto node = nodes.at(i);
-    auto op_desc = node->GetOpDesc();
-    if (op_desc == nullptr) {
-      GELOGE(PARAM_INVALID, "op_desc is null.");
-      return PARAM_INVALID;
-    }
+  for (size_t i = 0; i < nodes.size(); ++i) {
+    const auto &node = nodes.at(i);
+    const auto &op_desc = node->GetOpDesc();
+    GE_CHECK_NOTNULL(op_desc);
 
     op_list_[op_desc->GetId()] = op_desc;
 
@@ -844,7 +861,7 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) {
     GE_TIMESTAMP_ADD(LoadTBEKernelBinToOpDesc);
 
     if (IsDataOp(op_desc->GetType())) {
-      if (InitDataOp(node, data_op_index, data_by_index) != SUCCESS) {
+      if (InitDataOp(compute_graph, node, data_op_index, data_by_index) != SUCCESS) {
         GELOGE(PARAM_INVALID, "Data init failed, Name: %s", op_desc->GetName().c_str());
         return PARAM_INVALID;
       }
@@ -853,17 +870,29 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) {
     }
 
     if (op_desc->GetType() == NETOUTPUT) {
-      if (InitNetOutput(node) != SUCCESS) {
+      if (InitNetOutput(compute_graph, node, output_op_list) != SUCCESS) {
         GELOGE(PARAM_INVALID, "NetOutput init failed, Name: %s", op_desc->GetName().c_str());
         return PARAM_INVALID;
       }
+      if (InitRealSizeAndShapeInfo(compute_graph, node) != SUCCESS) {
+        GELOGE(PARAM_INVALID, "Init real size and shape failed, Name: %s", op_desc->GetName().c_str());
+        return PARAM_INVALID;
+      }
+      continue;
+    }
+
+    if (op_desc->GetType() == VARIABLE) {
+      if (InitVariable(op_desc, variable_by_name) != SUCCESS) {
+        GELOGE(PARAM_INVALID, "Variable init failed, Name: %s", op_desc->GetName().c_str());
+        return PARAM_INVALID;
+      }
       continue;
     }
 
     auto it = op_desc_handle.find(op_desc->GetType());
     if (it != op_desc_handle.end()) {
       if ((this->*it->second)(op_desc) != SUCCESS) {
-        GELOGE(PARAM_INVALID, "NetOutput init failed, Name: %s", op_desc->GetName().c_str());
+        GELOGE(PARAM_INVALID, "Node init failed, Name: %s", op_desc->GetName().c_str());
         return PARAM_INVALID;
       }
       continue;
@@ -894,17 +923,7 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) {
     }
 
     GE_TIMESTAMP_RESTART(InitTbeHandle);
-    uint32_t run_mode = static_cast<uint32_t>(domi::ImplyType::INVALID);
-    if (AttrUtils::GetInt(op_desc, ATTR_NAME_IMPLY_TYPE, run_mode) &&
-        run_mode == static_cast<uint32_t>(domi::ImplyType::TVM)) {
-      // Skip no_task operator, such as concat and split.
-      bool attr_notask = false;
-      bool get_attr_notask_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOTASK, attr_notask);
-      GE_IF_BOOL_EXEC(get_attr_notask_flag && attr_notask,
-                      GELOGI("Node[name:%s, type:%s] does not generate task, skip initialization.",
-                             op_desc->GetName().c_str(), op_desc->GetType().c_str());
-                      continue;);
-
+    if (IsTbeTask(op_desc)) {
       Status status = InitTbeHandle(op_desc);
       if (status != SUCCESS) {
         GELOGE(status, "TBE init failed. %s", op_desc->GetName().c_str());
@@ -913,33 +932,11 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) {
     }
     GE_TIMESTAMP_ADD(InitTbeHandle);
   }
-  AdjustDataOpList(data_by_index);
+
+  SetDataDumperArgs(compute_graph, variable_by_name);
   GE_TIMESTAMP_CALLNUM_END(LoadTBEKernelBinToOpDesc, "GraphLoader::LoadTBEKernelBinToOpDesc.");
   GE_TIMESTAMP_CALLNUM_END(InitTbeHandle, "GraphLoader::InitTbeHandle.");
-  return SUCCESS;
-}
-
-Status DavinciModel::InitInputOutputForDynamic(const ComputeGraphPtr &compute_graph) {
-  if (!known_node_) return SUCCESS;
-  // for dynamic shape
-  auto direct_nodes = compute_graph->GetDirectNode();
-  for (size_t i = 0; i < direct_nodes.size(); i++) {
-    auto node = direct_nodes.at(i);
-    auto op_desc = node->GetOpDesc();
-    if (op_desc == nullptr) {
-      GELOGE(PARAM_INVALID, "op_desc is null.");
-      return PARAM_INVALID;
-    }
-    if (IsDataOp(op_desc->GetType())) {
-      GELOGD("init data op %s", op_desc->GetName().c_str());
-      data_op_list_.push_back(op_desc);
-    }
-    if (op_desc->GetType() == NETOUTPUT) {
-      GELOGD("init netouput op %s", op_desc->GetName().c_str());
-      output_op_list_.push_back(op_desc);
-    }
-  }
-  return SUCCESS;
+  return GenInputOutputInfo(data_by_index, output_op_list);
 }
 
 void DavinciModel::SetLabelForDynamic(const NodePtr &node) {
@@ -957,24 +954,34 @@ void DavinciModel::SetLabelForDynamic(const NodePtr &node) {
   }
 }
 
+///
 /// @ingroup ge
 /// @brief Data Op Initialize.
+/// @param [in] ComputeGraphPtr: root graph of the model.
 /// @param [in] NodePtr: Data Op.
-/// @param [in/out] data_op_index: NetOutput addr size info.
+/// @param [in/out] data_op_index: index of courrent count.
+/// @param [in/out] data_by_index: Data ordered by index.
 /// @return Status
-Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index, map<uint32_t, OpDescPtr> &data_by_index) {
+///
+Status DavinciModel::InitDataOp(const ComputeGraphPtr &graph, const NodePtr &node, uint32_t &data_op_index,
+                                map<uint32_t, OpDescPtr> &data_by_index) {
   // op_desc Checked by Init: Data, valid.
   auto op_desc = node->GetOpDesc();
-  if (known_node_) {
+  if (node->GetOwnerComputeGraph() != graph) {
+    GELOGI("Skip subgraph Data node: %s.", op_desc->GetName().c_str());
     return SUCCESS;
   }
-  uint32_t parent_index = 0;  // Ignore subgraph Data Node.
-  if (AttrUtils::GetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) {
-    GELOGI("Init zero copy by subgraph Data node: %s.", op_desc->GetName().c_str());
-    return InitInputBatchLabel(node);
+
+  GELOGI("Init Data node: %s.", op_desc->GetName().c_str());
+  auto data_index = data_op_index++;
+  if (AttrUtils::GetInt(op_desc, ATTR_NAME_INDEX, data_index)) {
+    GELOGD("Get new index %u, old %u", data_index, data_op_index - 1);
   }
 
-  data_op_list_.push_back(op_desc);
+  data_by_index[data_index] = op_desc;
+  if (known_node_) {
+    return SUCCESS;
+  }
 
   // Make information for copy input data.
   const vector<int64_t> output_size_list = ModelUtils::GetOutputSize(op_desc);
@@ -986,10 +993,7 @@ Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index, ma
            op_desc->GetName().c_str(), output_size_list.size(), virtual_addr_list.size(), output_offset_list.size());
     return PARAM_INVALID;
   }
-  auto data_index = data_op_index;
-  if (AttrUtils::GetInt(op_desc, ATTR_NAME_INDEX, data_index)) {
-    GELOGD("ge_train: get new index %u, old %u", data_index, data_op_index);
-  }
+
   bool fusion_flag = false;
   ZeroCopyOffset zero_copy_offset;
   int64_t data_size = output_size_list[kDataIndex];
@@ -1000,7 +1004,6 @@ Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index, ma
     return PARAM_INVALID;
   }
   new_input_data_info_[data_index] = zero_copy_offset;
-  data_by_index[data_index] = op_desc;
 
   for (size_t index = 0; index < virtual_addr_list.size(); ++index) {
     void *addr = virtual_addr_list.at(index);
@@ -1011,11 +1014,6 @@ Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index, ma
     new_input_outside_addrs_[addr] = zero_copy_offset;
   }
 
-  data_op_index++;
-  if (InitInputZeroCopyNodes(node) != SUCCESS) {
-    GELOGE(PARAM_INVALID, "Input zero copy nodes init failed!");
-    return PARAM_INVALID;
-  }
   return SUCCESS;
 }
 
@@ -1023,51 +1021,50 @@ Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index, ma
 /// @ingroup ge
 /// @brief Sort Data op list by index.
 /// @param [in] data_by_index: map of Data Op.
-/// @return
+/// @param [in] output_op_list: list of NetOutput op.
+/// @return Status
 ///
-void DavinciModel::AdjustDataOpList(const map<uint32_t, OpDescPtr> &data_by_index) {
-  if (data_by_index.size() != data_op_list_.size()) {
-    GELOGW("Data map size: %zu, Data list size: %zu.", data_by_index.size(), data_op_list_.size());
-    return;
-  }
-
-  data_op_list_.clear();
+Status DavinciModel::GenInputOutputInfo(const map<uint32_t, OpDescPtr> &data_by_index,
+                                        const vector<OpDescPtr> &output_op_list) {
+  GELOGD("Data node size: %zu, NetOutput node size: %zu", data_by_index.size(), output_op_list.size());
   for (auto &item : data_by_index) {
-    data_op_list_.emplace_back(item.second);
-  }
-}
+    auto output_addrs = ModelUtils::GetOutputDataAddrs(runtime_param_, item.second);
+    GELOGD("Data node: %s, output addr size: %zu", item.second->GetName().c_str(), output_addrs.size());
+    input_addrs_list_.emplace_back(output_addrs);
 
-///
-/// @ingroup ge
-/// @brief input zero copy node Initialize.
-/// @param [in] NodePtr: Data Op.
-/// @return Status
-///
-Status DavinciModel::InitInputZeroCopyNodes(const NodePtr &node) {
-  auto out_data_anchor = node->GetOutDataAnchor(kDataIndex);
-  if (out_data_anchor == nullptr) {
-    GELOGE(FAILED, "Out data anchor is nullptr");
-    return FAILED;
+    GE_CHK_STATUS_RET(InitAippInfo(item.first, item.second), "Init AIPP Info failed");
+    GE_CHK_STATUS_RET(InitAippType(item.first, item.second, data_by_index), "Init AIPP Type failed");
+    GE_CHK_STATUS_RET(InitOrigInputInfo(item.first, item.second), "Init Orig input failed");
+    GE_CHK_STATUS_RET(InitAippInputOutputDims(item.first, item.second), "Init AIPP dims failed");
+    if (item.second->GetType() == AIPP_DATA_TYPE) {
+      GELOGI("This is dynamic aipp model, Node: %s", item.second->GetName().c_str());
+      is_dynamic_aipp_ = true;
+    }
   }
-  for (auto &peer_in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) {
-    auto node = peer_in_data_anchor->GetOwnerNode();
-    auto op_desc = node->GetOpDesc();
-    if (op_desc == nullptr) {
-      GELOGE(FAILED, "Op desc is nullptr");
-      return FAILED;
+
+  for (const auto &op_desc : output_op_list) {
+    auto input_addrs = ModelUtils::GetInputDataAddrs(runtime_param_, op_desc);
+    GELOGD("NetOutput node: %s, input addr size: %zu", op_desc->GetName().c_str(), input_addrs.size());
+    output_addrs_list_.emplace_back(input_addrs);
+
+    bool getnext_sink_dynamic = false;
+    if (AttrUtils::GetBool(op_desc, ATTR_GETNEXT_SINK_DYNMAIC, getnext_sink_dynamic) && getnext_sink_dynamic) {
+      GELOGI("ATTR_GETNEXT_SINK_DYNMAIC has been set and is true, node: %s", op_desc->GetName().c_str());
+      is_getnext_sink_dynamic_ = true;
     }
-    string batch_label;
-    (void)ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label);
-    if (batch_label.empty()) {
-      batch_label = kDefaultBatchLable;
+
+    vector<string> shape_info;
+    if (AttrUtils::GetListStr(op_desc, ATTR_NAME_DYNAMIC_OUTPUT_DIMS, shape_info)) {
+      dynamic_output_shape_info_.insert(dynamic_output_shape_info_.end(), shape_info.begin(), shape_info.end());
     }
-    if (zero_copy_op_id_batch_label_.find(op_desc->GetId()) == zero_copy_op_id_batch_label_.end()) {
-      zero_copy_op_id_batch_label_.emplace(pair<int64_t, string>(op_desc->GetId(), batch_label));
-      GELOGD("Init input zero copy nodes success, op name:%s, op id: %ld, batch label: %s.", op_desc->GetName().c_str(),
-             op_desc->GetId(), batch_label.c_str());
+
+    if (InitOutputTensorInfo(op_desc) != SUCCESS) {
+      return INTERNAL_ERROR;
     }
   }
-  return SUCCESS;
+
+  GE_CHK_STATUS_RET(InitInputDescInfo(data_by_index), "Init input desc info failed");
+  return InitOutputDescInfo(output_op_list);
 }
 
 bool DavinciModel::IsGetNextSinkDynamic(const OpDescPtr &op_desc) {
@@ -1081,24 +1078,27 @@ bool DavinciModel::IsGetNextSinkDynamic(const OpDescPtr &op_desc) {
 
 /// @ingroup ge
 /// @brief NetOutput Op Initialize.
+/// @param [in] ComputeGraphPtr: root graph of the model.
 /// @param [in] NodePtr: NetOutput Op.
+/// @param [in/out] vector<OpDescPtr>: All NetOutput node in model.
 /// @return Status
-Status DavinciModel::InitNetOutput(const NodePtr &node) {
+Status DavinciModel::InitNetOutput(const ComputeGraphPtr &graph, const NodePtr &node,
+                                   vector<OpDescPtr> &output_op_list) {
   // node->GetOpDesc Checked by Init: NetOutput, valid.
   auto op_desc = node->GetOpDesc();
   // excludes the function op sub graph, e.g. case,if
-  if (known_node_) {
+  if (node->GetOwnerComputeGraph() != graph) {
+    GELOGI("Skip subgraph NetOutput node: %s.", op_desc->GetName().c_str());
+    op_list_.erase(op_desc->GetId());
     return SUCCESS;
   }
-  ComputeGraphPtr owner_graph = node->GetOwnerComputeGraph();
-  GE_CHECK_NOTNULL(owner_graph);
-  if (owner_graph->GetParentGraph() != nullptr) {
-    GELOGI("Init zero copy by subgraph NetOutput node: %s.", op_desc->GetName().c_str());
-    op_list_.erase(op_desc->GetId());
-    return InitOutputBatchLabel(node);
+
+  GELOGI("Init NetOutput node: %s.", op_desc->GetName().c_str());
+  output_op_list.push_back(op_desc);
+  if (known_node_) {
+    return SUCCESS;
   }
 
-  output_op_list_.push_back(op_desc);
   // Make information for copy output data.
   const vector<int64_t> input_size_list = ModelUtils::GetInputSize(op_desc);
   const vector<void *> virtual_addr_list = ModelUtils::GetInputDataAddrs(runtime_param_, op_desc);
@@ -1146,18 +1146,24 @@ Status DavinciModel::InitNetOutput(const NodePtr &node) {
       real_virtual_addrs_.insert(real_addr);
     }
   }
+  return SUCCESS;
+}
 
-  GE_IF_BOOL_EXEC(InitOutputZeroCopyNodes(node) != SUCCESS,
-                  GELOGE(PARAM_INVALID, "Output zero copy nodes init failed!"); return PARAM_INVALID;);
+Status DavinciModel::InitRealSizeAndShapeInfo(const ComputeGraphPtr &compute_graph, const NodePtr &node) {
+  if (node->GetName().find(kMultiBatchNodePostfix) != string::npos) {
+    GELOGD("No need to get size and shape of netoutput in subgraph.");
+    return SUCCESS;
+  }
+  GELOGD("Start init real size and shape info of %s.", node->GetName().c_str());
   GetAllGearsInfo(node);
   if (is_getnext_sink_dynamic_) {
     GE_IF_BOOL_EXEC(GetGetDynamicDimsNodeInfo(node) != SUCCESS,
                     GELOGE(PARAM_INVALID, "Failed to get info of getdynamicdims node."); return PARAM_INVALID;);
   }
   if (is_online_infer_dynamic_) {
-    GE_IF_BOOL_EXEC(GetGearAndRealOutSizeInfo(input_count, node) != SUCCESS,
+    GE_IF_BOOL_EXEC(GetGearAndRealOutSizeInfo(compute_graph, node) != SUCCESS,
                     GELOGE(PARAM_INVALID, "Failed to get gear and real out size info."); return PARAM_INVALID;);
-    GE_IF_BOOL_EXEC(GetGearAndRealOutShapeInfo(input_count, op_desc) != SUCCESS,
+    GE_IF_BOOL_EXEC(GetGearAndRealOutShapeInfo(compute_graph, node) != SUCCESS,
                     GELOGE(PARAM_INVALID, "Failed to get gear and real out shape info."); return PARAM_INVALID;);
   }
 
@@ -1176,7 +1182,7 @@ void DavinciModel::GetAllGearsInfo(const NodePtr &node) {
       if (shape_str.empty()) {
         continue;
       }
-      std::vector<int64_t> gear_info;
+      std::vector<int32_t> gear_info;
       std::vector<std::string> dims = ge::StringUtils::Split(shape_str, ',');
       for (const auto &dim : dims) {
         if (dim.empty()) {
@@ -1192,6 +1198,7 @@ void DavinciModel::GetAllGearsInfo(const NodePtr &node) {
     }
   }
 }
+
 Status DavinciModel::GetGetDynamicDimsNodeInfo(const NodePtr &node) {
   GE_CHECK_NOTNULL(node->GetOpDesc());
   size_t input_count = node->GetAllInDataAnchors().size();
@@ -1229,11 +1236,11 @@ Status DavinciModel::GetGetDynamicDimsNodeInfo(const NodePtr &node) {
   return SUCCESS;
 }
 
-Status DavinciModel::GetGearAndRealOutSizeInfo(size_t input_count, const NodePtr &node) {
-  GELOGD("Start get gear and real output size info of %s, input count is %zu.", node->GetName().c_str(), input_count);
+Status DavinciModel::GetGearAndRealOutSizeInfo(const ComputeGraphPtr &graph, const NodePtr &node) {
+  GELOGD("Start get gear and real output size info of %s.", node->GetName().c_str());
   merge_nodes_gear_and_real_out_size_info_.clear();
-  for (size_t idx = 0; idx < input_count; ++idx) {
-    auto in_anchor = node->GetAllInDataAnchors().at(idx);
+  size_t idx = 0;
+  for (const auto &in_anchor : node->GetAllInDataAnchors()) {
     auto peer_out_anchor = in_anchor->GetPeerOutAnchor();
     if (peer_out_anchor == nullptr) {
       continue;
@@ -1241,89 +1248,106 @@ Status DavinciModel::GetGearAndRealOutSizeInfo(size_t input_count, const NodePtr
     auto peer_node = peer_out_anchor->GetOwnerNode();
     auto op_desc = peer_node->GetOpDesc();
     GE_CHECK_NOTNULL(op_desc);
-    if ((peer_node->GetType() == MERGE) && (op_desc->HasAttr(ATTR_INSERT_BY_MBATCH))) {
-      if (GetRealOutputSizeOfMerge(idx, peer_node) != SUCCESS) {
+    if ((peer_node->GetType() == CASE) && (op_desc->HasAttr(ATTR_INSERT_BY_MBATCH))) {
+      if (GetRealOutputSizeOfCase(graph, idx, peer_node) != SUCCESS) {
         GELOGE(PARAM_INVALID, "Get real output size of %s failed.", peer_node->GetName().c_str());
         return PARAM_INVALID;
       }
     }
+    idx++;
   }
   return SUCCESS;
 }
 
-Status DavinciModel::GetRealOutputSizeOfMerge(size_t input_index, const NodePtr &merge_node) {
-  GELOGD("Start get output size of %s, which is %zu input to netoutput.", merge_node->GetName().c_str(), input_index);
-  std::map<vector<int64_t>, int64_t> gear_and_real_out_size_info;
-  for (auto &in_anchor : merge_node->GetAllInDataAnchors()) {
-    auto peer_out_anchor = in_anchor->GetPeerOutAnchor();
-    if (peer_out_anchor == nullptr) {
-      continue;
-    }
-    auto in_node = peer_out_anchor->GetOwnerNode();
-    GELOGD("Input node of merge is %s.", in_node->GetName().c_str());
-    auto op_desc = in_node->GetOpDesc();
-    GE_CHECK_NOTNULL(op_desc);
-    string batch_label;
-    if (AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label)) {
-      size_t batch_index = static_cast<size_t>(stoi(batch_label.substr(batch_label.rfind('_') + 1)));
-      GELOGD("Batch index of %s is %zu.", op_desc->GetName().c_str(), batch_index);
-      if (batch_index > all_gears_info_.size()) {
-        GELOGE(PARAM_INVALID, "The value of ATTR_NAME_BATCH_LABEL is invalid.");
-        return PARAM_INVALID;
-      }
+Status DavinciModel::GetRealOutputSizeOfCase(const ComputeGraphPtr &graph, size_t input_index,
+                                             const NodePtr &case_node) {
+  GELOGD("Start get output size of %s, which is %zu input to netoutput.", case_node->GetName().c_str(), input_index);
+  const auto &func_desc = case_node->GetOpDesc();
+  GE_CHECK_NOTNULL(func_desc);
+  std::map<vector<int32_t>, int64_t> gear_and_real_out_size_info;
+  for (const auto &name : func_desc->GetSubgraphInstanceNames()) {
+    const auto &subgraph = graph->GetSubgraph(name);
+    if (subgraph == nullptr) {
+      GELOGE(GE_GRAPH_EMPTY_SUBGRAPH, "Subgraph not found, name: %s.", name.c_str());
+      return GE_GRAPH_EMPTY_SUBGRAPH;
+    }
+    for (auto &node : subgraph->GetDirectNode()) {
+      if (node->GetType() == NETOUTPUT) {
+        auto op_desc = node->GetOpDesc();
+        GE_CHECK_NOTNULL(op_desc);
+        string batch_label;
+        if (AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label)) {
+          size_t batch_index = static_cast<size_t>(stoi(batch_label.substr(batch_label.rfind('_') + 1)));
+          GELOGD("Batch index of %s is %zu.", op_desc->GetName().c_str(), batch_index);
+          if (batch_index > all_gears_info_.size()) {
+            GELOGE(PARAM_INVALID, "The value of ATTR_NAME_BATCH_LABEL is invalid.");
+            return PARAM_INVALID;
+          }
 
-      const vector<int64_t> output_size_list = ModelUtils::GetOutputSize(op_desc);
-      int output_index = ge::AnchorUtils::GetIdx(peer_out_anchor);
-      auto tensor_desc = op_desc->GetOutputDescPtr(output_index);
-      GE_CHECK_NOTNULL(tensor_desc);
-      int64_t data_size = 0;
-      if (TensorUtils::GetTensorSizeInBytes(*tensor_desc, data_size) != GRAPH_SUCCESS) {
-        GELOGE(FAILED, "Get tensor size in bytes failed.");
-        return FAILED;
+          const vector<int64_t> input_size_list = ModelUtils::GetInputSize(op_desc);
+          auto tensor_desc = op_desc->GetInputDescPtr(input_index);
+          GE_CHECK_NOTNULL(tensor_desc);
+          int64_t data_size = 0;
+          if (TensorUtils::GetTensorSizeInBytes(*tensor_desc, data_size) != GRAPH_SUCCESS) {
+            GELOGE(FAILED, "Get tensor size in bytes failed.");
+            return FAILED;
+          }
+          gear_and_real_out_size_info[all_gears_info_[batch_index]] = data_size;
+          GELOGD("Get real gear index is: %zu, gear info is %s, size is %ld, tensor size is %ld",
+                 batch_index, formats::JoinToString(all_gears_info_[batch_index]).c_str(),
+                 input_size_list[input_index], data_size);
+        }
+        break;
       }
-      gear_and_real_out_size_info[all_gears_info_[batch_index]] = data_size;
-      GELOGD("Get real gear index is: %zu, gear info is %s, size is %ld, tensor size is %ld",
-             batch_index, formats::JoinToString(all_gears_info_[batch_index]).c_str(),
-             output_size_list[output_index], data_size);
     }
   }
   merge_nodes_gear_and_real_out_size_info_[input_index] = gear_and_real_out_size_info;
   return SUCCESS;
 }
 
-Status DavinciModel::GetGearAndRealOutShapeInfo(size_t input_count, const OpDescPtr &op_desc) {
-  GELOGD("Start to get dynamic output dims of %s.", op_desc->GetName().c_str());
+Status DavinciModel::GetGearAndRealOutShapeInfo(const ComputeGraphPtr &graph, const NodePtr &node) {
+  GELOGD("Start to get dynamic output dims of %s.", node->GetName().c_str());
   merge_nodes_gear_and_real_out_shape_info_.clear();
-  std::vector<std::string> dynamic_output_shape_info;
-  if (!AttrUtils::GetListStr(op_desc, ATTR_NAME_DYNAMIC_OUTPUT_DIMS, dynamic_output_shape_info)) {
-    GELOGD("Can not get dynamic output dims attr");
-    return SUCCESS;
-  }
-  GELOGI("Dynamic output shape info is %s", formats::JoinToString(dynamic_output_shape_info).c_str());
-  std::vector<vector<int64_t>> dynamic_output_shape;
-  ParseDynamicOutShape(dynamic_output_shape_info, dynamic_output_shape);
-  // idx: input_index to netoutput
-  for (size_t idx = 0; idx < input_count; ++idx) {
-    std::map<vector<int64_t>, vector<int64_t>> gear_and_real_out_shape_info;
-    for (auto &it : dynamic_output_shape) {
-      auto gear_index = static_cast<size_t>(it[0]);
-      if (gear_index > all_gears_info_.size()) {
-        GELOGE(PARAM_INVALID, "The value of cur index: %zu is invalid.", static_cast<size_t>(it[0]));
-        return PARAM_INVALID;
+  size_t idx = 0;
+  for (const auto &in_anchor : node->GetAllInDataAnchors()) {
+    auto peer_out_anchor = in_anchor->GetPeerOutAnchor();
+    if (peer_out_anchor == nullptr) {
+      continue;
+    }
+    auto peer_node = peer_out_anchor->GetOwnerNode();
+    auto op_desc = peer_node->GetOpDesc();
+    GE_CHECK_NOTNULL(op_desc);
+    if ((peer_node->GetType() == CASE) && (op_desc->HasAttr(ATTR_INSERT_BY_MBATCH))) {
+      std::vector<std::string> dynamic_output_shape_info;
+      if (!AttrUtils::GetListStr(node->GetOpDesc(), ATTR_NAME_DYNAMIC_OUTPUT_DIMS, dynamic_output_shape_info)) {
+        GELOGD("Can not get dynamic output dims attr from %s.", node->GetName().c_str());
+        return SUCCESS;
       }
+      GELOGI("Dynamic output shape info is %s", formats::JoinToString(dynamic_output_shape_info).c_str());
+      std::vector<vector<int64_t>> dynamic_output_shape;
+      ParseDynamicOutShape(dynamic_output_shape_info, dynamic_output_shape);
+      std::map<vector<int32_t>, vector<int64_t>> gear_and_real_out_shape_info;
+      for (auto &it : dynamic_output_shape) {
+        auto gear_index = static_cast<size_t>(it[0]);
+        if (gear_index > all_gears_info_.size()) {
+          GELOGE(PARAM_INVALID, "The value of cur index: %zu is invalid.", static_cast<size_t>(it[0]));
+          return PARAM_INVALID;
+        }
 
-      if (static_cast<size_t>(it[1]) == idx) {
-        vector<int64_t> output_shape;
-        for (size_t i = 2; i < it.size(); ++i) {
-          output_shape.emplace_back(it[i]);
+        if (static_cast<size_t>(it[1]) == idx) {
+          vector<int64_t> output_shape;
+          for (size_t i = 2; i < it.size(); ++i) {
+            output_shape.emplace_back(it[i]);
+          }
+          gear_and_real_out_shape_info[all_gears_info_[gear_index]] = output_shape;
+          GELOGD("Get real gear index is: %zu, gear info is %s, output shape is %s.",
+                 gear_index, formats::JoinToString(all_gears_info_[gear_index]).c_str(),
+                 formats::JoinToString(output_shape).c_str());
         }
-        gear_and_real_out_shape_info[all_gears_info_[gear_index]] = output_shape;
-        GELOGD("Get real gear index is: %zu, gear info is %s, output shape is %s.",
-               gear_index, formats::JoinToString(all_gears_info_[gear_index]).c_str(),
-               formats::JoinToString(output_shape).c_str());
       }
+      merge_nodes_gear_and_real_out_shape_info_[idx] = gear_and_real_out_shape_info;
     }
-    merge_nodes_gear_and_real_out_shape_info_[idx] = gear_and_real_out_shape_info;
+    idx++;
   }
   return SUCCESS;
 }
@@ -1344,121 +1368,6 @@ void DavinciModel::ParseDynamicOutShape(const std::vector<std::string> &str_info
   }
 }
 
-///
-/// @ingroup ge
-/// @brief output zero copy node Initialize.
-/// @param [in] NodePtr: netoutput Op.
-/// @return Status
-///
-Status DavinciModel::InitOutputZeroCopyNodes(const NodePtr &node) {
-  set<NodePtr> nodes_need_record;
-  for (auto &in_data_anchor : node->GetAllInDataAnchors()) {
-    auto peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor();
-    if (peer_out_data_anchor == nullptr) {
-      continue;
-    }
-    auto peer_node = peer_out_data_anchor->GetOwnerNode();
-    nodes_need_record.emplace(peer_node);
-
-    // Merge node output multiplexed input, upstream nodes need to be considered in multiple batch scenarios
-    if (peer_node->GetType() == MERGE) {
-      for (const auto &merge_peer_in_data_anchor : peer_node->GetAllInDataAnchors()) {
-        auto merge_peer_out_data_anchor = merge_peer_in_data_anchor->GetPeerOutAnchor();
-        if (merge_peer_out_data_anchor == nullptr) {
-          continue;
-        }
-        auto merge_peer_node = merge_peer_out_data_anchor->GetOwnerNode();
-        nodes_need_record.emplace(merge_peer_node);
-      }
-    } else {
-      for (const auto &other_in_data_anchor : peer_out_data_anchor->GetPeerInDataAnchors()) {
-        auto other_in_node = other_in_data_anchor->GetOwnerNode();
-        if (other_in_node->GetType() != NETOUTPUT) {
-          nodes_need_record.emplace(other_in_node);
-        }
-      }
-    }
-  }
-
-  for (const auto &node_need_record : nodes_need_record) {
-    auto op_desc = node_need_record->GetOpDesc();
-    GE_CHECK_NOTNULL(op_desc);
-    string batch_label;
-    (void)ge::AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label);
-    if (batch_label.empty()) {
-      batch_label = kDefaultBatchLable;
-    }
-    if (zero_copy_op_id_batch_label_.find(op_desc->GetId()) == zero_copy_op_id_batch_label_.end()) {
-      zero_copy_op_id_batch_label_.emplace(pair<int64_t, string>(op_desc->GetId(), batch_label));
-      GELOGD("Init Output zero copy nodes success, op name:%s, op id: %ld, batch label: %s.",
-             op_desc->GetName().c_str(), op_desc->GetId(), batch_label.c_str());
-    }
-  }
-  return SUCCESS;
-}
-
-///
-/// @ingroup ge
-/// @brief input zero copy node Initialize.
-/// @param [in] NodePtr: Data Op.
-/// @return Status
-///
-Status DavinciModel::InitInputBatchLabel(const NodePtr &node) {
-  string batch_label;
-  if (!AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, batch_label)) {
-    return SUCCESS;  // Not Multi-batch.
-  }
-
-  const auto &out_data_anchor = node->GetOutDataAnchor(kDataIndex);
-  GE_CHECK_NOTNULL(out_data_anchor);
-
-  for (const auto &peer_in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) {
-    const auto &node = peer_in_data_anchor->GetOwnerNode();
-    const auto &op_desc = node->GetOpDesc();
-    GE_CHECK_NOTNULL(op_desc);
-
-    if (zero_copy_op_id_batch_label_.find(op_desc->GetId()) == zero_copy_op_id_batch_label_.end()) {
-      zero_copy_op_id_batch_label_[op_desc->GetId()] = batch_label;
-      GELOGD("Init input zero copy nodes success, op name: %s, op id: %ld, batch label: %s", op_desc->GetName().c_str(),
-             op_desc->GetId(), batch_label.c_str());
-    }
-  }
-
-  return SUCCESS;
-}
-
-///
-/// @ingroup ge
-/// @brief output zero copy node Initialize for Case.
-/// @param [in] NodePtr: netoutput Op.
-/// @return Status
-///
-Status DavinciModel::InitOutputBatchLabel(const NodePtr &node) {
-  string batch_label;
-  if (!AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, batch_label)) {
-    return SUCCESS;  // Not Multi-batch.
-  }
-
-  for (const auto &in_data_anchor : node->GetAllInDataAnchors()) {
-    const auto &peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor();
-    if (peer_out_data_anchor == nullptr) {
-      continue;
-    }
-
-    const auto &peer_node = peer_out_data_anchor->GetOwnerNode();
-    const auto &op_desc = peer_node->GetOpDesc();
-    GE_CHECK_NOTNULL(op_desc);
-
-    if (zero_copy_op_id_batch_label_.find(op_desc->GetId()) == zero_copy_op_id_batch_label_.end()) {
-      zero_copy_op_id_batch_label_[op_desc->GetId()] = batch_label;
-      GELOGD("Init Output zero copy nodes success, op name: %s, op id: %ld, batch label: %s",
-             op_desc->GetName().c_str(), op_desc->GetId(), batch_label.c_str());
-    }
-  }
-
-  return SUCCESS;
-}
-
 /// @ingroup ge
 /// @brief LabelSet Op Initialize.
 /// @param [in] op_desc: LabelSet Op descriptor.
@@ -1503,8 +1412,23 @@ Status DavinciModel::InitLabelSet(const OpDescPtr &op_desc) {
   return SUCCESS;
 }
 
-Status DavinciModel::InitVariable(const OpDescPtr &op_desc) {
-  variable_op_list_.push_back(op_desc);
+Status DavinciModel::InitVariable(const OpDescPtr &op_desc, map<string, OpDescPtr> &variable_by_name) {
+  if (op_desc->GetName() == NODE_NAME_GLOBAL_STEP) {
+    const auto output_sizes = ModelUtils::GetOutputSize(op_desc);
+    if (!output_sizes.empty()) {
+      global_step_size_ = output_sizes[0];
+    }
+    const auto output_addrs = ModelUtils::GetOutputDataAddrs(runtime_param_, op_desc);
+    if (!output_addrs.empty()) {
+      global_step_addr_ = output_addrs[0];
+    }
+  }
+
+  if (op_desc->HasAttr(VAR_ATTR_VAR_IS_BROADCAST)) {
+    broadcast_variable_[op_desc->GetName()] = op_desc->GetOutputDesc(0);
+  }
+
+  variable_by_name[op_desc->GetName()] = op_desc;
   return SUCCESS;
 }
 
@@ -1544,7 +1468,8 @@ Status DavinciModel::LoadWithQueue() {
   }
 
   if (output_queue_ids_.size() != new_output_data_info_.size()) {
-    GELOGE(ACL_ERROR_GE_EXEC_MODEL_QUEUE_ID_INVALID, "Output queue ids not match model: output_queue=%zu output_data=%zu",
+    GELOGE(ACL_ERROR_GE_EXEC_MODEL_QUEUE_ID_INVALID,
+           "Output queue ids not match model: output_queue=%zu output_data=%zu",
            output_queue_ids_.size(), new_output_data_info_.size());
     return ACL_ERROR_GE_EXEC_MODEL_QUEUE_ID_INVALID;
   }
@@ -1812,32 +1737,30 @@ Status DavinciModel::CpuModelRepeat() {
 
 Status DavinciModel::GetInputOutputDescInfo(vector<InputOutputDescInfo> &input_desc,
                                             vector<InputOutputDescInfo> &output_desc) {
-  if ((data_op_list_.empty()) || (data_op_list_[0]->GetInputsSize()) != 1) {
+  if (input_addrs_list_.empty() || input_addrs_list_[0].size() != 1) {
     GELOGI("data_op_list_ is empty or input_desc size is not 1.");
   } else {
-    std::vector<uint32_t> input_formats;
-    GE_CHK_STATUS_RET(GetInputDescInfo(input_desc, input_formats), "get input desc info failed.");
+    vector<uint32_t> input_formats;
+    GE_CHK_STATUS_RET(GetInputDescInfo(input_desc, input_formats, false), "get input desc info failed.");
   }
 
-  std::vector<uint32_t> outputFormats;
-  GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, outputFormats), "get output desc info failed.");
-
+  vector<uint32_t> output_formats;
+  GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, output_formats), "get output desc info failed");
   return SUCCESS;
 }
 
 Status DavinciModel::GetInputOutputDescInfo(vector<InputOutputDescInfo> &input_desc,
                                             vector<InputOutputDescInfo> &output_desc,
-                                            std::vector<uint32_t> &input_formats,
-                                            std::vector<uint32_t> &outputFormats) {
-  if ((data_op_list_.empty()) || (data_op_list_[0]->GetInputsSize()) != 1) {
+                                            vector<uint32_t> &input_formats,
+                                            vector<uint32_t> &output_formats, bool by_dims) {
+  if (input_addrs_list_.empty() || input_addrs_list_[0].size() != 1) {
     GELOGE(FAILED, "OP List Pointer is null or input_desc size is not 1!");
     return FAILED;
   }
 
-  GE_CHK_STATUS_RET(GetInputDescInfo(input_desc, input_formats), "get input desc info failed");
-
-  GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, outputFormats), "get ouput desc info failed");
+  GE_CHK_STATUS_RET(GetInputDescInfo(input_desc, input_formats, by_dims), "get input desc info failed");
 
+  GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, output_formats), "get output desc info failed");
   return SUCCESS;
 }
 
@@ -1881,73 +1804,104 @@ void DavinciModel::GetUserDesignateShapeOrder(std::vector<std::string> &user_inp
 /// @ingroup ge
 /// @brief Get AIPP input info
 /// @param [in] index
-/// @param [out] aipp_info
+/// @param [int] OpDescPtr
 /// @return execute result
 ///
-Status DavinciModel::GetAIPPInfo(uint32_t index, AippConfigInfo &aipp_info) {
-  GE_CHK_BOOL_RET_STATUS(index < data_op_list_.size(), PARAM_INVALID, "Index %u is invalid.", index);
-  OpDescPtr data_op = data_op_list_[index];
-  if (!data_op->HasAttr(ATTR_NAME_AIPP)) {
-    GELOGW("GetAIPPInfo: there is not AIPP related with index %u.", index);
-    return ACL_ERROR_GE_AIPP_NOT_EXIST;
+Status DavinciModel::InitAippInfo(uint32_t index, const OpDescPtr &op_desc) {
+  if (!op_desc->HasAttr(ATTR_NAME_AIPP)) {
+    GELOGW("There is not AIPP related with index %u.", index);
+    return SUCCESS;
   }
 
-  std::unique_ptr<domi::AippOpParams> aipp_params(new (std::nothrow) domi::AippOpParams());
-  GE_CHECK_NOTNULL(aipp_params);
-
-  ge::GeAttrValue::NAMED_ATTRS aipp_attr;
-  GE_CHK_BOOL_RET_STATUS(AttrUtils::GetNamedAttrs(data_op, ATTR_NAME_AIPP, aipp_attr), GE_AIPP_NOT_EXIST,
+  domi::AippOpParams aipp_params;
+  GeAttrValue::NAMED_ATTRS aipp_attr;
+  GE_CHK_BOOL_RET_STATUS(AttrUtils::GetNamedAttrs(op_desc, ATTR_NAME_AIPP, aipp_attr), GE_AIPP_NOT_EXIST,
                          "Data node do not contain param aipp!");
-  GE_CHK_STATUS_RET(OpUtils::ConvertAippParams(aipp_attr, aipp_params.get()), "get aipp params failed");
-  GELOGI("GetAIPPInfo: node data: %s, type: %s, current index: %u, current node related input rank: %u",
-         data_op->GetName().c_str(), data_op->GetType().c_str(), index, aipp_params->related_input_rank());
+  GE_CHK_STATUS_RET(OpUtils::ConvertAippParams(aipp_attr, &aipp_params), "get aipp params failed");
+  GELOGI("Node data: %s, type: %s, current index: %u, current node related input rank: %u",
+         op_desc->GetName().c_str(), op_desc->GetType().c_str(), index, aipp_params.related_input_rank());
 
-  GE_CHK_STATUS_RET(AippUtils::ConvertAippParams2AippInfo(aipp_params.get(), aipp_info),
+  AippConfigInfo aipp_info;
+  GE_CHK_STATUS_RET(AippUtils::ConvertAippParams2AippInfo(&aipp_params, aipp_info),
                     "convert aipp params to aipp config info failed");
 
+  aipp_info_list_[index] = aipp_info;
   return SUCCESS;
 }
 
-Status DavinciModel::GetAippType(uint32_t index, InputAippType &type, size_t &aipp_index) {
-  GE_CHK_BOOL_RET_STATUS(index < data_op_list_.size(), PARAM_INVALID, "Index %u is invalid.", index);
-  // Set default value
-  type = DATA_WITHOUT_AIPP;
-  aipp_index = 0xFFFFFFFF;  // default invalid value
-  OpDescPtr data_op = data_op_list_[index];
-  GE_CHECK_NOTNULL(data_op);
-  if (!data_op->HasAttr(ATTR_DATA_RELATED_AIPP_MODE)) {
+///
+/// @ingroup ge
+/// @brief Get AIPP input info
+/// @param [in] index
+/// @param [out] aipp_info
+/// @return execute result
+///
+Status DavinciModel::GetAippInfo(uint32_t index, AippConfigInfo &aipp_info) const {
+  const auto it = aipp_info_list_.find(index);
+  if (it == aipp_info_list_.end()) {
+    GELOGW("there is not AIPP related with index %u.", index);
+    return ACL_ERROR_GE_AIPP_NOT_EXIST;
+  }
+
+  aipp_info = it->second;
+  return SUCCESS;
+}
+
+Status DavinciModel::InitAippType(uint32_t index, const OpDescPtr &op_desc, const map<uint32_t, OpDescPtr> &data_list) {
+  if (!op_desc->HasAttr(ATTR_DATA_RELATED_AIPP_MODE)) {
     GELOGW("There is no aipp releated info with index %u.", index);
     return SUCCESS;
   }
-  std::string data_mode;
-  (void)AttrUtils::GetStr(data_op, ATTR_DATA_RELATED_AIPP_MODE, data_mode);
+
+  // Set default value
+  InputAippType aipp_type = DATA_WITHOUT_AIPP;
+  string data_mode;
+  (void)AttrUtils::GetStr(op_desc, ATTR_DATA_RELATED_AIPP_MODE, data_mode);
   if (data_mode == "static_aipp") {
-    type = DATA_WITH_STATIC_AIPP;
+    aipp_type = DATA_WITH_STATIC_AIPP;
   } else if (data_mode == "dynamic_aipp") {
-    type = DATA_WITH_DYNAMIC_AIPP;
+    aipp_type = DATA_WITH_DYNAMIC_AIPP;
   } else if (data_mode == "dynamic_aipp_conf") {
-    type = DYNAMIC_AIPP_NODE;
+    aipp_type = DYNAMIC_AIPP_NODE;
   } else {
     GELOGE(ACL_ERROR_GE_AIPP_MODE_INVALID,
            "The info of aipp releated info %s is invalid with index %u.", data_mode.c_str(), index);
     return ACL_ERROR_GE_AIPP_MODE_INVALID;
   }
 
-  if (type == DATA_WITH_DYNAMIC_AIPP) {
+  size_t aipp_index = 0xFFFFFFFF;  // default invalid value
+  if (aipp_type == DATA_WITH_DYNAMIC_AIPP) {
     string releated_name;
-    (void)AttrUtils::GetStr(data_op, ATTR_DATA_AIPP_DATA_NAME_MAP, releated_name);
-    for (size_t i = 0; i < data_op_list_.size(); ++i) {
-      GE_CHECK_NOTNULL(data_op_list_[i]);
-      if (data_op_list_[i]->GetName() == releated_name) {
-        GELOGI("Find aipp_data [%s] index %zu from index %u", releated_name.c_str(), i, index);
-        aipp_index = i;
+    (void)AttrUtils::GetStr(op_desc, ATTR_DATA_AIPP_DATA_NAME_MAP, releated_name);
+    for (const auto item : data_list) {
+      if (item.second->GetName() == releated_name) {
+        GELOGI("Find aipp_data [%s] index %zu from index %u", releated_name.c_str(), item.first, index);
+        aipp_index = item.first;
       }
     }
+
     if (aipp_index == 0xFFFFFFFF) {
-      GELOGE(ACL_ERROR_GE_AIPP_NOT_EXIST, "Can not find aipp data node from index %u", index);
-      return ACL_ERROR_GE_AIPP_NOT_EXIST;
+      GELOGW("Can not find aipp data node from index %u", index);
+      return SUCCESS;
     }
   }
+
+  aipp_type_list_[index] = { aipp_type, aipp_index };
+  return SUCCESS;
+}
+
+Status DavinciModel::GetAippType(uint32_t index, InputAippType &aipp_type, size_t &aipp_index) const {
+  GE_CHK_BOOL_RET_STATUS(index < input_addrs_list_.size(), PARAM_INVALID, "Index %u is invalid", index);
+  const auto it = aipp_type_list_.find(index);
+  if (it == aipp_type_list_.end()) {
+    GELOGW("There is no aipp releated info with index %u.", index);
+    aipp_type = DATA_WITHOUT_AIPP;
+    aipp_index = 0xFFFFFFFF;
+    return SUCCESS;
+  }
+
+  aipp_type = it->second.first;
+  aipp_index = it->second.second;
   return SUCCESS;
 }
 
@@ -1963,7 +1917,7 @@ void DavinciModel::SetDynamicSize(const std::vector<uint64_t> &batch_num, int32_
   dynamic_type_ = dynamic_type;
 }
 
-void DavinciModel::GetCurShape(std::vector<int64_t> &batch_info, int32_t &dynamic_type) {
+void DavinciModel::GetCurShape(std::vector<int64_t> &batch_info, int32_t &dynamic_type) const {
   if (batch_size_.empty()) {
     GELOGD("User does not set dynamic size");
   }
@@ -1975,122 +1929,95 @@ void DavinciModel::GetCurShape(std::vector<int64_t> &batch_info, int32_t &dynami
   dynamic_type = dynamic_type_;
 }
 
-void DavinciModel::GetModelAttr(std::vector<std::string> &dynamic_output_shape_info) {
-  for (auto &op : output_op_list_) {
-    if (op->GetType() != NETOUTPUT) {
-      continue;
-    }
-    if (!AttrUtils::GetListStr(op, ATTR_NAME_DYNAMIC_OUTPUT_DIMS, dynamic_output_shape_info)) {
-      GELOGD("Can not get dynamic output dims attr");
-    }
-  }
+void DavinciModel::GetModelAttr(vector<string> &out_shape_info) const {
+  out_shape_info.insert(out_shape_info.end(), dynamic_output_shape_info_.begin(), dynamic_output_shape_info_.end());
 }
 
-Status DavinciModel::GetInputOutputDescInfoForZeroCopy(vector<InputOutputDescInfo> &input_desc,
-                                                       vector<InputOutputDescInfo> &output_desc,
-                                                       std::vector<uint32_t> &input_formats,
-                                                       std::vector<uint32_t> &outputFormats) {
-  if ((data_op_list_.empty()) || (1 != data_op_list_[0]->GetInputsSize())) {
-    GELOGE(FAILED, "OP List Pointer is null or input_desc size is not 1!");
-    return FAILED;
-  }
-
-  GE_CHK_STATUS_RET(GetInputDescInfo(input_desc, input_formats), "get input desc info failed");
-
-  GE_CHK_STATUS_RET(GetOutputDescInfo(output_desc, outputFormats), "get ouput desc info failed");
-
-  GE_CHK_BOOL_RET_STATUS(output_desc.size() == output_memory_size_list_.size(), INTERNAL_ERROR,
-                         "output_desc size[%zu] not equal output_size_list_[%zu] size!", output_desc.size(),
-                         output_memory_size_list_.size());
-
-  /// For function zero copy,the momery should be aligned by 512 bytes.
-  /// And, because of the cce op limit, size should be lager than the real shape size. The memory should be padded by 32
-  /// bytes.
-  /// *size equals to ((tensorDesc->dataSize + 2 * 32 - 1) / 32) * 32;
-  for (size_t i = 0; i < output_memory_size_list_.size(); i++) {
-    output_desc[i].size = output_memory_size_list_[i];
-  }
-
-  return SUCCESS;
-}
-
-void DavinciModel::SetInputDimsInfo(const vector<int64_t> &model_input_dims, Format &format,
-                                    InputOutputDescInfo &input) {
+void DavinciModel::SetInputDimsInfo(const vector<int64_t> &input_dims, Format &format, ShapeDescription &shape_info) {
   uint32_t n, c, h, w;
   n = format == FORMAT_NHWC ? NHWC_DIM_N : NCHW_DIM_N;
   c = format == FORMAT_NHWC ? NHWC_DIM_C : NCHW_DIM_C;
   h = format == FORMAT_NHWC ? NHWC_DIM_H : NCHW_DIM_H;
   w = format == FORMAT_NHWC ? NHWC_DIM_W : NCHW_DIM_W;
 
-  if (model_input_dims.size() == static_cast<size_t>(NORMAL_TENSOR_SIZE)) {
-    input.shape_info.num = model_input_dims[n];
-    input.shape_info.height = model_input_dims[h];
-    input.shape_info.width = model_input_dims[w];
-    input.shape_info.channel = model_input_dims[c];
+  if (input_dims.size() == static_cast<size_t>(NORMAL_TENSOR_SIZE)) {
+    shape_info.num = input_dims[n];
+    shape_info.height = input_dims[h];
+    shape_info.width = input_dims[w];
+    shape_info.channel = input_dims[c];
   }
-  for (size_t k = 0; k < model_input_dims.size(); ++k) {
-    input.shape_info.dims.push_back(model_input_dims[k]);
+  for (size_t k = 0; k < input_dims.size(); ++k) {
+    shape_info.dims.push_back(input_dims[k]);
   }
-  return;
 }
 
-void DavinciModel::CreateInputDimsInfo(const OpDescPtr &op_desc, Format format, InputOutputDescInfo &input) {
-  if (is_new_model_desc_ && op_desc->HasAttr(ATTR_NAME_INPUT_DIMS)) {
-    // When static aipp is set, need to get the model input dims which processed by aipp
-    vector<int64_t> model_input_dims;
-    (void)AttrUtils::GetListInt(op_desc, ATTR_NAME_INPUT_DIMS, model_input_dims);
-    SetInputDimsInfo(model_input_dims, format, input);
-    return;
-  }
+void DavinciModel::CreateInputDimsInfo(const OpDescPtr &op_desc, Format format,
+                                       ShapeDescription &shape_info, ShapeDescription &dims_info) {
   // judge if this data is linked dynamic aipp first, multiply batch has been considered
   if (op_desc->HasAttr(ATTR_DYNAMIC_AIPP_INPUT_DIMS)) {
     vector<int64_t> dynamic_aipp_input_dims;
     (void)AttrUtils::GetListInt(op_desc, ATTR_DYNAMIC_AIPP_INPUT_DIMS, dynamic_aipp_input_dims);
-    SetInputDimsInfo(dynamic_aipp_input_dims, format, input);
-    return;
+    SetInputDimsInfo(dynamic_aipp_input_dims, format, shape_info);
   } else {
     // judge if this data is multiply batch
     if (!op_desc->HasAttr(ATTR_MBATCH_ORIGIN_INPUT_DIMS)) {
       vector<int64_t> input_dims = op_desc->GetInputDescPtr(0)->GetShape().GetDims();
-      SetInputDimsInfo(input_dims, format, input);
-      return;
+      SetInputDimsInfo(input_dims, format, shape_info);
     } else {
       vector<int64_t> origin_input_dims;
       (void)AttrUtils::GetListInt(op_desc, ATTR_MBATCH_ORIGIN_INPUT_DIMS, origin_input_dims);
-      SetInputDimsInfo(origin_input_dims, format, input);
-      return;
+      SetInputDimsInfo(origin_input_dims, format, shape_info);
     }
   }
+
+  if (op_desc->HasAttr(ATTR_NAME_INPUT_DIMS)) {
+    // When static aipp is set, need to get the model input dims which processed by aipp
+    vector<int64_t> model_input_dims;
+    (void)AttrUtils::GetListInt(op_desc, ATTR_NAME_INPUT_DIMS, model_input_dims);
+    SetInputDimsInfo(model_input_dims, format, dims_info);
+  } else {
+    dims_info = shape_info;
+  }
 }
 
-Status DavinciModel::GetInputDescInfo(vector<InputOutputDescInfo> &input_desc, std::vector<uint32_t> &formats) {
-  for (size_t index = 0; index < data_op_list_.size(); ++index) {
-    InputOutputDescInfo input;
-    GE_CHECK_NOTNULL(data_op_list_[index]);
-    GE_CHECK_NOTNULL(data_op_list_[index]->GetInputDescPtr(0));
+Status DavinciModel::InitInputDescInfo(const map<uint32_t, OpDescPtr> &data_by_index) {
+  for (const auto &item : data_by_index) {
+    const auto op_desc = item.second;
+    GE_CHECK_NOTNULL(op_desc->GetInputDescPtr(0));
 
-    Format format = data_op_list_[index]->GetInputDescPtr(0)->GetFormat();
-    CreateInputDimsInfo(data_op_list_[index], format, input);
+    InputOutputDescInfo input;
+    ShapeDescription dims_info;
+    Format format = op_desc->GetInputDescPtr(0)->GetFormat();
+    CreateInputDimsInfo(op_desc, format, input.shape_info, dims_info);
 
-    input.data_type = data_op_list_[index]->GetInputDescPtr(0)->GetDataType();
-    input.name = data_op_list_[index]->GetName();
+    input.data_type = op_desc->GetInputDescPtr(0)->GetDataType();
+    input.name = op_desc->GetName();
     int64_t input_size = 0;
-    GE_CHK_STATUS_RET(TensorUtils::GetSize(*data_op_list_[index]->GetInputDescPtr(0), input_size),
-                      "get input size failed.");
+    GE_CHK_STATUS_RET(TensorUtils::GetSize(*op_desc->GetInputDescPtr(0), input_size), "get input size failed.");
     input.size = input_size;
-    formats.push_back(format);
-    input_desc.push_back(input);
+    input_formats_.push_back(format);
+    input_descs_.push_back(input);
+
+    input.shape_info = dims_info;
+    input_descs_dims_.push_back(input);
   }
-  // cause GetInputDescInfo called not only once, set is_new_model_desc_ to false after calc the model input dims
-  is_new_model_desc_ = false;
   return SUCCESS;
 }
 
-void DavinciModel::CreateOutput(uint32_t index, OpDescPtr &op_desc, InputOutputDescInfo &output,
+Status DavinciModel::GetInputDescInfo(vector<InputOutputDescInfo> &input_descs,
+                                      vector<uint32_t> &input_formats, bool by_dims) const {
+  const vector<InputOutputDescInfo> &input_desc_info = by_dims ? input_descs_dims_ : input_descs_;
+  input_descs.insert(input_descs.end(), input_desc_info.begin(), input_desc_info.end());
+  input_formats.insert(input_formats.end(), input_formats_.begin(), input_formats_.end());
+
+  return SUCCESS;
+}
+
+void DavinciModel::CreateOutput(uint32_t index, const OpDescPtr &op_desc, InputOutputDescInfo &output,
                                 uint32_t &format_result) {
   /// netoutput input tensor desc
   GE_IF_BOOL_EXEC(op_desc->GetInputDescPtr(index) == nullptr, GELOGE(FAILED, "OpDesc GetInputDescPtr is nullptr");
-                  return );
+                  return);
   Format format = op_desc->GetInputDescPtr(index)->GetFormat();
   GeShape shape = op_desc->GetInputDescPtr(index)->GetShape();
   DataType data_type = op_desc->GetInputDescPtr(index)->GetDataType();
@@ -2139,10 +2066,9 @@ void DavinciModel::CreateOutput(uint32_t index, OpDescPtr &op_desc, InputOutputD
   output.data_type = op_desc->GetInputDescPtr(index)->GetDataType();
 }
 
-Status DavinciModel::GetOutputDescInfo(vector<InputOutputDescInfo> &output_desc, std::vector<uint32_t> &formats) {
-  GELOGD("Output node size: %zu", output_op_list_.size());
-  for (size_t i = 0; i < output_op_list_.size(); i++) {
-    auto &op_desc = output_op_list_[i];
+Status DavinciModel::InitOutputDescInfo(const vector<OpDescPtr> &output_op_list) {
+  GELOGD("Output node size: %zu", output_op_list.size());
+  for (const auto &op_desc : output_op_list) {
     uint32_t out_size = static_cast<uint32_t>(op_desc->GetInputsSize());
     for (uint32_t index = 0; index < out_size; index++) {
       string output_name;
@@ -2165,20 +2091,18 @@ Status DavinciModel::GetOutputDescInfo(vector<InputOutputDescInfo> &output_desc,
                       std::to_string(src_index[index]);
       }
       output.name = output_name;
-      output_desc.push_back(output);
-      formats.push_back(format_result);
+      output_descs_.push_back(output);
+      output_formats_.push_back(format_result);
     }
   }
   return SUCCESS;
 }
 
-ge::Format DavinciModel::GetFormat() {
-  if ((data_op_list_.empty()) || data_op_list_[0] == nullptr || data_op_list_[0]->GetInputDescPtr(0) == nullptr) {
-    GELOGW("OP List Pointer is null or input_desc size is not 1!");
-    return FORMAT_NCHW;
-  }
-
-  return data_op_list_[0]->GetInputDescPtr(0)->GetFormat();
+Status DavinciModel::GetOutputDescInfo(vector<InputOutputDescInfo> &output_descs,
+                                       vector<uint32_t> &output_formats) const {
+  output_descs.insert(output_descs.end(), output_descs_.begin(), output_descs_.end());
+  output_formats.insert(output_formats.end(), output_formats_.begin(), output_formats_.end());
+  return SUCCESS;
 }
 
 Status DavinciModel::CopyInputData(const InputData &input_data, bool device_data) {
@@ -2186,8 +2110,9 @@ Status DavinciModel::CopyInputData(const InputData &input_data, bool device_data
   const std::vector<DataBuffer> &blobs = input_data.blobs;
   for (const auto &data : new_input_data_info_) {
     if (data.first >= blobs.size()) {
-      GELOGE(FAILED, "Blobs not match: blobs=%zu, tensor=%zu, index=%u, size=%ld", blobs.size(),
-             new_input_data_info_.size(), data.first, data.second.GetDataInfo().at(0).first);
+      GELOGE(FAILED, "Blobs not match: blobs=%zu, tensor=%zu, index=%u, size=%ld, op_name(%s)", blobs.size(),
+             new_input_data_info_.size(), data.first, data.second.GetDataInfo().at(0).first,
+             data.second.GetOpName().c_str());
       return FAILED;
     }
 
@@ -2198,13 +2123,14 @@ Status DavinciModel::CopyInputData(const InputData &input_data, bool device_data
     }
     uint64_t data_size = data.second.GetDataSize();
     GE_CHK_BOOL_RET_STATUS(data_size >= data_buf.length, PARAM_INVALID,
-                           "input data size(%lu) does not match model required size(%lu), ret failed.", data_buf.length,
-                           data_size);
+                           "input data size(%lu) does not match model required size(%lu), op_name(%s) ret failed.",
+                           data_buf.length, data_size, data.second.GetOpName().c_str());
     void *mem_addr = data.second.GetBasicAddr();
     void *data_buf_addr = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(data_buf.data));
     uint64_t data_buf_length = data_buf.length;
-    GELOGI("[IMAS]CopyPlainData memcpy graph_%u type[F] input[%u] dst[%p] src[%p] mem_size[%lu] datasize[%lu]",
-           runtime_param_.graph_id, data.first, mem_addr, data_buf_addr, data_size, data_buf_length);
+    GELOGI("CopyPlainData memcpy graph_%u type[F] input[%s] rank[%u] dst[%p] src[%p] mem_size[%lu] datasize[%lu]",
+           runtime_param_.graph_id, data.second.GetOpName().c_str(), data.first, mem_addr, data_buf_addr, data_size,
+           data_buf_length);
     GE_CHK_RT_RET(rtMemcpy(mem_addr, data_size, data_buf_addr, data_buf_length, kind));
   }
 
@@ -2215,217 +2141,191 @@ Status DavinciModel::SyncVarData() {
   GELOGI("Sync var data, model id:%u", model_id_);
   Status ret = SUCCESS;
 
-  OpDescPtr global_step = GetVariableOp(NODE_NAME_GLOBAL_STEP);
-  if (global_step != nullptr) {
-    auto v_output_size = ModelUtils::GetOutputSize(global_step);
-    auto v_output_addr = ModelUtils::GetOutputDataAddrs(runtime_param_, global_step);
-    if (v_output_size.empty() || v_output_addr.empty()) {
-      GELOGE(PARAM_INVALID, "global step op:%s not set output", global_step->GetName().c_str());
-      return PARAM_INVALID;
-    }
-    std::vector<uint64_t> v_step;
-    v_step.push_back(iterator_count_);
-    GE_CHK_RT_RET(rtMemcpy(v_output_addr[0], v_output_size[0], v_step.data(), v_step.size() * sizeof(uint64_t),
+  if (global_step_addr_ != nullptr && global_step_size_ != 0) {
+    const vector<uint64_t> v_step = { iterator_count_ };
+    GE_CHK_RT_RET(rtMemcpy(global_step_addr_, global_step_size_, v_step.data(), v_step.size() * sizeof(uint64_t),
                            RT_MEMCPY_HOST_TO_DEVICE));
   }
 
-  for (auto op_desc : variable_op_list_) {
-    ret =
-        VarManager::Instance(session_id_)->SyncVarData(runtime_param_.graph_id, op_desc->GetName(), op_desc, mem_base_);
+  for (const auto &item : broadcast_variable_) {
+    ret = VarManager::Instance(session_id_)->SyncVarData(runtime_param_.graph_id, item.first, item.second, mem_base_);
     GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "sync var data ret failed, model id:%u, op name:%s.", model_id_,
-                     op_desc->GetName().c_str());
+                     item.first.c_str());
   }
   return ret;
 }
 
-inline int64_t SumSize(const vector<int64_t> &size_list) {
-  int64_t sum_size = 0;
-  for (const int64_t &size : size_list) {
-    sum_size += size;
+Status DavinciModel::InitModelProfile() {
+  for (const auto &task : task_list_) {
+    GE_CHECK_NOTNULL(task);
+    const FusionOpInfo *fusion_op_info = task->GetFusionOpInfo();
+    // when type is RT_MODEL_TASK_KERNEL, ctx is not null
+    if ((fusion_op_info == nullptr) || fusion_op_info->original_op_names.empty()) {
+      continue;
+    }
+
+    GELOGI("task.id = %u, opNum = %zu", task->GetTaskID(), fusion_op_info->original_op_names.size());
+    op_id_map_.insert(std::make_pair(fusion_op_info->op_index, task->GetTaskID()));
   }
-  return sum_size;
+
+  std::set<uint32_t> task_id_set;
+  using CIT = std::multimap<uint32_t, uint32_t>::const_iterator;
+  using Range = std::pair<CIT, CIT>;
+  for (const auto &task : task_list_) {
+    GE_CHECK_NOTNULL(task);
+    const FusionOpInfo *fusion_op_info = task->GetFusionOpInfo();
+    if ((fusion_op_info == nullptr) || fusion_op_info->original_op_names.empty()) {
+      continue;
+    }
+
+    if (task_id_set.count(task->GetTaskID()) > 0) {
+      continue;
+    }
+
+    const auto &op_desc = GetOpByIndex(fusion_op_info->op_index);
+    GE_CHK_BOOL_EXEC(op_desc != nullptr, return FAILED, "index: %u out of range", fusion_op_info->op_index);
+
+    ProfileInfo profile;
+    profile.fusion_info = *fusion_op_info;
+    Range range = op_id_map_.equal_range(fusion_op_info->op_index);
+    for (CIT range_idx = range.first; range_idx != range.second; ++range_idx) {
+      profile.task_count++;
+      task_id_set.insert(range_idx->second);
+    }
+
+    // memory info
+    TaskMemInfo &mem_info = profile.memory_info;
+    const auto input_size = ModelUtils::GetInputSize(op_desc);
+    const auto output_size = ModelUtils::GetOutputSize(op_desc);
+    const auto workspace_size = ModelUtils::GetWorkspaceSize(op_desc);
+    const auto weight_size = ModelUtils::GetWeightSize(op_desc);
+    mem_info.input_size = std::accumulate(input_size.begin(), input_size.end(), 0);
+    mem_info.output_size = std::accumulate(output_size.begin(), output_size.end(), 0);
+    mem_info.workspace_size = std::accumulate(workspace_size.begin(), workspace_size.end(), 0);
+    mem_info.weight_size = std::accumulate(weight_size.begin(), weight_size.end(), 0);
+    mem_info.total_size = mem_info.weight_size + mem_info.input_size + mem_info.output_size + mem_info.workspace_size;
+
+    profile_list_.emplace_back(profile);
+  }
+
+  GELOGI("fusion task size: %zu, profile info size: %zu", op_id_map_.size(), profile_list_.size());
+  return SUCCESS;
 }
 
 Status DavinciModel::SinkModelProfile() {
   // profiling plugin must be registered
-  Msprof::Engine::Reporter *reporter = PluginImpl::GetPluginReporter();
-  GE_IF_BOOL_EXEC(reporter == nullptr, GELOGI("Profiling report is nullptr!"); return SUCCESS);
-
-  Msprof::Engine::ReporterData reporter_data{};
+  auto &prof_mgr = ProfilingManager::Instance();
+  ReporterData reporter_data{};
   // report model data tag name
-  std::string tag_name;
-  tag_name.append("model_load_info_").append(std::to_string(this->Id()));
+  std::string tag_name("model_load_info_" + std::to_string(this->Id()));
   GE_CHK_BOOL_EXEC(memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN, tag_name.c_str(), tag_name.size()) == EOK,
                    return FAILED, "Sink model tag memcpy error.");
 
   // Model Header
-  string name;
-  if (!om_name_.empty()) {
-    name = om_name_;
-  } else {
-    name = name_;
-  }
+  std::string name = om_name_.empty() ? name_ : om_name_;
   size_t name_len = name.size();
   reporter_data.deviceId = device_id_;
   reporter_data.data = (unsigned char *)&name_len;
   reporter_data.dataLen = sizeof(int32_t);
-  GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.",
-                   this->Id());
+  GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
+                   "Reporter data fail, model id:%u.", this->Id());
 
   reporter_data.data = (unsigned char *)name.c_str();
   reporter_data.dataLen = name.size();
-  GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.",
-                   this->Id());
+  GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
+                   "Reporter data fail, model id:%u.", this->Id());
 
   uint32_t model_id = this->Id();
   reporter_data.data = (unsigned char *)&model_id;
   reporter_data.dataLen = sizeof(uint32_t);
-  GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.",
-                   this->Id());
+  GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
+                   "Reporter data fail, model id:%u.", this->Id());
 
   // Load Start/End Time
   int64_t start_time = this->GetLoadBeginTime();
   reporter_data.data = (unsigned char *)&start_time;
   reporter_data.dataLen = sizeof(int64_t);
-  GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.",
-                   this->Id());
+  GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
+                   "Reporter data fail, model id:%u.", this->Id());
 
   int64_t end_time = this->GetLoadEndTime();
   reporter_data.data = (unsigned char *)&end_time;
   reporter_data.dataLen = sizeof(int64_t);
-  GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.",
-                   this->Id());
-
-  int32_t task_num = task_list_.size();
-  std::multimap<uint32_t, uint32_t> op_id_map;
-  std::set<uint32_t> task_id_set;
-  for (int32_t i = 0; i < task_num; i++) {
-    auto task = task_list_[i];
-    GE_CHECK_NOTNULL(task);
-    auto fusion_op_info = task->GetFusionOpInfo();
-    // when type is RT_MODEL_TASK_KERNEL, ctx is not null
-    if (fusion_op_info != nullptr) {
-      uint32_t op_num = fusion_op_info->original_op_names.size();
-      uint32_t task_id = task->GetTaskID();
-      if (op_num > 0) {
-        op_id_map.insert(std::make_pair(fusion_op_info->op_index, task_id));
-      }
-    }
-  }
-
-  struct memoryInfo {
-    int64_t input_size;
-    int64_t output_size;
-    int64_t weight_size;
-    int64_t workspace_size;
-    int64_t total_size;
-
-    memoryInfo() : input_size(0), output_size(0), weight_size(0), workspace_size(0), total_size(0) {}
-  };
+  GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
+                   "Reporter data fail, model id:%u.", this->Id());
 
   using CIT = std::multimap<uint32_t, uint32_t>::const_iterator;
   using Range = std::pair<CIT, CIT>;
-  for (int32_t i = 0; i < task_num; i++) {
-    auto task = task_list_[i];
-    GE_CHECK_NOTNULL(task);
-    auto fusion_op_info = task->GetFusionOpInfo();
-    if (fusion_op_info != nullptr && fusion_op_info->original_op_names.size() > 0) {
-      uint32_t task_id = task->GetTaskID();
-      uint32_t op_num = fusion_op_info->original_op_names.size();
-      uint32_t task_count = 0;
-      if (task_id_set.count(task_id) != 0) {
-        continue;
-      }
-
-      uint32_t op_id = fusion_op_info->op_index;
-      Range range = op_id_map.equal_range(op_id);
-      for (CIT range_idx = range.first; range_idx != range.second; ++range_idx) {
-        task_count++;
-        uint32_t task_id = range_idx->second;
-        task_id_set.insert(task_id);
-      }
-
-      // op name after fusion
-      string fusion_op_name = fusion_op_info->op_name;
-      int32_t fusion_op_name_len = fusion_op_name.size() == 0 ? 1 : fusion_op_name.size();
-      reporter_data.data = (unsigned char *)&fusion_op_name_len;
-      reporter_data.dataLen = sizeof(int32_t);
-      GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.",
-                       this->Id());
-
-      reporter_data.data = (unsigned char *)fusion_op_name.c_str();
-      reporter_data.dataLen = fusion_op_name_len;
-      GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.",
-                       this->Id());
-
-      // original op name before fusion
-      reporter_data.data = (unsigned char *)&op_num;
-      reporter_data.dataLen = sizeof(int32_t);
-      GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.",
-                       this->Id());
-
-      for (uint32_t k = 0; k < op_num; k++) {
-        std::string op_name = fusion_op_info->original_op_names[k];
-        int32_t op_name_len = op_name.size() == 0 ? 1 : op_name.size();
-        reporter_data.data = (unsigned char *)&op_name_len;
-        reporter_data.dataLen = sizeof(int32_t);
-        GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.",
-                         this->Id());
-        reporter_data.data = (unsigned char *)op_name.c_str();
-        reporter_data.dataLen = op_name_len;
-        GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.",
-                         this->Id());
-      }
-
-      // stream id info
-      uint32_t streamId = task->GetStreamId();
-      reporter_data.data = (unsigned char *)&streamId;
+  for (const ProfileInfo &profile : profile_list_) {
+    // op name after fusion
+    string fusion_op_name = profile.fusion_info.op_name;
+    int32_t fusion_op_name_len = fusion_op_name.size() == 0 ? 1 : fusion_op_name.size();
+    reporter_data.data = (unsigned char *)&fusion_op_name_len;
+    reporter_data.dataLen = sizeof(int32_t);
+    GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
+                     "Reporter data fail, model id:%u.", this->Id());
+
+    reporter_data.data = (unsigned char *)fusion_op_name.c_str();
+    reporter_data.dataLen = fusion_op_name_len;
+    GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
+                     "Reporter data fail, model id:%u.", this->Id());
+
+    // original op name before fusion
+    uint32_t op_num = profile.fusion_info.original_op_names.size();
+    reporter_data.data = (unsigned char *)&op_num;
+    reporter_data.dataLen = sizeof(int32_t);
+    GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
+                     "Reporter data fail, model id:%u.", this->Id());
+
+    for (uint32_t k = 0; k < op_num; k++) {
+      std::string op_name = profile.fusion_info.original_op_names[k];
+      int32_t op_name_len = op_name.size() == 0 ? 1 : op_name.size();
+      reporter_data.data = (unsigned char *)&op_name_len;
       reporter_data.dataLen = sizeof(int32_t);
-      GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.",
-                       this->Id());
-
-      // memory info
-      struct memoryInfo memory_info;
-      uint32_t op_index = fusion_op_info->op_index;
-      auto iter = op_list_.find(op_index);
-      GE_CHK_BOOL_EXEC(iter != op_list_.end(), return FAILED, "index is out of range, index: %u", op_index);
-      auto op_desc = iter->second;
-      memory_info.input_size = SumSize(ModelUtils::GetInputSize(op_desc));
-      memory_info.output_size = SumSize(ModelUtils::GetOutputSize(op_desc));
-      memory_info.workspace_size = SumSize(ModelUtils::GetWorkspaceSize(op_desc));
-      memory_info.weight_size = SumSize(ModelUtils::GetWeightSize(op_desc));
-      memory_info.total_size =
-          memory_info.weight_size + memory_info.input_size + memory_info.output_size + memory_info.workspace_size;
-      reporter_data.data = (unsigned char *)&memory_info;
-      reporter_data.dataLen = sizeof(struct memoryInfo);
-      GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.",
-                       this->Id());
-
-      // task info
-      reporter_data.data = (unsigned char *)&task_count;
+      GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
+                       "Reporter data fail, model id:%u.", this->Id());
+      reporter_data.data = (unsigned char *)op_name.c_str();
+      reporter_data.dataLen = op_name_len;
+      GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
+                       "Reporter data fail, model id:%u.", this->Id());
+    }
+
+    // stream id info
+    uint32_t streamId = profile.fusion_info.stream_id;
+    reporter_data.data = (unsigned char *)&streamId;
+    reporter_data.dataLen = sizeof(int32_t);
+    GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
+                     "Reporter data fail, model id:%u.", this->Id());
+
+    // memory info
+    reporter_data.data = (unsigned char *)&profile.memory_info;
+    reporter_data.dataLen = sizeof(profile.memory_info);
+    GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
+                     "Reporter data fail, model id:%u.", this->Id());
+
+    // task info
+    reporter_data.data = (unsigned char *)&profile.task_count;
+    reporter_data.dataLen = sizeof(uint32_t);
+    GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
+                     "Reporter data fail, model id:%u.", this->Id());
+
+    Range task_range = op_id_map_.equal_range(profile.fusion_info.op_index);
+    for (CIT idx = task_range.first; idx != task_range.second; ++idx) {
+      uint32_t task_id = idx->second;
+      reporter_data.data = (unsigned char *)&task_id;
       reporter_data.dataLen = sizeof(uint32_t);
-      GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.",
-                       this->Id());
-
-      Range task_range = op_id_map.equal_range(op_id);
-      for (CIT idx = task_range.first; idx != task_range.second; ++idx) {
-        uint32_t task_id = idx->second;
-        reporter_data.data = (unsigned char *)&task_id;
-        reporter_data.dataLen = sizeof(uint32_t);
-        GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.",
-                         this->Id());
-      }
+      GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
+                       "Reporter data fail, model id:%u.", this->Id());
     }
   }
+
   return SUCCESS;
 }
 
 Status DavinciModel::SinkTimeProfile(const InputData &current_data) {
   // profiling plugin must be registered
-  Msprof::Engine::Reporter *reporter = PluginImpl::GetPluginReporter();
-  GE_IF_BOOL_EXEC(reporter == nullptr, GELOGI("Profiling report is nullptr!"); return SUCCESS);
-
-  Msprof::Engine::ReporterData reporter_data{};
+  auto &prof_mgr = ProfilingManager::Instance();
+  ReporterData reporter_data{};
   // report model data tag name
   std::string tag_name;
   tag_name.append("model_time_info_")
@@ -2448,33 +2348,33 @@ Status DavinciModel::SinkTimeProfile(const InputData &current_data) {
   size_t name_len = name.size();
   reporter_data.data = (unsigned char *)&name_len;
   reporter_data.dataLen = sizeof(int32_t);
-  GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.",
-                   this->Id());
+  GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
+                   "Reporter data fail, model id:%u.", this->Id());
 
   reporter_data.data = (unsigned char *)name.c_str();
   reporter_data.dataLen = name.size();
-  GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.",
-                   this->Id());
+  GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
+                   "Reporter data fail, model id:%u.", this->Id());
 
   // request id
   uint64_t request_id = current_data.request_id;
   reporter_data.data = (unsigned char *)&request_id;
   reporter_data.dataLen = sizeof(uint32_t);
-  GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED,
+  GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
                    "Reporter data fail, model id:%u, data index:%u.", this->Id(), current_data.index);
 
   // thread id
   int32_t thread_id = GetDataInputTid();
   reporter_data.data = (unsigned char *)&thread_id;
   reporter_data.dataLen = sizeof(int32_t);
-  GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED,
+  GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
                    "Reporter data fail, model id:%u, data index:%u.", this->Id(), current_data.index);
 
   // time info
   time_info_.modelId = this->Id();
   reporter_data.data = (unsigned char *)&time_info_;
   reporter_data.dataLen = sizeof(struct timeInfo);
-  GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED,
+  GE_CHK_BOOL_EXEC(prof_mgr.CallMsprofReport(reporter_data) == 0, return FAILED,
                    "Reporter data fail, model id:%u, data index:%u.", this->Id(), current_data.index);
 
   return SUCCESS;
@@ -2530,7 +2430,7 @@ void DavinciModel::SetProfileTime(ModelProcStage stage, int64_t endTime) {
 /// @author
 ///
 Status DavinciModel::CopyOutputData(uint32_t data_id, OutputData &output_data, rtMemcpyKind_t kind) {
-  if (output_op_list_.empty()) {
+  if (output_addrs_list_.empty()) {
     Status ret = SyncVarData();
     return ret;
   }
@@ -2581,7 +2481,7 @@ Status DavinciModel::CopyOutputData(uint32_t data_id, OutputData &output_data, r
     uint64_t buffer_length = buffer.length;
     void *buffer_addr = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(buffer.data));
 
-    GELOGI("[IMAS]CopyPlainData memcpy graph_%u type[F] output[%u] memaddr[%p] mem_size[%lu] datasize[%lu]",
+    GELOGI("CopyPlainData memcpy graph_%u type[F] output[%u] memaddr[%p] mem_size[%lu] datasize[%lu]",
            runtime_param_.graph_id, output.first, output.second.GetBasicAddr(), data_size, buffer_length);
     GE_CHK_RT_RET(rtMemcpy(buffer_addr, buffer_length, output.second.GetBasicAddr(), data_size, kind));
     idx++;
@@ -2589,20 +2489,12 @@ Status DavinciModel::CopyOutputData(uint32_t data_id, OutputData &output_data, r
   return SUCCESS;
 }
 
-Status DavinciModel::GenOutputTensorInfo(const OpDescPtr &op_desc, uint32_t data_index, OutputData *output_data,
-                                         std::vector<ge::OutputTensorInfo> &outputs) {
-  GE_CHECK_NOTNULL(op_desc);
-  GE_CHECK_NOTNULL(output_data);
-  if (output_data->blobs.size() > data_index) {
-    GELOGI("No need to generate output tensor info, model id:%u", model_id_);
-    return SUCCESS;
-  }
-  std::vector<int64_t> out_buffer_size_vec;
-  std::vector<std::vector<int64_t>> shape_info_vec;
+Status DavinciModel::InitOutputTensorInfo(const OpDescPtr &op_desc) {
   size_t input_num = op_desc->GetInputsSize();
   if (is_getnext_sink_dynamic_) {
     input_num = input_num - kGetDynamicDimsCount;
   }
+
   for (size_t i = 0; i < input_num; ++i) {
     int64_t size = 0;
     auto input_desc = op_desc->GetInputDescPtr(i);
@@ -2611,36 +2503,59 @@ Status DavinciModel::GenOutputTensorInfo(const OpDescPtr &op_desc, uint32_t data
     GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS,
                     GELOGE(ret, "Get size from TensorDesc failed, op:%s, input id:%zu", op_desc->GetName().c_str(), i);
                     return ret);
-    std::vector<int64_t> output_shape = input_desc->GetShape().GetDims();
+    const GeShape &shape = input_desc->GetShape();
+    GELOGI("Output size is %ld, output shape is %s.", size, formats::JoinToString(shape.GetDims()).c_str());
+    output_buffer_size_.emplace_back(size);
+    output_shape_info_.emplace_back(shape);
+  }
+
+  return SUCCESS;
+}
+
+Status DavinciModel::GenOutputTensorInfo(OutputData *output_data, vector<OutputTensorInfo> &outputs) {
+  GE_CHECK_NOTNULL(output_data);
+  if (!output_data->blobs.empty()) {
+    GELOGI("No need to generate output tensor info, model id:%u", model_id_);
+    return SUCCESS;
+  }
+
+  vector<int64_t> output_buffer_size;
+  vector<vector<int64_t>> output_shape_info;
+  size_t output_num = output_buffer_size_.size();
+  for (size_t i = 0; i < output_num; ++i) {
+    int64_t output_size = output_buffer_size_[i];
+    vector<int64_t> output_shape = output_shape_info_[i].GetDims();
     if (is_online_infer_dynamic_) {
       if (merge_nodes_gear_and_real_out_size_info_.find(i) != merge_nodes_gear_and_real_out_size_info_.end()) {
         auto gear_and_real_out_size_info = merge_nodes_gear_and_real_out_size_info_[i];
-        size = gear_and_real_out_size_info[cur_dynamic_dims_];
+        output_size = gear_and_real_out_size_info[cur_dynamic_dims_];
         auto gear_and_real_out_shape_info = merge_nodes_gear_and_real_out_shape_info_[i];
         output_shape = gear_and_real_out_shape_info[cur_dynamic_dims_];
         is_dynamic_ = true;
       }
     }
-    GELOGI("Output size is %ld, output shape is %s.", size, formats::JoinToString(output_shape).c_str());
-    out_buffer_size_vec.push_back(size);
-    shape_info_vec.push_back(output_shape);
+    GELOGI("Output size is %ld, output shape is %s.", output_size, formats::JoinToString(output_shape).c_str());
+    output_buffer_size.push_back(output_size);
+    output_shape_info.push_back(output_shape);
   }
-  GELOGI("Output blobs size:%zu, data index:%u, model id:%u", out_buffer_size_vec.size(), data_index, model_id_);
-  for (size_t i = 0; i < out_buffer_size_vec.size(); ++i) {
-    std::unique_ptr<uint8_t[]> data_buf(new (std::nothrow) uint8_t[out_buffer_size_vec[i]]);
+
+  GELOGI("Output blobs size:%zu, model id:%u", output_buffer_size_.size(), model_id_);
+  for (size_t i = 0; i < output_buffer_size.size(); ++i) {
+    std::unique_ptr<uint8_t[]> data_buf(new (std::nothrow) uint8_t[output_buffer_size[i]]);
     if (data_buf == nullptr) {
       GELOGE(GE_GRAPH_MALLOC_FAILED, "Malloc buffer failed.");
       return GE_GRAPH_MALLOC_FAILED;
     }
-    output_data->blobs.push_back({data_buf.get(), static_cast<uint64_t>(out_buffer_size_vec[i]), false});
-    ge::OutputTensorInfo output;
-    output.dims = shape_info_vec[i];
+    output_data->blobs.push_back({data_buf.get(), static_cast<uint64_t>(output_buffer_size[i]), false});
+    OutputTensorInfo output;
+    output.dims = output_shape_info[i];
     output.data = std::move(data_buf);
-    output.length = out_buffer_size_vec[i];
+    output.length = output_buffer_size[i];
     outputs.emplace_back(std::move(output));
     GELOGD("Output index:%zu, output dims is %s, data length:%lu.", i,
            formats::JoinToString(output.dims).c_str(), output.length);
   }
+
   return SUCCESS;
 }
 
@@ -2675,35 +2590,28 @@ Status DavinciModel::ReturnResult(uint32_t data_id, const bool rslt_flg, const b
     return INTERNAL_ERROR;
   }
 
-  if (output_op_list_.empty()) {
+  if (output_addrs_list_.empty()) {
     GELOGW("Output tensor list is empty, model id: %u", model_id_);
     GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, INTERNAL_ERROR, outputs), "OnComputeDone failed.");
     return INTERNAL_ERROR;
   }
 
   GE_CHECK_NOTNULL(output_data);
-  // index of data in output_data
-  uint32_t data_index = 0;
-
   output_data->index = data_id;
   output_data->model_id = model_id_;
 
-  is_getnext_sink_dynamic_ = false;
-  // copy output data from op to designated position
-  for (auto &op_desc : output_op_list_) {
-    if (IsGetNextSinkDynamic(op_desc)) {
-      GELOGD("Reinit cur dynamic dims when getnext sink dynamic.");
-      is_getnext_sink_dynamic_ = true;
-      cur_dynamic_dims_.clear();
-      cur_dynamic_dims_.resize(shape_of_cur_dynamic_dims_);
-      GE_CHK_RT_RET(rtMemcpy(cur_dynamic_dims_.data(), shape_of_cur_dynamic_dims_ * sizeof(int64_t),
-                                netoutput_last_input_addr_, netoutput_last_input_size_, RT_MEMCPY_DEVICE_TO_HOST));
-    }
-    GELOGD("Cur dynamic dims is %s.", formats::JoinToString(cur_dynamic_dims_).c_str());
-    if (GenOutputTensorInfo(op_desc, data_index, output_data, outputs) != SUCCESS) {
-      return INTERNAL_ERROR;
-    }
-    data_index += op_desc->GetInputsSize();
+  if (is_getnext_sink_dynamic_) {
+    GELOGD("Reinit cur dynamic dims when getnext sink dynamic.");
+    cur_dynamic_dims_.clear();
+    cur_dynamic_dims_.resize(shape_of_cur_dynamic_dims_);
+    auto ret = rtMemcpy(cur_dynamic_dims_.data(), shape_of_cur_dynamic_dims_ * sizeof(int32_t),
+                        netoutput_last_input_addr_, netoutput_last_input_size_, RT_MEMCPY_DEVICE_TO_HOST);
+    GE_CHK_RT_RET(ret);
+  }
+
+  GELOGD("Cur dynamic dims is %s.", formats::JoinToString(cur_dynamic_dims_).c_str());
+  if (GenOutputTensorInfo(output_data, outputs) != SUCCESS) {
+    return INTERNAL_ERROR;
   }
 
   if (CopyOutputData(data_id, *output_data, RT_MEMCPY_DEVICE_TO_HOST) != SUCCESS) {
@@ -2727,11 +2635,11 @@ Status DavinciModel::ReturnResult(uint32_t data_id, const bool rslt_flg, const b
 ///
 Status DavinciModel::ReturnNoOutput(uint32_t data_id) {
   GELOGI("ReturnNoOutput model id:%u", model_id_);
-  for (auto op_desc : variable_op_list_) {
+  for (const auto item : broadcast_variable_) {
     Status ret = VarManager::Instance(session_id_)
-                     ->SyncBroadCastData2Var(runtime_param_.graph_id, op_desc->GetName(), op_desc, mem_base_);
+                     ->SyncBroadCastData2Var(runtime_param_.graph_id, item.first, item.second, mem_base_);
     GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "sync var data ret failed, model id:%u, op name:%s.", model_id_,
-                     op_desc->GetName().c_str());
+                     item.first.c_str());
   }
 
   GE_CHK_BOOL_EXEC(listener_ != nullptr, return PARAM_INVALID, "listener_ is null!");
@@ -2797,88 +2705,54 @@ void *DavinciModel::Run(DavinciModel *model) {
       GE_IF_BOOL_EXEC(current_data.blobs.empty(), break);
       auto shape_data_buffer_data = current_data.blobs.back().data;
       auto shape_data_buffer_length = current_data.blobs.back().length;
-      model->cur_dynamic_dims_.assign(reinterpret_cast<int64_t *>(shape_data_buffer_data),
-                                      reinterpret_cast<int64_t *>(shape_data_buffer_data) +
-                                      shape_data_buffer_length / sizeof(int64_t));
+      model->cur_dynamic_dims_.assign(reinterpret_cast<int32_t *>(shape_data_buffer_data),
+                                      reinterpret_cast<int32_t *>(shape_data_buffer_data) +
+                                      shape_data_buffer_length / sizeof(int32_t));
       GELOGD("Data: cur dynamic dims is %s", formats::JoinToString(model->cur_dynamic_dims_).c_str());
-      delete[] (int64_t *)current_data.blobs.back().data;
+      delete[] reinterpret_cast<int32_t *>(current_data.blobs.back().data);
       current_data.blobs.pop_back();
     }
     GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), model->SetProfileTime(MODEL_PRE_PROC_END));
     GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), model->SetProfileTime(MODEL_INFER_START));
-    if (ProfilingManager::Instance().ProfilingOpTraceOn()) {
-      GELOGI("GetOpTraceIterNum:%d", ProfilingManager::Instance().GetOpTraceIterNum());
-      for (int32_t i = 0; i < ProfilingManager::Instance().GetOpTraceIterNum(); i++) {
-        if (!ProfilingManager::Instance().ProfilingLoadFlag()) {
-          vector<int32_t> prof_device_id_vec = ProfilingManager::Instance().GetProfilingDeviceId();
-          for (size_t j = 0; j < prof_device_id_vec.size(); ++j) {
-            // just profiling, no need to check value
-            (void)ProfilingManager::Instance().StartProfiling(i, prof_device_id_vec[j]);
-          }
-        }
-
-        GELOGI("rtModelExecute start.");
-        rt_ret = rtModelExecute(model->rt_model_handle_, model->rt_model_stream_, 0);
-        GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, rslt_flg = false;
-                        (void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput());
-                        continue);  // [No need to check value]
-        GELOGI("rtModelExecute end");
-
-        GELOGI("rtStreamSynchronize start.");
-        rt_ret = rtStreamSynchronize(model->rt_model_stream_);
-        if (rt_ret == kModelAbortNormal || rt_ret == kModelAbortNormalNew) {
-          GELOGI("The model with multiple datasets aborts normally.");
-        } else {
-          GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, rslt_flg = false;
-                          (void)model->ReturnResult(current_data.index, false, seq_end_flag, data_wrapper->GetOutput());
-                          continue);  // [No need to check value]
-        }
-
-        GELOGI("rtStreamSynchronize end.");
-        (void)ProfilingManager::Instance().StopProfiling();  // just profiling, no need to check value
-      }
+    GE_TIMESTAMP_START(rtModelExecute);
+    GELOGI("rtModelExecute start.");
+    rt_ret = rtModelExecute(model->rt_model_handle_, model->rt_model_stream_, 0);
+    GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, rslt_flg = false;
+                    (void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput());
+                    CsaInteract::GetInstance().WriteErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC);
+                    continue);
+    GELOGI("rtModelExecute end");
+    GE_IF_BOOL_EXEC(model->is_first_execute_, GE_TIMESTAMP_EVENT_END(rtModelExecute, "GraphExcute::rtModelExecute"));
+
+    GE_TIMESTAMP_START(rtStreamSynchronize);
+    GELOGI("rtStreamSynchronize start.");
+    rt_ret = rtStreamSynchronize(model->rt_model_stream_);
+    if (rt_ret == kEndOfSequence || rt_ret == kEndOfSequenceNew) {
+      seq_end_flag = true;
+    }
+    if (rt_ret == kModelAbortNormal || rt_ret == kModelAbortNormalNew) {
+      GELOGI("The model with multiple datasets aborts normally.");
     } else {
-      GE_TIMESTAMP_START(rtModelExecute);
-      GELOGI("rtModelExecute start.");
-      rt_ret = rtModelExecute(model->rt_model_handle_, model->rt_model_stream_, 0);
-      GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, rslt_flg = false;
-                      (void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput());
-                      CsaInteract::GetInstance().WriteErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC);
-                      continue);
-      GELOGI("rtModelExecute end");
-      GE_IF_BOOL_EXEC(model->is_first_execute_, GE_TIMESTAMP_EVENT_END(rtModelExecute, "GraphExcute::rtModelExecute"));
-
-      GE_TIMESTAMP_START(rtStreamSynchronize);
-      GELOGI("rtStreamSynchronize start.");
-      rt_ret = rtStreamSynchronize(model->rt_model_stream_);
-      if (rt_ret == kEndOfSequence || rt_ret == kEndOfSequenceNew) {
-        seq_end_flag = true;
-      }
-      if (rt_ret == kModelAbortNormal || rt_ret == kModelAbortNormalNew) {
-        GELOGI("The model with multiple datasets aborts normally.");
-      } else {
-        GE_IF_BOOL_EXEC(
-          rt_ret != RT_ERROR_NONE, rslt_flg = false; GELOGI("seq_end_flg: %d", seq_end_flag);
-          (void)model->ReturnResult(current_data.index, false, seq_end_flag,
-                                    data_wrapper->GetOutput());  // [No need to check value]
-          CsaInteract::GetInstance().StoreInternalErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC);
-          continue);
-      }
-
-      GELOGI("rtStreamSynchronize end.");
-      GE_IF_BOOL_EXEC(model->is_first_execute_,
-                      GE_TIMESTAMP_EVENT_END(rtStreamSynchronize, "GraphExcute::Wait for rtStreamSynchronize"));
-      GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), model->SetProfileTime(MODEL_INFER_END));
+      GE_IF_BOOL_EXEC(
+        rt_ret != RT_ERROR_NONE, rslt_flg = false; GELOGI("seq_end_flg: %d", seq_end_flag);
+        (void)model->ReturnResult(current_data.index, false, seq_end_flag,
+                                  data_wrapper->GetOutput());  // [No need to check value]
+        CsaInteract::GetInstance().StoreInternalErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC);
+        continue);
     }
 
+    GELOGI("rtStreamSynchronize end.");
+    GE_IF_BOOL_EXEC(model->is_first_execute_,
+                    GE_TIMESTAMP_EVENT_END(rtStreamSynchronize, "GraphExcute::Wait for rtStreamSynchronize"));
+    GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), model->SetProfileTime(MODEL_INFER_END));
     GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(),
                     model->SetProfileTime(MODEL_AFTER_PROC_START));
     GE_TIMESTAMP_START(ReturnResult3);
     // copy output data from device to host
-    GE_IF_BOOL_EXEC(!model->output_op_list_.empty(),
+    GE_IF_BOOL_EXEC(!model->output_addrs_list_.empty(),
                     (void)model->ReturnResult(current_data.index, rslt_flg, false, data_wrapper->GetOutput()))
     // copy output data from device to host for variable graph
-    GE_IF_BOOL_EXEC(model->output_op_list_.empty(), (void)model->ReturnNoOutput(current_data.index));
+    GE_IF_BOOL_EXEC(model->output_addrs_list_.empty(), (void)model->ReturnNoOutput(current_data.index));
     GE_IF_BOOL_EXEC(model->is_first_execute_,
                     GE_TIMESTAMP_EVENT_END(ReturnResult3, "GraphExcute::CopyDataFromDeviceToHost"));
     GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(),
@@ -2998,49 +2872,83 @@ void DavinciModel::UnbindTaskSinkStream() {
   }
 }
 
+void *DavinciModel::GetRunAddress(void *addr) const {
+  if (fixed_mem_base_ == reinterpret_cast<uintptr_t>(mem_base_)) {
+    return addr;
+  }
+
+  uintptr_t ptr = reinterpret_cast<uintptr_t>(addr);
+  if ((fixed_mem_base_ <= ptr) && (ptr < fixed_mem_base_ + runtime_param_.mem_size)) {
+    return mem_base_ + (ptr - fixed_mem_base_);
+  } else {
+    return addr;
+  }
+}
+
 Status DavinciModel::CreateKnownZeroCopyMap(const vector<void *> &inputs, const vector<void *> &outputs) {
-  GELOGI("DavinciModel::CreateKnownZeroCopyMap in.");
-  if (inputs.size() > data_op_list_.size()) {
-    GELOGE(FAILED, "input data addr %zu should less than input op number %zu.", inputs.size(), data_op_list_.size());
+  GELOGI("in, inputs size: %zu, input addr size: %zu, outputs size: %zu, output addr size: %zu",
+         inputs.size(), input_addrs_list_.size(), outputs.size(), output_addrs_list_.size());
+  if (inputs.size() > input_addrs_list_.size()) {
+    GELOGE(FAILED, "input data addr %zu should less than input op num %zu.", inputs.size(), input_addrs_list_.size());
     return FAILED;
   }
   // remove zero copy addr in last iteration
-  knonw_input_data_info_.clear();
-  knonw_output_data_info_.clear();
+  known_input_data_info_.clear();
+  known_output_data_info_.clear();
   for (size_t i = 0; i < inputs.size(); ++i) {
-    const vector<void *> addr_list = ModelUtils::GetOutputDataAddrs(runtime_param_, data_op_list_[i]);
-    knonw_input_data_info_[addr_list[kDataIndex]] = inputs[i];
-    GELOGI("DavinciModel::CreateKnownZeroCopyMap input %zu,v addr %p,p addr %p .", i, addr_list[kDataIndex], inputs[i]);
+    const vector<void *> &addr_list = input_addrs_list_[i];
+    void *addr = GetRunAddress(addr_list[kDataIndex]);
+    known_input_data_info_[addr] = inputs[i];
+    GELOGI("input %zu, v addr %p, r addr %p, p addr %p", i, addr_list[kDataIndex], addr, inputs[i]);
   }
-  if (output_op_list_.size() < kOutputNum) {
-    GELOGW("output op num in graph is %zu.", output_op_list_.size());
+
+  if (output_addrs_list_.empty()) {
+    GELOGW("output op num in graph is %zu", output_addrs_list_.size());
     return SUCCESS;
   }
-  const vector<void *> addr_list = ModelUtils::GetInputDataAddrs(runtime_param_, output_op_list_[kDataIndex]);
+  const vector<void *> &addr_list = output_addrs_list_.front();
   for (size_t i = 0; i < addr_list.size() && i < outputs.size(); ++i) {
-    knonw_output_data_info_[addr_list[i]] = outputs[i];
-    GELOGI("DavinciModel::CreateKnownZeroCopyMap output %zu,v addr %p,p addr %p .", i, addr_list[i], outputs[i]);
+    void *addr = GetRunAddress(addr_list[i]);
+    known_output_data_info_[addr] = outputs[i];
+    GELOGI("output %zu, v addr %p, r addr %p, p addr %p", i, addr_list[i], addr, outputs[i]);
   }
-  GELOGI("DavinciModel::CreateKnownZeroCopyMap success.");
+
+  GELOGI("success, known input data info size: %zu, known output data info size: %zu",
+         known_input_data_info_.size(), known_output_data_info_.size());
   return SUCCESS;
 }
 
-Status DavinciModel::UpdateKnownZeroCopyAddr() {
-  for (size_t i = 0; i < total_io_addrs_.size(); ++i) {
-    auto it_in = knonw_input_data_info_.find(total_io_addrs_[i]);
-    if (it_in != knonw_input_data_info_.end()) {
-      GELOGI("DavinciModel::UpdateKnownZeroCopyAddr input %zu,v addr %p,p addr %p .", i, total_io_addrs_[i],
-             knonw_input_data_info_.at(total_io_addrs_[i]));
-      total_io_addrs_[i] = knonw_input_data_info_.at(total_io_addrs_[i]);
+void DavinciModel::SetTotalIOAddrs(const vector<void *> &io_addrs) {
+  if (fixed_mem_base_ == reinterpret_cast<uintptr_t>(mem_base_)) {
+    total_io_addrs_.insert(total_io_addrs_.end(), io_addrs.begin(), io_addrs.end());
+    return;
+  }
+
+  for (size_t i = 0; i < io_addrs.size(); ++i) {
+    total_io_addrs_.emplace_back(GetRunAddress(io_addrs[i]));
+  }
+}
+
+Status DavinciModel::UpdateKnownZeroCopyAddr(vector<void *> &total_io_addrs, bool update_args) {
+  if (fixed_mem_base_ != reinterpret_cast<uintptr_t>(mem_base_) && update_args) {
+    for (size_t i = 0; i < total_io_addrs.size(); ++i) {
+      total_io_addrs[i] = GetRunAddress(total_io_addrs[i]);
+    }
+  }
+
+  for (size_t i = 0; i < total_io_addrs.size(); ++i) {
+    auto it_in = known_input_data_info_.find(total_io_addrs[i]);
+    if (it_in != known_input_data_info_.end()) {
+      GELOGI("input %zu, v addr %p, p addr %p", i, total_io_addrs[i], known_input_data_info_.at(total_io_addrs[i]));
+      total_io_addrs[i] = known_input_data_info_.at(total_io_addrs[i]);
     }
-    auto it_out = knonw_output_data_info_.find(total_io_addrs_[i]);
-    if (it_out != knonw_output_data_info_.end()) {
-      GELOGI("DavinciModel::UpdateKnownZeroCopyAddr output %zu,v addr %p,p addr %p .", i, total_io_addrs_[i],
-             knonw_output_data_info_.at(total_io_addrs_[i]));
-      total_io_addrs_[i] = knonw_output_data_info_.at(total_io_addrs_[i]);
+    auto it_out = known_output_data_info_.find(total_io_addrs[i]);
+    if (it_out != known_output_data_info_.end()) {
+      GELOGI("output %zu, v addr %p, p addr %p", i, total_io_addrs[i], known_output_data_info_.at(total_io_addrs[i]));
+      total_io_addrs[i] = known_output_data_info_.at(total_io_addrs[i]);
     }
   }
-  GELOGI("DavinciModel::UpdateKnownZeroCopyAddr success.");
+  GELOGI("success, total io addrs size: %zu", total_io_addrs.size());
   return SUCCESS;
 }
 
@@ -3066,7 +2974,7 @@ Status DavinciModel::UpdateKnownNodeArgs(const vector<void *> &inputs, const vec
   } else {
     total_io_addrs_ = orig_total_io_addrs_;
   }
-  GE_CHK_STATUS_RET(UpdateKnownZeroCopyAddr(), "DavinciModel::UpdateKnownZeroCopyAddr failed.");
+  GE_CHK_STATUS_RET(UpdateKnownZeroCopyAddr(total_io_addrs_, false), "DavinciModel::UpdateKnownZeroCopyAddr failed.");
 
   if (total_args_size_ == 0) {
     GELOGW("DavinciModel::UpdateKnownNodeArgs device args %p, dst size %u, pass rtMemcpy.", args_, total_args_size_);
@@ -3133,7 +3041,14 @@ Status DavinciModel::MallocKnownArgs() {
     GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret);
     return RT_ERROR_TO_GE_STATUS(rt_ret);
   }
-
+  // malloc dynamic and static hybrid memory
+  if (total_hybrid_args_size_ != 0) {
+    rt_ret = rtMalloc(&hybrid_addrs_, total_hybrid_args_size_, RT_MEMORY_HBM);
+    if (rt_ret != RT_ERROR_NONE) {
+      GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret);
+      return RT_ERROR_TO_GE_STATUS(rt_ret);
+    }
+  }
   // malloc fixed addr memory, eg: rts op
   if (total_fixed_addr_size_ != 0) {
     GELOGI("Begin to allocate fixed addr.");
@@ -3170,21 +3085,27 @@ Status DavinciModel::DistributeTask() {
 
   const auto &model_task_def = ge_model_->GetModelTaskDefPtr();
   for (size_t task_index = 0; task_index < task_list_.size(); ++task_index) {
+    auto &task_def = model_task_def->task(task_index);
     auto &task = task_list_.at(task_index);
     GE_CHK_STATUS_RET(task->Distribute(), "Task[%zu] distribute fail", task_index);
     // for data dump
-    auto op_index = std::max(model_task_def->task(task_index).kernel().context().op_index(),
-                             model_task_def->task(task_index).kernel_ex().op_index());
+    auto op_index = std::max(task_def.kernel().context().op_index(),
+                             task_def.kernel_ex().op_index());
     OpDescPtr op = GetOpByIndex(op_index);
     GE_CHECK_NOTNULL(op);
 
-    SaveDumpOpInfo(runtime_param_, op, task->GetTaskID(), task->GetStreamId());
     if (reinterpret_cast<void *>(task->GetDumpArgs()) != nullptr) {
       bool call_dump = GetDumpProperties().IsLayerNeedDump(name_, om_name_, op->GetName()) && task->CallSaveDumpInfo();
       if (call_dump || is_op_debug_reg_) {
         SaveDumpTask(task->GetTaskID(), task->GetStreamId(), op, task->GetDumpArgs());
       }
     }
+
+    auto task_type = static_cast<rtModelTaskType_t>(task_def.type());
+    bool no_need_profiling = (task_type != RT_MODEL_TASK_KERNEL) && (task_type != RT_MODEL_TASK_KERNEL_EX);
+    GE_IF_BOOL_EXEC(no_need_profiling, continue);
+
+    SaveDumpOpInfo(runtime_param_, op, task->GetTaskID(), task->GetStreamId());
     // Load task info for profiling
     TaskDescInfo task_desc_info;
     if (!om_name_.empty()) {
@@ -3193,9 +3114,13 @@ Status DavinciModel::DistributeTask() {
       task_desc_info.model_name = name_;
     }
     task_desc_info.op_name = op->GetName();
-    task_desc_info.block_dim = model_task_def->task(task_index).kernel().block_dim();
+    task_desc_info.block_dim = task_def.kernel().block_dim();
     task_desc_info.task_id = task->GetTaskID();
     task_desc_info.stream_id = task->GetStreamId();
+    task_desc_info.shape_type = "static";
+    task_desc_info.cur_iter_num = 0;
+    profiler_report_op_info_[task_desc_info.op_name] =
+      std::pair<uint32_t, uint32_t>(task_desc_info.task_id, task_desc_info.stream_id);
     task_desc_info_.emplace_back(task_desc_info);
     if (flag) {
       if (task->GetSktTaskID() != 0xFFFFFFFF) {
@@ -3203,6 +3128,8 @@ Status DavinciModel::DistributeTask() {
         string op_name = "super_kernel_" + to_string(task_index);
         task_desc_info.op_name = op_name;
         task_desc_info.task_id = task->GetSktTaskID();
+        profiler_report_op_info_[task_desc_info.op_name] =
+          std::pair<uint32_t, uint32_t>(task_desc_info.task_id, task_desc_info.stream_id);
         task_desc_info_.emplace_back(task_desc_info);
       }
     }
@@ -3283,27 +3210,20 @@ void DavinciModel::SetZeroCopyAddr(const OpDescPtr &op_desc, const std::vector<v
 
     for (auto &input_outside_addrs : new_input_outside_addrs_) {
       ZeroCopyOffset &input_outside = input_outside_addrs.second;
-      bool ret = input_outside.SetOutsideAddrsValue(zero_copy_task, outside_addrs[i], args, offset + i * kAddrLen);
-      if (ret) {
-        void *args_val = static_cast<uint8_t *>(args) + offset + i * kAddrLen;
-        SetBatchLabelAddr(op_desc, reinterpret_cast<uintptr_t>(args_val));
-      }
+      input_outside.SetOutsideAddrsValue(zero_copy_task, outside_addrs[i], args, offset + i * kAddrLen);
     }
 
     for (auto &output_outside_addrs : new_output_outside_addrs_) {
       ZeroCopyOffset &output_outside = output_outside_addrs.second;
-      bool ret = output_outside.SetOutsideAddrsValue(zero_copy_task, outside_addrs[i], args, offset + i * kAddrLen);
-      if (ret) {
-        void *args_val = static_cast<uint8_t *>(args) + offset + i * kAddrLen;
-        SetBatchLabelAddr(op_desc, reinterpret_cast<uintptr_t>(args_val));
-      }
+      output_outside.SetOutsideAddrsValue(zero_copy_task, outside_addrs[i], args, offset + i * kAddrLen);
     }
   }
-  auto it = zero_copy_op_id_batch_label_.find(op_desc->GetId());
-  if (it == zero_copy_op_id_batch_label_.end()) {
+
+  string batch_label;
+  if (!AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label) || batch_label.empty()) {
     zero_copy_task.SetBatchLabel(kDefaultBatchLable);
   } else {
-    zero_copy_task.SetBatchLabel(it->second);
+    zero_copy_task.SetBatchLabel(batch_label);
   }
 
   std::lock_guard<std::mutex> lock(outside_addrs_mutex_);
@@ -3313,27 +3233,6 @@ void DavinciModel::SetZeroCopyAddr(const OpDescPtr &op_desc, const std::vector<v
   }
 }
 
-void DavinciModel::SetBatchLabelAddr(const OpDescPtr &op_desc, uintptr_t addr) {
-  // Establish a mapping between batch label and zero copy address for multi-batch scenes
-  auto it = zero_copy_op_id_batch_label_.find(op_desc->GetId());
-  if (it == zero_copy_op_id_batch_label_.end()) {
-    return;
-  }
-
-  const string &batch_label = it->second;
-  auto iter = zero_copy_batch_label_addrs_.find(batch_label);
-  if (iter != zero_copy_batch_label_addrs_.end()) {
-    iter->second.insert(addr);
-    GELOGD("[ZCPY] Set zero copy batch label and addrs success, batch label: %s, op name:%s.", batch_label.c_str(),
-           op_desc->GetName().c_str());
-  } else {
-    set<uintptr_t> addrs = {addr};
-    zero_copy_batch_label_addrs_.emplace(pair<string, set<uintptr_t>>(batch_label, addrs));
-    GELOGD("[ZCPY] New added zero copy batch label and addrs success, batch label: %s, op name:%s.",
-           batch_label.c_str(), op_desc->GetName().c_str());
-  }
-}
-
 ///
 /// @ingroup ge
 /// @brief Copy Check input size and model op size.
@@ -3354,15 +3253,8 @@ bool DavinciModel::CheckInputAndModelSize(const int64_t &input_size, const int64
         "MAY cause inference result ERROR, please check model input",
         input_size, op_size);
   }
-  bool is_dynamic_aipp = false;
-  for (const auto &op_desc : data_op_list_) {
-    if (op_desc->GetType() == AIPP_DATA_TYPE) {
-      GELOGI("This is dynamic aipp model.");
-      is_dynamic_aipp = true;
-      break;
-    }
-  }
-  if (is_dynamic_aipp) {
+
+  if (is_dynamic_aipp_) {
     GELOGI("This is dynamic aipp model, no need to judge smaller input size");
     return true;
   }
@@ -3391,14 +3283,14 @@ bool DavinciModel::CheckInputAndModelSize(const int64_t &input_size, const int64
 ///
 Status DavinciModel::CopyModelData(const InputData &input_data, OutputData &output_data, bool is_dynamic) {
   if (UpdateIoTaskArgs(new_input_data_info_, true, input_data.blobs, is_dynamic, input_data.batch_label) != SUCCESS) {
-    GELOGE(PARAM_INVALID, "[ZCPY] Update input data to model failed.");
-    return PARAM_INVALID;
+    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[ZCPY] Update input data to model failed.");
+    return ACL_ERROR_GE_PARAM_INVALID;
   }
 
   if (UpdateIoTaskArgs(new_output_data_info_, false, output_data.blobs, is_dynamic, input_data.batch_label) !=
       SUCCESS) {
-    GELOGE(PARAM_INVALID, "[ZCPY] Update output data to model failed.");
-    return PARAM_INVALID;
+    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "[ZCPY] Update output data to model failed.");
+    return ACL_ERROR_GE_PARAM_INVALID;
   }
 
   for (ZeroCopyTask &task : zero_copy_tasks_) {
@@ -3444,7 +3336,7 @@ Status DavinciModel::UpdateIoTaskArgs(const std::map<uint32_t, ZeroCopyOffset> &
     }
 
     if (!CheckInputAndModelSize(buffer.length, data.second.GetDataSize(), is_dynamic)) {
-      GELOGE(FAILED, "Check input size and model size failed");
+      GELOGE(FAILED, "Check input size and model size failed, op[%s]", data.second.GetOpName().c_str());
       return FAILED;
     }
 
@@ -3467,15 +3359,15 @@ Status DavinciModel::UpdateIoTaskArgs(const std::map<uint32_t, ZeroCopyOffset> &
       void *addr = data.second.GetDataInfo().at(count).second;
       void *buffer_addr = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(buffer.data) +
                                                    data.second.GetRelativeOffset().at(count));
-      GELOGI("[ZCPY] Copy %s blobs_index %u, virtual_addr: %p, size: %ld, user_data_addr: %p", input_or_output.c_str(),
-             data.first, addr, size, buffer_addr);
+      GELOGI("[ZCPY] Copy %s blobs_index %u, virtual_addr: %p, size: %ld, user_data_addr: %p, batch_label: %s",
+             input_or_output.c_str(), data.first, addr, size, buffer_addr, batch_label.c_str());
       // For input data, just copy for rts task.
       for (ZeroCopyTask &task : zero_copy_tasks_) {
         if (task.GetBatchLabel() != kDefaultBatchLable && task.GetBatchLabel() != batch_label) {
           continue;
         }
         uintptr_t addr_val = reinterpret_cast<uintptr_t>(addr);
-        if (task.UpdateTaskParam(addr_val, buffer_addr, zero_copy_batch_label_addrs_, batch_label) != SUCCESS) {
+        if (task.UpdateTaskParam(addr_val, buffer_addr) != SUCCESS) {
           return FAILED;
         }
       }
@@ -3837,9 +3729,6 @@ Status DavinciModel::NnExecute(rtStream_t stream, bool async_mode, const InputDa
   GELOGD("Model Run begin, model id:%u, data index:%u, flag:%d.", model_id_, input_data.index, is_async_mode_);
   GE_CHK_STATUS_RET(InitModelStream(stream), "Init model stream failed.");
   is_dynamic_ = input_data.is_dynamic_batch;
-  if (!is_dynamic_) {
-    zero_copy_batch_label_addrs_.clear();
-  }
 
   GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), SetProfileTime(MODEL_PRE_PROC_START));
   Status ret = CopyModelData(input_data, output_data, is_dynamic_);
@@ -3861,7 +3750,8 @@ Status DavinciModel::NnExecute(rtStream_t stream, bool async_mode, const InputDa
   if (!is_async_mode_) {
     GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), SetProfileTime(MODEL_AFTER_PROC_START));
     ret = CopyOutputData(input_data.index, output_data, RT_MEMCPY_DEVICE_TO_DEVICE);
-    GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, "Copy Output data to user failed.");
+    GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ACL_ERROR_GE_INTERNAL_ERROR,
+        "Copy Output data to user failed.");
     GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingModelExecuteOn(), SetProfileTime(MODEL_AFTER_PROC_END));
   }
 
@@ -4044,13 +3934,12 @@ Status DavinciModel::TransAllVarData(ComputeGraphPtr &graph, uint32_t graph_id)
   return SUCCESS;
 }
 
-void DavinciModel::SetDataDumperArgs(const ComputeGraphPtr &compute_graph) {
+void DavinciModel::SetDataDumperArgs(const ComputeGraphPtr &graph, const map<string, OpDescPtr> &variable_by_name) {
   data_dumper_.SetModelName(name_);
   data_dumper_.SetModelId(model_id_);
   data_dumper_.SetOmName(om_name_);
-  data_dumper_.SetComputeGraph(compute_graph);
+  data_dumper_.SetComputeGraph(graph);
   data_dumper_.SetRefInfo(saved_task_addrs_);
-  data_dumper_.SetL1FusionAddr(l1_fusion_addr_);
 
   int32_t device_id = 0;
   rtError_t rt_ret = rtGetDevice(&device_id);
@@ -4061,22 +3950,23 @@ void DavinciModel::SetDataDumperArgs(const ComputeGraphPtr &compute_graph) {
   data_dumper_.SetDeviceId(device_id);
 
   // set loop count addr
-  auto get_var_addr = [](const OpDescPtr &op, const RuntimeParam &runtime_param) -> void * {
-    if (op != nullptr) {
-      auto v_output_size = ModelUtils::GetOutputSize(op);
-      auto v_output_addr = ModelUtils::GetOutputDataAddrs(runtime_param, op);
-      if (v_output_size.empty() || v_output_addr.empty()) {
+  auto get_var_addr = [&](const string &name) -> void *{
+    const auto it = variable_by_name.find(name);
+    if (it != variable_by_name.end()) {
+      const auto output_sizes = ModelUtils::GetOutputSize(it->second);
+      const auto output_addrs = ModelUtils::GetOutputDataAddrs(runtime_param_, it->second);
+      if (output_sizes.empty() || output_addrs.empty()) {
         return nullptr;
       }
-      return v_output_addr[0];
+      return output_addrs[0];
     }
-    GELOGD("op is null.");
+    GELOGD("op: %s is null.", name.c_str());
     return nullptr;
   };
 
-  data_dumper_.SetLoopAddr(get_var_addr(GetVariableOp(NODE_NAME_GLOBAL_STEP), runtime_param_),
-                           get_var_addr(GetVariableOp(NODE_NAME_FLOWCTRL_LOOP_PER_ITER), runtime_param_),
-                           get_var_addr(GetVariableOp(NODE_NAME_FLOWCTRL_LOOP_COND), runtime_param_));
+  data_dumper_.SetLoopAddr(get_var_addr(NODE_NAME_GLOBAL_STEP),
+                           get_var_addr(NODE_NAME_FLOWCTRL_LOOP_PER_ITER),
+                           get_var_addr(NODE_NAME_FLOWCTRL_LOOP_COND));
 }
 
 uint32_t DavinciModel::GetFlowctrlIndex(uint32_t op_index) {
@@ -4111,7 +4001,15 @@ Status DavinciModel::GetComputeGraphInfo(vector<ComputeGraphDescInfo> &graph_des
     compute_graph_info.output_format = op_desc.output_format;
     compute_graph_info.output_shape = op_desc.output_shape;
     compute_graph_info.output_data_type = op_desc.output_data_type;
-
+    uint32_t task_id = 0;
+    uint32_t stream_id = 0;
+    auto iter = profiler_report_op_info_.find(op_desc.op_name);
+    if (iter != profiler_report_op_info_.end()) {
+      task_id = iter->second.first;
+      stream_id = iter->second.second;
+    }
+    compute_graph_info.task_id = task_id;
+    compute_graph_info.stream_id = stream_id;
     graph_desc_info.emplace_back(compute_graph_info);
   }
   return SUCCESS;
@@ -4124,25 +4022,45 @@ void DavinciModel::SetTotalFixedAddrsSize(string tensor_name, int64_t fix_addr_s
   }
 }
 
-Status DavinciModel::GetOrigInputInfo(uint32_t index, OriginInputInfo &orig_input_info) {
-  GE_CHK_BOOL_RET_STATUS(index < data_op_list_.size(), PARAM_INVALID, "Index %u is invalid.", index);
-  OpDescPtr data_op = data_op_list_[index];
-  if (!data_op->HasAttr(ATTR_NAME_AIPP_INPUTS) || !data_op->HasAttr(ATTR_NAME_AIPP_OUTPUTS)) {
-    GELOGE(ACL_ERROR_GE_AIPP_NOT_EXIST, "GetOrigInputInfo: there is not AIPP related with index %u.", index);
-    return ACL_ERROR_GE_AIPP_NOT_EXIST;
+Status DavinciModel::InitOrigInputInfo(uint32_t index, const OpDescPtr &op_desc) {
+  if (!op_desc->HasAttr(ATTR_NAME_AIPP_INPUTS) || !op_desc->HasAttr(ATTR_NAME_AIPP_OUTPUTS)) {
+    GELOGI("there is not AIPP related with index %u, node: %s.", index, op_desc->GetName().c_str());
+    return SUCCESS;
   }
 
-  vector<std::string> inputs;
-  if (AttrUtils::GetListStr(data_op, ATTR_NAME_AIPP_INPUTS, inputs) && !inputs.empty()) {
+  vector<string> inputs;
+  if (AttrUtils::GetListStr(op_desc, ATTR_NAME_AIPP_INPUTS, inputs) && !inputs.empty()) {
     std::string input = inputs[kAippOriginInputIndex];
-    GELOGI("GetOrigInputInfo: origin input str: %s", input.c_str());
+    GELOGI("origin input str: %s", input.c_str());
     std::vector<std::string> infos = ge::StringUtils::Split(input, ':');
     if (infos.size() != kAippInfoNum) {
-      GELOGW("origin input str is invalid.");
+      GELOGE(ACL_ERROR_GE_AIPP_MODE_INVALID, "origin input str is invalid[%zu, %u].", infos.size(), kAippInfoNum);
+      return ACL_ERROR_GE_AIPP_MODE_INVALID;
     }
-    orig_input_info.format = TypeUtils::SerialStringToFormat(infos[kAippInfoFormat]);
-    orig_input_info.data_type = TypeUtils::SerialStringToDataType(infos[kAippInfoDataType]);
-    orig_input_info.dim_num = std::strtol(infos[kAippInfoDimNum].c_str(), nullptr, kDecimal);
+
+    OriginInputInfo input_info;
+    input_info.format = TypeUtils::SerialStringToFormat(infos[kAippInfoFormat]);
+    input_info.data_type = TypeUtils::SerialStringToDataType(infos[kAippInfoDataType]);
+    input_info.dim_num = std::strtol(infos[kAippInfoDimNum].c_str(), nullptr, kDecimal);
+    orig_input_info_[index] = input_info;
+  } else {
+    OriginInputInfo input_info = { FORMAT_RESERVED, DT_UNDEFINED, 0 };
+    orig_input_info_[index] = input_info;
+  }
+
+  return SUCCESS;
+}
+
+Status DavinciModel::GetOrigInputInfo(uint32_t index, OriginInputInfo &orig_input_info) const {
+  const auto it = orig_input_info_.find(index);
+  if (it == orig_input_info_.end()) {
+    GELOGE(ACL_ERROR_GE_AIPP_NOT_EXIST, "there is not AIPP related with index %u.", index);
+    return ACL_ERROR_GE_AIPP_NOT_EXIST;
+  }
+
+  const OriginInputInfo &input_info = it->second;
+  if (input_info.format != FORMAT_RESERVED || input_info.data_type != DT_UNDEFINED) {
+    orig_input_info = input_info;
   }
 
   return SUCCESS;
@@ -4152,7 +4070,8 @@ void DavinciModel::ParseAIPPInfo(std::string in_out_info, InputOutputDims &dims_
   GELOGI("ParseAIPPInfo: origin str: %s", in_out_info.c_str());
   std::vector<std::string> infos = ge::StringUtils::Split(in_out_info, ':');
   if (infos.size() != kAippInfoNum) {
-    GELOGW("origin input str is invalid.");
+    GELOGE(ACL_ERROR_GE_AIPP_MODE_INVALID, "origin input str is invalid[%zu, %u].", infos.size(), kAippInfoNum);
+    return;
   }
   dims_info.name = infos[kAippInfoTensorName];
   dims_info.size = std::strtol(infos[kAippInfoTensorSize].c_str(), nullptr, kDecimal);
@@ -4167,47 +4086,58 @@ void DavinciModel::ParseAIPPInfo(std::string in_out_info, InputOutputDims &dims_
   }
 }
 
-Status DavinciModel::GetAllAippInputOutputDims(uint32_t index, std::vector<InputOutputDims> &input_dims,
-                                               std::vector<InputOutputDims> &output_dims) {
-  GE_CHK_BOOL_RET_STATUS(index < data_op_list_.size(), PARAM_INVALID, "Index %u is invalid.", index);
-  OpDescPtr data_op = data_op_list_[index];
-  if (!data_op->HasAttr(ATTR_NAME_AIPP_INPUTS) || !data_op->HasAttr(ATTR_NAME_AIPP_OUTPUTS)) {
-    GELOGE(ACL_ERROR_GE_AIPP_NOT_EXIST, "GetAllAippInputOutputDims: there is not AIPP related with index %u.", index);
-    return ACL_ERROR_GE_AIPP_NOT_EXIST;
+Status DavinciModel::InitAippInputOutputDims(uint32_t index, const OpDescPtr &op_desc) {
+  if (!op_desc->HasAttr(ATTR_NAME_AIPP_INPUTS) || !op_desc->HasAttr(ATTR_NAME_AIPP_OUTPUTS)) {
+    GELOGI("there is not AIPP related with index %u.", index);
+    return SUCCESS;
   }
 
-  vector<std::string> inputs;
-  if (AttrUtils::GetListStr(data_op, ATTR_NAME_AIPP_INPUTS, inputs) && !inputs.empty()) {
-    GELOGI("GetAllAippInputOutputDims: Data: %s has %zu related aippInfo.", data_op->GetName().c_str(), inputs.size());
+  vector<string> inputs;
+  vector<InputOutputDims> input_dims;
+  if (AttrUtils::GetListStr(op_desc, ATTR_NAME_AIPP_INPUTS, inputs) && !inputs.empty()) {
+    GELOGI("Data: %s has %zu related aippInfo.", op_desc->GetName().c_str(), inputs.size());
     for (auto it : inputs) {
       InputOutputDims input_info;
       ParseAIPPInfo(it, input_info);
       input_dims.emplace_back(input_info);
-      GELOGD("GetAllAippInputOutputDims Aipp origin input dims info: %s", it.c_str());
+      GELOGD("Aipp origin input dims info: %s", it.c_str());
 
-      ConstGeTensorDescPtr data_input_desc = data_op->GetInputDescPtr(kDataIndex);
+      ConstGeTensorDescPtr data_input_desc = op_desc->GetInputDescPtr(kDataIndex);
       int64_t data_input_size;
-      (void)TensorUtils::GetSize(*(data_op->GetInputDescPtr(kDataIndex)), data_input_size);
-      GELOGD(
-          "GetAllAippInputOutputDims related Data[%d]: tensor_name is %s, dim_num is %zu, tensor_size: %zu, format: "
-          "%s, data_type: %s, shape: %s .",
-          index, data_op->GetName().c_str(), data_input_desc->GetShape().GetDimNum(), data_input_size,
-          TypeUtils::FormatToSerialString(data_input_desc->GetFormat()).c_str(),
-          TypeUtils::DataTypeToSerialString(data_input_desc->GetDataType()).c_str(),
-          formats::JoinToString(data_input_desc->GetShape().GetDims()).c_str());
+      (void)TensorUtils::GetSize(*(op_desc->GetInputDescPtr(kDataIndex)), data_input_size);
+      GELOGD("related Data[%d]: tensor_name: %s, dim_num: %zu, tensor_size: %zu, format: %s, data_type: %s, shape: %s",
+        index, op_desc->GetName().c_str(), data_input_desc->GetShape().GetDimNum(), data_input_size,
+        TypeUtils::FormatToSerialString(data_input_desc->GetFormat()).c_str(),
+        TypeUtils::DataTypeToSerialString(data_input_desc->GetDataType()).c_str(),
+        formats::JoinToString(data_input_desc->GetShape().GetDims()).c_str());
     }
   }
 
-  vector<std::string> outputs;
-  if (AttrUtils::GetListStr(data_op, ATTR_NAME_AIPP_OUTPUTS, outputs) && !outputs.empty()) {
+  vector<string> outputs;
+  vector<InputOutputDims> output_dims;
+  if (AttrUtils::GetListStr(op_desc, ATTR_NAME_AIPP_OUTPUTS, outputs) && !outputs.empty()) {
     for (auto it : outputs) {
       InputOutputDims output_info;
       ParseAIPPInfo(it, output_info);
       output_dims.emplace_back(output_info);
-      GELOGD("GetAllAippInputOutputDims Aipp output dims info: %s", it.c_str());
+      GELOGD("Aipp output dims info: %s", it.c_str());
     }
   }
 
+  aipp_dims_info_[index] = { input_dims, input_dims };
+  return SUCCESS;
+}
+
+Status DavinciModel::GetAllAippInputOutputDims(uint32_t index, vector<InputOutputDims> &input_dims,
+                                               vector<InputOutputDims> &output_dims) const {
+  const auto it = aipp_dims_info_.find(index);
+  if (it == aipp_dims_info_.end()) {
+    GELOGE(ACL_ERROR_GE_AIPP_NOT_EXIST, "there is not AIPP related with index %u.", index);
+    return ACL_ERROR_GE_AIPP_NOT_EXIST;
+  }
+
+  input_dims = it->second.first;
+  output_dims = it->second.second;
   return SUCCESS;
 }
 
@@ -4219,4 +4149,28 @@ int64_t DavinciModel::GetFixedAddrsSize(string tensor_name) {
   }
 }
 
+Status DavinciModel::InitL1DataDumperArgs() {
+  auto all_dump_model = GetDumpProperties().GetAllDumpModel();
+  bool find_by_om_name = all_dump_model.find(om_name_) != all_dump_model.end();
+  bool find_by_model_name = all_dump_model.find(name_) != all_dump_model.end();
+  bool dump_l1fusion_op =
+    (all_dump_model.find(ge::DUMP_ALL_MODEL) != all_dump_model.end()) || find_by_om_name || find_by_model_name;
+  if (dump_l1fusion_op) {
+    // malloc 2M for dump l1fusion op
+    GE_CHK_RT_RET(rtMalloc(&l1_fusion_addr_, kDumpL1FusionOpMByteSize, RT_MEMORY_DDR));
+
+    // send l1fusion dump addr to rts
+    if (rtDumpAddrSet(rt_model_handle_, l1_fusion_addr_, kDumpL1FusionOpMByteSize, kDumpFlagOfL1Fusion) !=
+        RT_ERROR_NONE) {
+      // l1_fusion_addr_ will be free when DavinciModel destruct
+      GELOGE(FAILED, "Call rtDumpAddrSet failed");
+      return FAILED;
+    }
+
+    // set addr for l1 data dump
+    data_dumper_.SetL1FusionAddr(l1_fusion_addr_);
+  }
+  return SUCCESS;
+}
+
 }  // namespace ge
diff --git a/ge/graph/load/new_model_manager/davinci_model.h b/ge/graph/load/new_model_manager/davinci_model.h
index 650f19eb..4108f2c7 100755
--- a/ge/graph/load/new_model_manager/davinci_model.h
+++ b/ge/graph/load/new_model_manager/davinci_model.h
@@ -49,6 +49,10 @@
 #include "task_info/task_info.h"
 #include "graph/common/local_context.h"
 
+using std::mutex;
+using std::thread;
+using std::multimap;
+
 namespace ge {
 // op debug need 2048 bits buffer
 const size_t kOpDebugMemorySize = 2048UL;
@@ -76,6 +80,39 @@ struct timeInfo {
   int64_t dumpEndTime;
 };
 
+// For super kernel
+struct SuperKernelTaskInfo {
+  uint32_t last_block_dim;
+  uint32_t last_args_size;
+  uint32_t last_task_id;
+  uint32_t last_stream_id;
+  void *last_stream;
+  void *last_sm_desc;
+  vector<void *> kernel_list;
+  vector<void *> arg_list;
+  vector<uint32_t> dump_flag_list;
+  vector<OpDescPtr> op_desc_list;
+  vector<uintptr_t> dump_args_list;
+  uint32_t last_dump_flag;
+  int64_t last_group_key;
+  uintptr_t last_dump_args;
+  OpDescPtr last_op;
+};
+
+struct TaskMemInfo {
+  int64_t input_size{0};
+  int64_t output_size{0};
+  int64_t weight_size{0};
+  int64_t workspace_size{0};
+  int64_t total_size{0};
+};
+
+struct ProfileInfo {
+  FusionOpInfo fusion_info;
+  TaskMemInfo memory_info;
+  uint32_t task_count{0};
+};
+
 enum ExecuteMode {
   INITIALIZATION,
   SYNCHRONIZATION,
@@ -90,7 +127,7 @@ class DavinciModel {
   /// @brief DavinciModel constructor
   /// @author
   ///
-  DavinciModel(int32_t priority, const std::shared_ptr<ModelListener> &listener);
+  DavinciModel(int32_t priority, const shared_ptr<ModelListener> &listener);
 
   ///
   /// @ingroup ge
@@ -120,7 +157,7 @@ class DavinciModel {
   /// @param [in] output_que_ids: input queue ids from user, nums equal NetOutput Op.
   /// @return: 0 for success / others for fail
   ///
-  Status SetQueIds(const std::vector<uint32_t> &input_queue_ids, const std::vector<uint32_t> &output_queue_ids);
+  Status SetQueIds(const vector<uint32_t> &input_queue_ids, const vector<uint32_t> &output_queue_ids);
 
   ///
   /// @ingroup ge
@@ -136,6 +173,20 @@ class DavinciModel {
   ///
   void SetId(uint32_t model_id) { model_id_ = model_id; }
 
+  ///
+  /// @ingroup ge
+  /// @brief Get SubModelId
+  /// @return sub model ID
+  ///
+  uint32_t SubModelId() const { return sub_model_id_; }
+
+  ///
+  /// @ingroup ge
+  /// @brief Get SubModelId
+  /// @return sub model ID
+  ///
+  void SetSubModelId(uint32_t sub_model_id) { sub_model_id_ = sub_model_id; }
+
   static void *Run(DavinciModel *model_pointer);
 
   ///
@@ -190,13 +241,14 @@ class DavinciModel {
   // get total mem size
   size_t TotalMemSize() const { return runtime_param_.mem_size; }
 
-  const std::map<uint32_t, MemInfo> &P2PMemInfos() const {return runtime_param_.memory_infos;}
+  const map<uint32_t, MemInfo> &P2PMemInfos() const { return runtime_param_.memory_infos; }
 
   // model name
   string Name() const { return name_; }
 
   // om_name
   string OmName() const { return om_name_; }
+
   // version
   uint32_t Version() const { return version_; }
 
@@ -222,12 +274,7 @@ class DavinciModel {
 
   Status DestroyThread();
 
-  // Get Data Op.
-  const vector<OpDescPtr> &GetDataList() const { return data_op_list_; }
-
   // get Op
-  const map<uint32_t, OpDescPtr> &GetOpList() const { return op_list_; }
-
   OpDescPtr GetOpByIndex(uint32_t index) const {
     if (op_list_.find(index) == op_list_.end()) {
       return nullptr;
@@ -235,26 +282,16 @@ class DavinciModel {
     return op_list_.at(index);
   }
 
-  OpDescPtr GetVariableOp(const string &name) {
-    for (auto op_desc : variable_op_list_) {
-      if (op_desc != nullptr && op_desc->GetName() == name) {
-        return op_desc;
-      }
-    }
-    return nullptr;
-  }
+  void *GetGlobalStep() const { return global_step_addr_; }
+
   // get task info for profiling
-  const std::vector<TaskDescInfo> &GetTaskDescInfo() const { return task_desc_info_; }
+  const vector<TaskDescInfo> &GetTaskDescInfo() const { return task_desc_info_; }
 
   // get updated task info list
-  std::vector<TaskInfoPtr> GetTaskList() { return task_list_; }
+  vector<TaskInfoPtr> GetTaskList() { return task_list_; }
 
-  ///
-  /// @ingroup ge
-  /// @brief get model input and output format
-  /// @return ccTensorFormat_t current model input and output format
-  ///
-  Format GetFormat();
+  // Modified from KernelTaskInfo.
+  SuperKernelTaskInfo &GetSuperKernelTaskInfo() { return skt_info_; }
 
   rtModel_t GetRtModelHandle() const { return rt_model_handle_; }
 
@@ -289,7 +326,7 @@ class DavinciModel {
   Status GetInputOutputDescInfo(vector<InputOutputDescInfo> &input_desc, vector<InputOutputDescInfo> &output_desc);
 
   Status GetInputOutputDescInfo(vector<InputOutputDescInfo> &input_desc, vector<InputOutputDescInfo> &output_desc,
-                                std::vector<uint32_t> &inputFormats, std::vector<uint32_t> &output_formats);
+                                vector<uint32_t> &input_formats, vector<uint32_t> &output_formats, bool by_dims);
 
   ///
   /// @ingroup ge
@@ -298,7 +335,7 @@ class DavinciModel {
   /// @param [out] dynamic_type
   /// @return execute result
   ///
-  Status GetDynamicBatchInfo(std::vector<std::vector<int64_t>> &batch_info, int32_t &dynamic_type) const;
+  Status GetDynamicBatchInfo(vector<vector<int64_t>> &batch_info, int32_t &dynamic_type) const;
 
   ///
   /// @ingroup ge
@@ -306,13 +343,13 @@ class DavinciModel {
   /// @param [out] batch_info
   /// @return None
   ///
-  void GetCombinedDynamicDims(std::vector<std::vector<int64_t>> &batch_info) const;
+  void GetCombinedDynamicDims(vector<vector<int64_t>> &batch_info) const;
 
-  void GetUserDesignateShapeOrder(std::vector<std::string> &user_input_shape_order) const;
+  void GetUserDesignateShapeOrder(vector<string> &user_input_shape_order) const;
 
-  void GetCurShape(std::vector<int64_t> &batch_info, int32_t &dynamic_type);
+  void GetCurShape(vector<int64_t> &batch_info, int32_t &dynamic_type) const;
 
-  void GetModelAttr(std::vector<std::string> &dynamic_output_shape_info);
+  void GetModelAttr(vector<string> &dynamic_output_shape_info) const;
 
   ///
   /// @ingroup ge
@@ -321,9 +358,9 @@ class DavinciModel {
   /// @param [out] aipp_info
   /// @return execute result
   ///
-  Status GetAIPPInfo(uint32_t index, AippConfigInfo &aipp_info);
+  Status GetAippInfo(uint32_t index, AippConfigInfo &aipp_info) const;
 
-  Status GetAippType(uint32_t index, InputAippType &type, size_t &aipp_index);
+  Status GetAippType(uint32_t index, InputAippType &type, size_t &aipp_index) const;
 
   ///
   /// @ingroup ge
@@ -339,18 +376,7 @@ class DavinciModel {
   /// @param [in] string identification: unique identification for current op.
   /// @return None
   ///
-  void GetUniqueId(const OpDescPtr &op_desc, std::string &unique_identification);
-
-  ///
-  /// @ingroup ge
-  /// @brief get model input and output desc for zero copy
-  /// @param [out] input_shape  model input size
-  /// @param [out] output_shape model output size
-  /// @return execute result
-  ///
-  Status GetInputOutputDescInfoForZeroCopy(vector<InputOutputDescInfo> &input_desc,
-                                           vector<InputOutputDescInfo> &output_desc,
-                                           std::vector<uint32_t> &inputFormats, std::vector<uint32_t> &output_formats);
+  void GetUniqueId(const OpDescPtr &op_desc, string &unique_identification);
 
   Status ReturnResult(uint32_t data_id, const bool rslt_flg, const bool seq_end_flg, OutputData *output_data);
 
@@ -372,8 +398,6 @@ class DavinciModel {
   ///
   bool RunFlag() const { return run_flg_; }
 
-  Status GetOutputDescInfo(vector<InputOutputDescInfo> &output_desc, std::vector<uint32_t> &formats);
-
   ///
   /// @ingroup ge
   /// @brief Set Session Id
@@ -419,14 +443,14 @@ class DavinciModel {
   /// @ingroup ge
   /// @brief Save outside address of Data or NetOutput used info for ZeroCopy.
   /// @param [in] const OpDescPtr &op_desc: current op desc
-  /// @param [in] const std::vector<void *> &outside_addrs: address of task
+  /// @param [in] const vector<void *> &outside_addrs: address of task
   /// @param [in] const void *args_offset: arguments address save the address.
   /// @return None.
   ///
-  void SetZeroCopyAddr(const OpDescPtr &op_desc, const std::vector<void *> &outside_addrs, const void *info, void *args,
+  void SetZeroCopyAddr(const OpDescPtr &op_desc, const vector<void *> &outside_addrs, const void *info, void *args,
                        size_t size, size_t offset);
 
-  void SetDynamicSize(const std::vector<uint64_t> &batch_num, int32_t dynamic_type);
+  void SetDynamicSize(const vector<uint64_t> &batch_num, int32_t dynamic_type);
 
   bool GetL1FusionEnableOption() { return is_l1_fusion_enable_; }
 
@@ -436,26 +460,26 @@ class DavinciModel {
 
   int64_t GetLoadEndTime() { return load_end_time_; }
 
-  Status SinkModelProfile();
-
-  Status SinkTimeProfile(const InputData &current_data);
-
-  Status ReportProfilingData(bool check_device = true);
+  Status ReportProfilingData();
 
   void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id) {
     data_dumper_.SaveDumpOpInfo(model_param, op, task_id, stream_id);
   }
 
-  void SaveDumpTask(uint32_t task_id, uint32_t stream_id, const std::shared_ptr<OpDesc> &op_desc, uintptr_t args) {
+  void SaveDumpTask(uint32_t task_id, uint32_t stream_id, const shared_ptr<OpDesc> &op_desc, uintptr_t args) {
     data_dumper_.SaveDumpTask(task_id, stream_id, op_desc, args);
   }
 
+  void DumperShrink() {
+    data_dumper_.DumpShrink();
+  }
+
   void SetEndGraphId(uint32_t task_id, uint32_t stream_id);
   DavinciModel &operator=(const DavinciModel &model) = delete;
 
   DavinciModel(const DavinciModel &model) = delete;
 
-  const map<int64_t, std::vector<rtStream_t>> &GetHcclFolowStream() {
+  const map<int64_t, vector<rtStream_t>> &GetHcclFolowStream() {
     return main_follow_stream_mapping_;
   }
   void SaveHcclFollowStream(int64_t main_stream_id, rtStream_t stream);
@@ -473,8 +497,14 @@ class DavinciModel {
     void *cur_args = static_cast<char *>(args_) + offset;
     return cur_args;
   }
-  void SetTotalIOAddrs(vector<void *> &io_addrs) {
-    total_io_addrs_.insert(total_io_addrs_.end(), io_addrs.begin(), io_addrs.end());
+  void SetTotalIOAddrs(const vector<void *> &io_addrs);
+  void SetHybridArgsSize(uint32_t args_size) { total_hybrid_args_size_ += args_size; }
+  uint32_t GetHybridArgsSize() {
+    return total_hybrid_args_size_;
+  }
+  void *GetCurrentHybridArgsAddr(uint32_t offset) {
+    void *cur_args = static_cast<char *>(hybrid_addrs_) + offset;
+    return cur_args;
   }
   void SetTotalFixedAddrsSize(string tensor_name, int64_t fix_addr_size);
   int64_t GetFixedAddrsSize(string tensor_name);
@@ -494,13 +524,13 @@ class DavinciModel {
   Status MallocKnownArgs();
   Status UpdateKnownNodeArgs(const vector<void *> &inputs, const vector<void *> &outputs);
   Status CreateKnownZeroCopyMap(const vector<void *> &inputs, const vector<void *> &outputs);
-  Status UpdateKnownZeroCopyAddr();
+  Status UpdateKnownZeroCopyAddr(vector<void *> &total_io_addrs, bool update_args = true);
   void SetKnownNodeAddrNotChanged(bool base_addr_not_changed) { base_addr_not_changed_ = base_addr_not_changed; }
 
-  Status GetOrigInputInfo(uint32_t index, OriginInputInfo &orig_input_info);
-  Status GetAllAippInputOutputDims(uint32_t index, std::vector<InputOutputDims> &input_dims,
-                                   std::vector<InputOutputDims> &output_dims);
-  void SetModelDescVersion(bool is_new_model_desc) { is_new_model_desc_ = is_new_model_desc; }
+  Status GetOrigInputInfo(uint32_t index, OriginInputInfo &orig_input_info) const;
+  Status GetAllAippInputOutputDims(uint32_t index, vector<InputOutputDims> &input_dims,
+                                   vector<InputOutputDims> &output_dims) const;
+
   // om file name
   void SetOmName(string om_name) { om_name_ = om_name; }
 
@@ -510,13 +540,13 @@ class DavinciModel {
   bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const {
     return data_dumper_.GetOpDescInfo(stream_id, task_id, op_desc_info);
   }
-  Status InitInputOutputForDynamic(const ComputeGraphPtr &compute_graph);
 
  private:
   // memory address of weights
   uint8_t *weights_mem_base_;
   uint8_t *var_mem_base_;
   // memory address of model
+  uintptr_t fixed_mem_base_;  // Initial of mem_base_, keep forever.
   uint8_t *mem_base_;
   uint8_t *p2p_mem_base_;
   bool is_inner_mem_base_;
@@ -529,14 +559,7 @@ class DavinciModel {
   struct timeInfo time_info_;
   int32_t dataInputTid;
 
-  ///
-  /// @ingroup ge
-  /// @brief Save Batch label Info.
-  /// @param [in] const OpDescPtr &op_desc
-  /// @param [in] uintptr_t addr: address value in args block.
-  /// @return None.
-  ///
-  void SetBatchLabelAddr(const OpDescPtr &op_desc, uintptr_t addr);
+  void *GetRunAddress(void *addr) const;
 
   ///
   /// @ingroup ge
@@ -575,7 +598,7 @@ class DavinciModel {
   /// @param [in] batch_label: batch label for multi-batch scenes
   /// @return SUCCESS handle successfully / others handle failed
   ///
-  Status UpdateIoTaskArgs(const std::map<uint32_t, ZeroCopyOffset> &data_info, bool is_input,
+  Status UpdateIoTaskArgs(const map<uint32_t, ZeroCopyOffset> &data_info, bool is_input,
                           const vector<DataBuffer> &blobs, bool is_dynamic, const string &batch_label);
 
   Status CopyInputData(const InputData &input_data, bool device_data = false);
@@ -587,11 +610,12 @@ class DavinciModel {
   Status InitWeightMem(void *dev_ptr, void *weight_ptr, size_t weight_size);
   Status InitFeatureMapAndP2PMem(void *dev_ptr, size_t mem_size);
 
-  void CreateInputDimsInfo(const OpDescPtr &op_desc, Format format, InputOutputDescInfo &input);
+  void CreateInputDimsInfo(const OpDescPtr &op_desc, Format format, ShapeDescription &shape1, ShapeDescription &shape2);
 
-  void SetInputDimsInfo(const vector<int64_t> &model_input_dims, Format &format, InputOutputDescInfo &input);
+  void SetInputDimsInfo(const vector<int64_t> &input_dims, Format &format, ShapeDescription &shape_info);
 
-  Status GetInputDescInfo(vector<InputOutputDescInfo> &input_desc, std::vector<uint32_t> &formats);
+  Status GetInputDescInfo(vector<InputOutputDescInfo> &input_desc, vector<uint32_t> &input_formats, bool by_dims) const;
+  Status GetOutputDescInfo(vector<InputOutputDescInfo> &output_desc, vector<uint32_t> &output_formats) const;
 
   Status InitTaskInfo(domi::ModelTaskDef &modelTaskInfo);
 
@@ -603,7 +627,7 @@ class DavinciModel {
 
   uint8_t *MallocWeightsMem(size_t weights_size);
 
-  uint8_t* MallocP2PMem(size_t p2p_data_size);
+  uint8_t *MallocP2PMem(size_t p2p_data_size);
 
   void FreeFeatureMapMem();
 
@@ -613,6 +637,8 @@ class DavinciModel {
 
   void ReleaseTask();
 
+  void ClearTaskAddrs();
+
   void UnbindTaskSinkStream();
 
   bool IsAicpuKernelConnectSpecifiedLayer();
@@ -635,59 +661,33 @@ class DavinciModel {
   ///
   /// @ingroup ge
   /// @brief Data Op Initialize.
+  /// @param [in] ComputeGraphPtr: root graph of the model.
   /// @param [in] NodePtr: Data Op.
-  /// @param [in/out] data_op_index: NetOutput addr size info.
+  /// @param [in/out] data_op_index: index of courrent count.
+  /// @param [in/out] data_by_index: Data ordered by index.
   /// @return Status
   ///
-  Status InitDataOp(const NodePtr &node, uint32_t &data_op_index, map<uint32_t, OpDescPtr> &data_by_index);
+  Status InitDataOp(const ComputeGraphPtr &graph, const NodePtr &node, uint32_t &data_op_index,
+                    map<uint32_t, OpDescPtr> &data_by_index);
 
   ///
   /// @ingroup ge
   /// @brief Sort Data op list by index.
   /// @param [in] data_by_index: map of Data Op.
-  /// @return
-  ///
-  void AdjustDataOpList(const map<uint32_t, OpDescPtr> &data_by_index);
-
-  ///
-  /// @ingroup ge
-  /// @brief input zero copy node Initialize.
-  /// @param [in] NodePtr: Data Op.
+  /// @param [in] output_op_list: list of NetOutput op.
   /// @return Status
   ///
-  Status InitInputZeroCopyNodes(const NodePtr &node);
+  Status GenInputOutputInfo(const map<uint32_t, OpDescPtr> &data_by_index, const vector<OpDescPtr> &output_op_list);
 
   ///
   /// @ingroup ge
   /// @brief NetOutput Op Initialize.
+  /// @param [in] ComputeGraphPtr: root graph of the model.
   /// @param [in] NodePtr: NetOutput Op.
+  /// @param [in/out] vector<OpDescPtr>: All NetOutput node in model.
   /// @return Status
   ///
-  Status InitNetOutput(const NodePtr &node);
-
-  ///
-  /// @ingroup ge
-  /// @brief output zero copy node Initialize.
-  /// @param [in] NodePtr: Data Op.
-  /// @return Status
-  ///
-  Status InitOutputZeroCopyNodes(const NodePtr &node);
-
-  ///
-  /// @ingroup ge
-  /// @brief input zero copy node Initialize for Case.
-  /// @param [in] NodePtr: Data Op.
-  /// @return Status
-  ///
-  Status InitInputBatchLabel(const NodePtr &node);
-
-  ///
-  /// @ingroup ge
-  /// @brief output zero copy node Initialize for Case.
-  /// @param [in] NodePtr: netoutput Op.
-  /// @return Status
-  ///
-  Status InitOutputBatchLabel(const NodePtr &node);
+  Status InitNetOutput(const ComputeGraphPtr &graph, const NodePtr &node, vector<OpDescPtr> &output_op_list);
 
   ///
   /// @ingroup ge
@@ -696,7 +696,7 @@ class DavinciModel {
   ///
   Status InitConstant(const OpDescPtr &op_desc);
 
-  Status InitVariable(const OpDescPtr &op_desc);
+  Status InitVariable(const OpDescPtr &op_desc, map<string, OpDescPtr> &variable_by_name);
 
   /// @ingroup ge
   /// @brief LabelSet Op Initialize.
@@ -726,7 +726,7 @@ class DavinciModel {
   ///
   Status InitTbeHandle(const OpDescPtr &op_desc);
 
-  void StoreTbeHandle(const std::string &handle_key);
+  void StoreTbeHandle(const string &handle_key);
   void CleanTbeHandle();
 
   ///
@@ -757,7 +757,7 @@ class DavinciModel {
   ///
   Status BindInputQueue();
 
-  Status CpuTaskModelZeroCopy(std::vector<uintptr_t> &mbuf_list, std::map<const void *, ZeroCopyOffset> &outside_addrs);
+  Status CpuTaskModelZeroCopy(vector<uintptr_t> &mbuf_list, map<const void *, ZeroCopyOffset> &outside_addrs);
 
   ///
   /// @ingroup ge
@@ -828,98 +828,108 @@ class DavinciModel {
 
   Status DoTaskSink();
 
-  void CreateOutput(uint32_t index, OpDescPtr &op_desc, InputOutputDescInfo &output, uint32_t &format_result);
+  void CreateOutput(uint32_t index, const OpDescPtr &op_desc, InputOutputDescInfo &output, uint32_t &format_result);
 
   Status TransAllVarData(ComputeGraphPtr &graph, uint32_t graph_id);
 
   // get desc info of graph for profiling
   Status GetComputeGraphInfo(vector<ComputeGraphDescInfo> &graph_desc_info);
 
-  void SetDataDumperArgs(const ComputeGraphPtr &compute_graph);
+  void SetDataDumperArgs(const ComputeGraphPtr &graph, const map<string, OpDescPtr> &variable_by_name);
+
+  Status InitL1DataDumperArgs();
 
-  Status GenOutputTensorInfo(const OpDescPtr &op_desc, uint32_t data_index, OutputData *output_data,
-                             std::vector<ge::OutputTensorInfo> &outputs);
+  Status InitModelProfile();
+  Status SinkModelProfile();
+
+  Status SinkTimeProfile(const InputData &current_data);
 
-  void ParseAIPPInfo(std::string in_out_info, InputOutputDims &dims_info);
+  Status InitOutputTensorInfo(const OpDescPtr &op_desc);
+  Status GenOutputTensorInfo(OutputData *output_data, vector<OutputTensorInfo> &outputs);
+
+  Status InitInputDescInfo(const map<uint32_t, OpDescPtr> &data_by_index);
+  Status InitOutputDescInfo(const vector<OpDescPtr> &output_op_list);
+
+  Status InitOrigInputInfo(uint32_t index, const OpDescPtr &op_desc);
+  Status InitAippInfo(uint32_t index, const OpDescPtr &op_desc);
+  Status InitAippType(uint32_t index, const OpDescPtr &op_desc, const map<uint32_t, OpDescPtr> &data_list);
+  Status InitAippInputOutputDims(uint32_t index, const OpDescPtr &op_desc);
+
+  void ParseAIPPInfo(string in_out_info, InputOutputDims &dims_info);
   void SetLabelForDynamic(const NodePtr &node);
 
-  void ParseDynamicOutShape(const std::vector<std::string> &str_info, std::vector<vector<int64_t>> &vec_info);
+  void ParseDynamicOutShape(const vector<string> &str_info, vector<vector<int64_t>> &vec_info);
   bool IsGetNextSinkDynamic(const OpDescPtr &op_desc);
+
+  Status InitRealSizeAndShapeInfo(const ComputeGraphPtr &compute_graph, const NodePtr &node);
   void GetAllGearsInfo(const NodePtr &node);
   Status GetGetDynamicDimsNodeInfo(const NodePtr &node);
-  Status GetGearAndRealOutSizeInfo(size_t input_count, const NodePtr &node);
-  Status GetRealOutputSizeOfMerge(size_t input_index, const NodePtr &merge_node);
-  Status GetGearAndRealOutShapeInfo(size_t input_count, const OpDescPtr &op_desc);
+  Status GetGearAndRealOutSizeInfo(const ComputeGraphPtr &graph, const NodePtr &node);
+  Status GetRealOutputSizeOfCase(const ComputeGraphPtr &graph, size_t input_index, const NodePtr &case_node);
+  Status GetGearAndRealOutShapeInfo(const ComputeGraphPtr &graph, const NodePtr &node);
 
   bool is_weight_mem_has_inited_;
   bool is_feature_map_mem_has_inited_;
 
   uint32_t model_id_;
   uint32_t runtime_model_id_;
+  uint32_t sub_model_id_ = 0;
   string name_;
 
   // used for inference data dump
   string om_name_;
 
   uint32_t version_;
-  GeModelPtr ge_model_;
+  GeModelPtr ge_model_;  // release after DavinciModel::Init
 
   bool need_destroy_aicpu_kernel_{false};
-  vector<std::string> out_node_name_;
-
-  map<uint32_t, OpDescPtr> op_list_;
+  vector<string> out_node_name_;
 
-  // data op_desc
-  vector<OpDescPtr> data_op_list_;
+  map<uint32_t, OpDescPtr> op_list_;  // release after DavinciModel::Init
 
-  vector<OpDescPtr> output_op_list_;
+  map<string, GeTensorDesc> broadcast_variable_;
+  void *global_step_addr_{nullptr};
+  uint64_t global_step_size_{0};
 
-  vector<OpDescPtr> variable_op_list_;
+  map<uint32_t, ZeroCopyOffset> new_input_data_info_;
+  map<uint32_t, ZeroCopyOffset> new_output_data_info_;
+  map<const void *, ZeroCopyOffset> new_input_outside_addrs_;
+  map<const void *, ZeroCopyOffset> new_output_outside_addrs_;
 
-  std::map<uint32_t, ZeroCopyOffset> new_input_data_info_;
-  std::map<uint32_t, ZeroCopyOffset> new_output_data_info_;
-  std::map<const void *, ZeroCopyOffset> new_input_outside_addrs_;
-  std::map<const void *, ZeroCopyOffset> new_output_outside_addrs_;
-
-  std::set<const void *> real_virtual_addrs_;
+  set<const void *> real_virtual_addrs_;
 
   // output op: save cce op actual needed memory size
   vector<int64_t> output_memory_size_list_;
 
-  std::thread thread_id_;
+  thread thread_id_;
 
-  std::shared_ptr<ModelListener> listener_;
+  shared_ptr<ModelListener> listener_;
 
   bool run_flg_;
 
-  std::mutex mux_run_flg_;
+  mutex mux_run_flg_;
 
   int32_t priority_;
 
   vector<rtStream_t> stream_list_;
 
-  std::mutex all_hccl_stream_list_mutex_;
+  mutex all_hccl_stream_list_mutex_;
   vector<rtStream_t> all_hccl_stream_list_;
 
   // for reuse hccl_follow_stream
-  std::mutex capacity_of_stream_mutex_;
-  std::map<int64_t, std::vector<rtStream_t>> main_follow_stream_mapping_;
+  mutex capacity_of_stream_mutex_;
+  map<int64_t, vector<rtStream_t>> main_follow_stream_mapping_;
 
   vector<rtEvent_t> event_list_;
 
   vector<rtLabel_t> label_list_;
   set<uint32_t> label_id_indication_;
 
-  std::mutex outside_addrs_mutex_;
-  std::vector<ZeroCopyTask> zero_copy_tasks_;  // Task used Data or NetOutput addr.
-  std::set<const void *> copy_only_addrs_;     // Address need copy to original place.
-
-  // {op_id, batch_label}
-  std::map<int64_t, std::string> zero_copy_op_id_batch_label_;
-  // {batch_label, addrs}
-  std::map<std::string, std::set<uintptr_t>> zero_copy_batch_label_addrs_;
+  mutex outside_addrs_mutex_;
+  vector<ZeroCopyTask> zero_copy_tasks_;  // Task used Data or NetOutput addr.
+  set<const void *> copy_only_addrs_;     // Address need copy to original place.
 
-  std::vector<TaskInfoPtr> task_list_;
+  vector<TaskInfoPtr> task_list_;
   // rt_moodel_handle
   rtModel_t rt_model_handle_;
 
@@ -937,39 +947,41 @@ class DavinciModel {
   rtAicpuDeployType_t deploy_type_{AICPU_DEPLOY_RESERVED};
 
   // ACL queue schedule, save queue ids for Init.
-  std::vector<TaskInfoPtr> cpu_task_list_;
-  std::vector<uint32_t> input_queue_ids_;    // input queue ids created by caller.
-  std::vector<uint32_t> output_queue_ids_;   // output queue ids created by caller.
-  std::vector<uintptr_t> input_mbuf_list_;   // input mbuf created by dequeue task.
-  std::vector<uintptr_t> output_mbuf_list_;  // output mbuf created by dequeue task.
+  vector<TaskInfoPtr> cpu_task_list_;
+  vector<uint32_t> input_queue_ids_;    // input queue ids created by caller.
+  vector<uint32_t> output_queue_ids_;   // output queue ids created by caller.
+  vector<uintptr_t> input_mbuf_list_;   // input mbuf created by dequeue task.
+  vector<uintptr_t> output_mbuf_list_;  // output mbuf created by dequeue task.
 
   uint64_t session_id_;
 
   uint32_t device_id_;
 
-  std::mutex flowctrl_op_index_internal_map_mutex_;
-  std::map<uint32_t, uint32_t> flowctrl_op_index_internal_map_;
+  mutex flowctrl_op_index_internal_map_mutex_;
+  map<uint32_t, uint32_t> flowctrl_op_index_internal_map_;
 
-  std::vector<rtStream_t> active_stream_list_;
-  std::set<uint32_t> active_stream_indication_;
+  vector<rtStream_t> active_stream_list_;
+  set<uint32_t> active_stream_indication_;
 
-  std::set<uint32_t> hcom_streams_;
+  set<uint32_t> hcom_streams_;
   RuntimeParam runtime_param_;
 
-  static std::mutex tvm_bin_mutex_;
-  std::set<std::string> tvm_bin_kernel_;
+  static mutex tvm_bin_mutex_;
+  set<string> tvm_bin_kernel_;
 
-  std::map<std::string, uint32_t> used_tbe_handle_map_;
+  map<string, uint32_t> used_tbe_handle_map_;
 
   // for profiling task and graph info
-  std::vector<TaskDescInfo> task_desc_info_;
+  vector<TaskDescInfo> task_desc_info_;
+
+  std::map<std::string, std::pair<uint32_t, uint32_t>> profiler_report_op_info_;
 
   int64_t maxDumpOpNum_;
   // for data dump
   DataDumper data_dumper_;
   uint64_t iterator_count_;
   bool is_l1_fusion_enable_;
-  std::map<OpDescPtr, void *> saved_task_addrs_;
+  map<OpDescPtr, void *> saved_task_addrs_;  // release after DavinciModel::Init
   void *l1_fusion_addr_ = nullptr;
 
   bool known_node_ = false;
@@ -977,15 +989,17 @@ class DavinciModel {
   void *args_ = nullptr;
   void *args_host_ = nullptr;
   void *fixed_addrs_ = nullptr;
+  void *hybrid_addrs_ = nullptr;
+  uint32_t total_hybrid_args_size_ = 0;
   int64_t total_fixed_addr_size_ = 0;
-  std::map<const void *, void *> knonw_input_data_info_;
-  std::map<const void *, void *> knonw_output_data_info_;
+  map<const void *, void *> known_input_data_info_;
+  map<const void *, void *> known_output_data_info_;
   vector<void *> total_io_addrs_;
   vector<void *> orig_total_io_addrs_;
   bool base_addr_not_changed_ = false;
 
   vector<vector<int64_t>> batch_info_;
-  std::vector<std::vector<int64_t>> combined_batch_info_;
+  vector<vector<int64_t>> combined_batch_info_;
   vector<string> user_designate_shape_order_;
   int32_t dynamic_type_ = 0;
   bool is_dynamic_ = false;
@@ -993,29 +1007,54 @@ class DavinciModel {
   vector<uint64_t> batch_size_;
   // key: input tensor name, generally rts op;
   // value: the fixed addr of input anchor, same as the peer output anchor addr of the peer op
-  std::map<string, int64_t> tensor_name_to_fixed_addr_size_;
+  map<string, int64_t> tensor_name_to_fixed_addr_size_;
 
   // key: input tensor name, generally rts op; value: the peer output anchor of the peer op
-  std::map<string, int64_t> tensor_name_to_peer_output_index_;
+  map<string, int64_t> tensor_name_to_peer_output_index_;
   // if model is first execute
   bool is_first_execute_;
   // for op debug
-  std::mutex debug_reg_mutex_;
+  mutex debug_reg_mutex_;
   bool is_op_debug_reg_ = false;
   void *op_debug_addr_ = nullptr;
   void *p2p_debug_addr_ = nullptr;
-  bool is_new_model_desc_{false};
   bool is_online_infer_dynamic_ = false;
   bool is_getnext_sink_dynamic_ = false;
-  std::vector<int64_t> cur_dynamic_dims_;
+  vector<int32_t> cur_dynamic_dims_;
   void *netoutput_last_input_addr_ = nullptr;
   int64_t netoutput_last_input_size_ = 0;
   size_t shape_of_cur_dynamic_dims_ = 0;
   // key: input_index: input is merge node; value: each gear info and each output size
-  std::map<size_t, std::map<vector<int64_t>, int64_t>> merge_nodes_gear_and_real_out_size_info_;
+  map<size_t, map<vector<int32_t>, int64_t>> merge_nodes_gear_and_real_out_size_info_;
   // key: input_index: input is merge node; value: each gear info and each output shape
-  std::map<size_t, std::map<vector<int64_t>, vector<int64_t>>> merge_nodes_gear_and_real_out_shape_info_;
-  std::vector<std::vector<int64_t>> all_gears_info_;
+  map<size_t, map<vector<int32_t>, vector<int64_t>>> merge_nodes_gear_and_real_out_shape_info_;
+  vector<vector<int32_t>> all_gears_info_;
+
+  multimap<uint32_t, uint32_t> op_id_map_;
+  vector<ProfileInfo> profile_list_;
+
+  // For super kernel.
+  SuperKernelTaskInfo skt_info_;
+
+  bool is_dynamic_aipp_ = false;
+  vector<string> dynamic_output_shape_info_;
+
+  vector<vector<void *>> input_addrs_list_;
+  vector<vector<void *>> output_addrs_list_;
+
+  vector<int64_t> output_buffer_size_;
+  vector<GeShape> output_shape_info_;
+
+  map<uint32_t, OriginInputInfo> orig_input_info_;
+  map<uint32_t, AippConfigInfo> aipp_info_list_;
+  map<uint32_t, pair<InputAippType, size_t>> aipp_type_list_;
+  map<uint32_t, pair<vector<InputOutputDims>, vector<InputOutputDims>>> aipp_dims_info_;
+
+  vector<InputOutputDescInfo> input_descs_;
+  vector<InputOutputDescInfo> input_descs_dims_;
+  vector<uint32_t> input_formats_;
+  vector<InputOutputDescInfo> output_descs_;
+  vector<uint32_t> output_formats_;
 };
 }  // namespace ge
 #endif  // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_DAVINCI_MODEL_H_
diff --git a/ge/graph/load/new_model_manager/davinci_model_parser.cc b/ge/graph/load/new_model_manager/davinci_model_parser.cc
index 34180d08..76526de2 100644
--- a/ge/graph/load/new_model_manager/davinci_model_parser.cc
+++ b/ge/graph/load/new_model_manager/davinci_model_parser.cc
@@ -16,82 +16,7 @@
 
 #include "graph/load/new_model_manager/davinci_model_parser.h"
 
-#include <fstream>
-#include <memory>
-#include <vector>
-#include "securec.h"
-
-#include "common/debug/log.h"
-#include "graph/load/new_model_manager/davinci_model.h"
-
 namespace ge {
-FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelInfoParser(const ModelData &model, ModelInfo &model_info) {
-  GE_CHK_RT_RET(rtSetDevice(0));
-  try {
-    uint32_t model_len = 0;
-    uint8_t *model_data = nullptr;
-
-    Status ret = DavinciModelParser::ParseModelContent(model, model_data, model_len);
-
-    GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, GE_CHK_RT(rtDeviceReset(0)); return ret, "Parse model failed");
-
-    auto *file_header = reinterpret_cast<ModelFileHeader *>(model.model_data);
-
-    GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(file_header == nullptr, GE_CHK_RT(rtDeviceReset(0));
-                                   return PARAM_INVALID, "file_header is null.");
-
-    model_info.version = file_header->version;
-    model_info.is_encrypt = false;
-    GE_IF_BOOL_EXEC(ENCRYPTED == file_header->is_encrypt, model_info.is_encrypt = true);
-
-    std::shared_ptr<DavinciModel> davinci_model =
-      std::shared_ptr<DavinciModel>(new (std::nothrow) DavinciModel(model.priority, nullptr));
-
-    GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(davinci_model == nullptr, GE_CHK_RT(rtDeviceReset(0));
-                                   return PARAM_INVALID, "davinci_model is null.");
-
-    GE_MAKE_GUARD(davinci_model, [&] { davinci_model = nullptr; });
-
-    ModelHelper model_helper;
-    ret = model_helper.LoadModel(model);
-    GE_CHK_BOOL_TRUE_EXEC_WITH_LOG((ret != SUCCESS), GE_CHK_RT(rtDeviceReset(0)); return FAILED, "load model failed");
-
-    ret = davinci_model->Assign(model_helper.GetGeModel());
-    GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, GE_CHK_RT(rtDeviceReset(0));
-                                   return ret, "Parse davinci model data failed");
-
-    ret = davinci_model->Init();
-
-    GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, GE_CHK_RT(rtDeviceReset(0));
-                                   return ret, "Davinci model init failed");
-
-    vector<InputOutputDescInfo> input_list;
-    vector<InputOutputDescInfo> output_list;
-
-    ret = davinci_model->GetInputOutputDescInfo(input_list, output_list);
-
-    GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, GE_CHK_RT(rtDeviceReset(0));
-                                   return ret, "Davinci model GetInputOutputDescInfo failed");
-
-    for (const auto &desc : input_list) {
-      model_info.input_desc.push_back(desc.shape_info);
-    }
-    for (const auto &desc : output_list) {
-      model_info.output_desc.push_back(desc.shape_info);
-    }
-
-    model_info.name = davinci_model->Name();
-  } catch (...) {
-    DOMI_LOGE("OM model parser failed, some exceptions occur !");
-    GE_CHK_RT(rtDeviceReset(0));
-    return FAILED;
-  }
-
-  GE_CHK_RT(rtDeviceReset(0));
-
-  return SUCCESS;
-}
-
 DavinciModelParser::DavinciModelParser() {}
 
 DavinciModelParser::~DavinciModelParser() {}
diff --git a/ge/graph/load/new_model_manager/model_manager.cc b/ge/graph/load/new_model_manager/model_manager.cc
index d6cdf42d..edc60e50 100755
--- a/ge/graph/load/new_model_manager/model_manager.cc
+++ b/ge/graph/load/new_model_manager/model_manager.cc
@@ -18,6 +18,8 @@
 
 #include <string>
 
+#include "mmpa/mmpa_api.h"
+#include "aicpu/aicpu_schedule/aicpu_op_type_list.h"
 #include "common/dump/dump_manager.h"
 #include "common/l2_cache_optimize.h"
 #include "common/profiling/profiling_manager.h"
@@ -30,6 +32,7 @@
 #include "graph/load/new_model_manager/davinci_model_parser.h"
 #include "model/ge_root_model.h"
 #include "graph/common/local_context.h"
+#include "graph/utils/attr_utils.h"
 #include "common/formats/utils/formats_trans_utils.h"
 #include "hybrid/hybrid_davinci_model.h"
 
@@ -40,9 +43,7 @@ const int kCmdParSize = 2;
 const int kDumpCmdPairSize = 2;
 const std::size_t kProfCmdParaMaxSize = 1000;
 const std::size_t kProfStartCmdParaSize = 2;
-const std::string kCmdTypeProfile = "profile";
 const std::string kCmdTypeDump = "dump";
-const std::string kCmdTypeProfiling = "profiling";
 const std::string kCmdTypeProfInit = "prof_init";
 const std::string kCmdTypeProfFinalize = "prof_finalize";
 const std::string kCmdTypeProfStart = "prof_start";
@@ -51,6 +52,9 @@ const std::string kCmdTypeProfModelSubscribe = "prof_model_subscribe";
 const std::string kCmdTypeProfModelUnsubscribe = "prof_model_cancel_subscribe";
 const char *const kBatchLoadBuf = "batchLoadsoFrombuf";
 const char *const kDeleteCustOp = "deleteCustOp";
+const int kTimeSpecNano = 1000000000;
+const int kTimeSpecMiro = 1000000;
+const int kOpNameMaxSize = 100;
 struct CustAicpuSoBuf {
   uint64_t kernelSoBuf;
   uint32_t kernelSoBufLen;
@@ -77,7 +81,8 @@ ModelManager::ModelManager() {
   session_id_bias_ = 0;
 }
 
-Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, uint64_t session_id, uint32_t model_id) {
+Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, uint64_t session_id, uint32_t model_id,
+                                    uint32_t sub_model_id) {
   STR_FWK_OP_KERNEL param_base = {};
   void *devicebase = nullptr;
   void *aicpu_kernel_addr = nullptr;
@@ -87,10 +92,12 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u
   param_base.fwkKernelBase.fwk_kernel.sessionID = session_id;
   if (op_type == aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_KERNEL_DESTROY) {
     std::vector<uint64_t> v_aicpu_kernel;
-    std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id);
+    std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id) + "_" +
+                            std::to_string(sub_model_id);
+    std::lock_guard<std::recursive_mutex> lock(map_mutex_);
     auto iter = model_aicpu_kernel_.find(model_key);
     if (iter != model_aicpu_kernel_.end()) {
-      GELOGD("kernel destroy session_id %lu, model_id %u.", session_id, model_id);
+      GELOGD("kernel destroy session_id %lu, model_id %u, sub_model_id %u..", session_id, model_id, sub_model_id);
       v_aicpu_kernel = model_aicpu_kernel_.at(model_key);
       // Insert size of aicpu kernel vector in the first element
       v_aicpu_kernel.insert(v_aicpu_kernel.begin(), v_aicpu_kernel.size());
@@ -175,7 +182,7 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u
 }
 
 void ModelManager::DestroyAicpuSession(uint64_t session_id) {
-  std::lock_guard<std::mutex> lock(sess_ids_mutex_);
+  std::lock_guard<std::recursive_mutex> lock(map_mutex_);
   auto it = sess_ids_.find(session_id);
   if (it == sess_ids_.end()) {
     GELOGI("The session: %lu not created.", session_id);
@@ -188,7 +195,7 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) {
       GE_CHK_RT(rtSetDevice(static_cast<int32_t>(GetContext().DeviceId())));
     }
 
-    Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_SESSION_DESTROY, session_id, 0);
+    Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_SESSION_DESTROY, session_id, 0, 0);
     if (ret != SUCCESS) {
       GELOGW("The session: %lu destroy failed.", session_id);
     } else {
@@ -204,7 +211,7 @@ void ModelManager::DestroyAicpuSession(uint64_t session_id) {
 }
 
 ge::Status ModelManager::DestroyAicpuSessionForInfer(uint32_t model_id) {
-  std::lock_guard<std::mutex> lock(map_mutex_);
+  std::lock_guard<std::recursive_mutex> lock(map_mutex_);
   auto hybrid_davinci_model = hybrid_model_map_.find(model_id);
   if (hybrid_davinci_model != hybrid_model_map_.end()) {
     uint64_t session_id = hybrid_davinci_model->second->GetSessionId();
@@ -214,20 +221,22 @@ ge::Status ModelManager::DestroyAicpuSessionForInfer(uint32_t model_id) {
 
   auto it = model_map_.find(model_id);
   if (it == model_map_.end()) {
-    GELOGE(GE_EXEC_MODEL_ID_INVALID, "model id %u does not exists.", model_id);
-    return GE_EXEC_MODEL_ID_INVALID;
+    GELOGE(ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, "model id %u does not exists.", model_id);
+    return ACL_ERROR_GE_EXEC_MODEL_ID_INVALID;
   }
   uint64_t session_id = it->second->GetSessionId();
   DestroyAicpuSession(session_id);
   return SUCCESS;
 }
 
-ge::Status ModelManager::DestroyAicpuKernel(uint64_t session_id, uint32_t model_id) {
+ge::Status ModelManager::DestroyAicpuKernel(uint64_t session_id, uint32_t model_id, uint32_t sub_model_id) {
   GELOGD("destroy aicpu kernel in session_id %lu, model_id %u.", session_id, model_id);
-  std::lock_guard<std::mutex> lock(sess_ids_mutex_);
-  std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id);
+  std::lock_guard<std::recursive_mutex> lock(map_mutex_);
+  std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id) + "_" +
+                          std::to_string(sub_model_id);
   if (model_aicpu_kernel_.find(model_key) != model_aicpu_kernel_.end()) {
-    Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_KERNEL_DESTROY, session_id, model_id);
+    Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_KERNEL_DESTROY, session_id, model_id,
+                                sub_model_id);
     if (ret != SUCCESS) {
       GELOGE(FAILED, "Destroy aicpu kernel failed.");
       return FAILED;
@@ -236,10 +245,12 @@ ge::Status ModelManager::DestroyAicpuKernel(uint64_t session_id, uint32_t model_
   return SUCCESS;
 }
 
-ge::Status ModelManager::CreateAicpuKernel(uint64_t session_id, uint32_t model_id, uint64_t kernel_id) {
-  std::lock_guard<std::mutex> lock(sess_ids_mutex_);
+ge::Status ModelManager::CreateAicpuKernel(uint64_t session_id, uint32_t model_id, uint32_t sub_model_id,
+                                           uint64_t kernel_id) {
+  std::lock_guard<std::recursive_mutex> lock(map_mutex_);
   std::vector<uint64_t> v_aicpu_kernel;
-  std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id);
+  std::string model_key = std::to_string(session_id) + "_" + std::to_string(model_id) + "_" +
+                          std::to_string(sub_model_id);
   if (model_aicpu_kernel_.find(model_key) != model_aicpu_kernel_.end()) {
     v_aicpu_kernel = model_aicpu_kernel_.at(model_key);
   }
@@ -249,7 +260,7 @@ ge::Status ModelManager::CreateAicpuKernel(uint64_t session_id, uint32_t model_i
 }
 
 ModelManager::~ModelManager() {
-  std::lock_guard<std::mutex> lock(map_mutex_);
+  std::lock_guard<std::recursive_mutex> lock(map_mutex_);
   model_map_.clear();
   model_aicpu_kernel_.clear();
   cust_aicpu_so_.clear();
@@ -345,7 +356,7 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptr<ge::Ge
 
     GELOGI("Parse model %u success.", model_id);
 
-    davinci_model->SetProfileTime(MODEL_LOAD_START, (timespec.tv_sec * 1000 * 1000 * 1000 +
+    davinci_model->SetProfileTime(MODEL_LOAD_START, (timespec.tv_sec * kTimeSpecNano +
                                                      timespec.tv_nsec));  // 1000 ^ 3 converts second to nanosecond
     davinci_model->SetProfileTime(MODEL_LOAD_END);
   } while (0);
@@ -357,24 +368,25 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptr<ge::Ge
 
 void ModelManager::InsertModel(uint32_t id, std::shared_ptr<DavinciModel> &davinci_model) {
   GE_CHK_BOOL_EXEC(davinci_model != nullptr, return, "davinci_model ptr is null, id: %u", id);
-  std::lock_guard<std::mutex> lock(map_mutex_);
+  std::lock_guard<std::recursive_mutex> lock(map_mutex_);
   model_map_[id] = davinci_model;
 }
 
 void ModelManager::InsertModel(uint32_t id, shared_ptr<hybrid::HybridDavinciModel> &hybrid_model) {
   GE_CHK_BOOL_EXEC(hybrid_model != nullptr, return, "hybrid_model ptr is null, id: %u", id);
-  std::lock_guard<std::mutex> lock(map_mutex_);
+  std::lock_guard<std::recursive_mutex> lock(map_mutex_);
   hybrid_model_map_[id] = hybrid_model;
 }
 
 Status ModelManager::DeleteModel(uint32_t id) {
-  std::lock_guard<std::mutex> lock(map_mutex_);
+  std::lock_guard<std::recursive_mutex> lock(map_mutex_);
 
   auto it = model_map_.find(id);
   auto hybrid_model_it = hybrid_model_map_.find(id);
   if (it != model_map_.end()) {
     uint64_t session_id = it->second->GetSessionId();
-    std::string model_key = std::to_string(session_id) + "_" + std::to_string(id);
+    std::string model_key = std::to_string(session_id) + "_" + std::to_string(id)  + "_" +
+                            std::to_string(it->second->SubModelId());
     auto iter_aicpu_kernel = model_aicpu_kernel_.find(model_key);
     if (iter_aicpu_kernel != model_aicpu_kernel_.end()) {
       (void)model_aicpu_kernel_.erase(iter_aicpu_kernel);
@@ -383,22 +395,22 @@ Status ModelManager::DeleteModel(uint32_t id) {
   } else if (hybrid_model_it != hybrid_model_map_.end()) {
     (void)hybrid_model_map_.erase(hybrid_model_it);
   } else {
-    GELOGE(GE_EXEC_MODEL_ID_INVALID, "model id %u does not exists.", id);
-    return GE_EXEC_MODEL_ID_INVALID;
+    GELOGE(ACL_ERROR_GE_EXEC_MODEL_ID_INVALID, "model id %u does not exists.", id);
+    return ACL_ERROR_GE_EXEC_MODEL_ID_INVALID;
   }
 
   return SUCCESS;
 }
 
 std::shared_ptr<DavinciModel> ModelManager::GetModel(uint32_t id) {
-  std::lock_guard<std::mutex> lock(map_mutex_);
+  std::lock_guard<std::recursive_mutex> lock(map_mutex_);
 
   auto it = model_map_.find(id);
   return (it == model_map_.end()) ? nullptr : it->second;
 }
 
 std::shared_ptr<hybrid::HybridDavinciModel> ModelManager::GetHybridModel(uint32_t id) {
-  std::lock_guard<std::mutex> lock(map_mutex_);
+  std::lock_guard<std::recursive_mutex> lock(map_mutex_);
 
   auto it = hybrid_model_map_.find(id);
   return (it == hybrid_model_map_.end()) ? nullptr : it->second;
@@ -455,8 +467,8 @@ Status ModelManager::DataInput(const InputData &input_data, OutputData &output_d
 
 Status ModelManager::GetCurDynamicDims(const vector<vector<int64_t>> &user_real_input_dims,
                                        const vector<pair<string, vector<int64_t>>> &user_input_dims,
-                                       vector<int64_t> &cur_dynamic_dims) {
-  GELOGD(" Start get cur dynamic dims.");
+                                       vector<int32_t> &cur_dynamic_dims) {
+  GELOGD("Start get cur dynamic dims.");
   if (user_real_input_dims.size() != user_input_dims.size()) {
     GELOGE(INTERNAL_ERROR,
            "The input count of user: %zu should be equal to the data count of graph: %zu",
@@ -473,7 +485,7 @@ Status ModelManager::GetCurDynamicDims(const vector<vector<int64_t>> &user_real_
     }
     for (size_t j = 0; j < user_input_dims.at(i).second.size(); ++j) {
       if (user_input_dims.at(i).second.at(j) < 0) {
-        cur_dynamic_dims.emplace_back(user_real_input_dims[i][j]);
+        cur_dynamic_dims.emplace_back(static_cast<int32_t>(user_real_input_dims[i][j]));
       }
     }
   }
@@ -518,7 +530,7 @@ Status ModelManager::DataInputTensor(uint32_t model_id, const std::vector<InputT
     input_data.blobs.push_back(data);
   }
   if (!GetLocalOmgContext().user_input_dims.empty() && GetLocalOmgContext().need_multi_batch) {
-    std::vector<int64_t> cur_dynamic_dims;
+    std::vector<int32_t> cur_dynamic_dims;
     if (!GetLocalOmgContext().user_real_input_dims.empty()) {
       if (GetCurDynamicDims(GetLocalOmgContext().user_real_input_dims, GetLocalOmgContext().user_input_dims,
                             cur_dynamic_dims) != SUCCESS) {
@@ -526,9 +538,9 @@ Status ModelManager::DataInputTensor(uint32_t model_id, const std::vector<InputT
         return INTERNAL_ERROR;
       }
       DataBuffer data;
-      data.data = new(std::nothrow) int64_t[cur_dynamic_dims.size()];
+      data.data = new(std::nothrow) int32_t[cur_dynamic_dims.size()];
       GE_CHECK_NOTNULL(data.data);
-      uint64_t length = static_cast<uint64_t>(cur_dynamic_dims.size() * sizeof(int64_t));
+      uint32_t length = static_cast<uint32_t>(cur_dynamic_dims.size() * sizeof(int32_t));
       GE_CHK_BOOL_EXEC(memcpy_s(data.data, length, cur_dynamic_dims.data(), length) == EOK, return INTERNAL_ERROR,
                        "Failed to memcpy data.");
       data.length = length;
@@ -629,8 +641,7 @@ Status ModelManager::Stop(uint32_t model_id) {
 ///
 Status ModelManager::HandleCommand(const Command &command) {
   static const std::map<std::string, std::function<uint32_t(const Command &)>> cmds = {
-      {kCmdTypeProfile, HandleProfileCommand}, {kCmdTypeDump, HandleDumpCommand},
-      {kCmdTypeProfiling, HandleAclProfilingCommand}, {kCmdTypeProfInit, HandleProfInitCommand},
+      {kCmdTypeDump, HandleDumpCommand}, {kCmdTypeProfInit, HandleProfInitCommand},
       {kCmdTypeProfFinalize, HandleProfFinalizeCommand}, {kCmdTypeProfStart, HandleProfStartCommand},
       {kCmdTypeProfStop, HandleProfStopCommand},
       {kCmdTypeProfModelSubscribe, HandleProfModelSubscribeCommand},
@@ -645,21 +656,6 @@ Status ModelManager::HandleCommand(const Command &command) {
   }
 }
 
-Status ModelManager::HandleAclProfilingCommand(const Command &command) {
-  if (command.cmd_params.size() < kCmdParSize) {
-    GELOGE(PARAM_INVALID, "When the cmd_type is 'profiling', the size of cmd_params must larger than 2.");
-    return PARAM_INVALID;
-  }
-
-  std::string map_key = command.cmd_params[0];
-  std::string value = command.cmd_params[1];
-  if (map_key == PROFILE_CONFIG) {
-    ProfilingManager::Instance().SetProfilingConfig(value);
-  }
-
-  return SUCCESS;
-}
-
 Status ModelManager::GetModelByCmd(const Command &command,
                                    std::shared_ptr<DavinciModel> &davinci_model) {
   if (command.cmd_params.size() < kCmdParSize) {
@@ -806,29 +802,6 @@ Status ModelManager::HandleProfStopCommand(const Command &command) {
   return SUCCESS;
 }
 
-Status ModelManager::HandleProfileCommand(const Command &command) {
-  if (command.cmd_params.size() < kCmdParSize) {
-    GELOGE(PARAM_INVALID, "When the cmd_type is 'profile', the size of cmd_params must larger than 2.");
-    return PARAM_INVALID;
-  }
-
-  std::string map_key = command.cmd_params[0];
-  std::string value = command.cmd_params[1];
-
-  GELOGI("Profiling mode, Command key:%s , value:%s ", map_key.c_str(), value.c_str());
-
-  auto iter = PROFILE_COMPONENT_MAP.find(map_key);
-  if (iter != PROFILE_COMPONENT_MAP.end()) {
-    std::string property_value = (value == "on") ? "1" : "0";
-    PropertiesManager::Instance().SetPropertyValue(iter->second, property_value);
-  }
-
-  if ((map_key == PROFILER_JOBCTX || map_key == PROFILER_TARGET_PATH || map_key == RTS_PROFILE_PATH)) {
-    PropertiesManager::Instance().SetPropertyValue(map_key, value);
-  }
-  return SUCCESS;
-}
-
 static Status ParserPara(const Command &command, const string &dump_key, string &dump_value) {
   auto iter = std::find(command.cmd_params.begin(), command.cmd_params.end(), dump_key);
   if (iter != command.cmd_params.end()) {
@@ -940,12 +913,10 @@ Status ModelManager::GetInputOutputDescInfo(const uint32_t model_id, vector<Inpu
   }
 
   std::shared_ptr<DavinciModel> davinci_model = GetModel(model_id);
-  GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, GE_EXEC_MODEL_ID_INVALID,
+  GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID,
                          "GetInputOutputDescInfo Failed, Invalid model id %u!", model_id);
 
-  davinci_model->SetModelDescVersion(new_model_desc);
-
-  return davinci_model->GetInputOutputDescInfo(input_desc, output_desc, inputFormats, outputFormats);
+  return davinci_model->GetInputOutputDescInfo(input_desc, output_desc, inputFormats, outputFormats, new_model_desc);
 }
 
 ///
@@ -1008,8 +979,9 @@ Status ModelManager::GetUserDesignateShapeOrder(const uint32_t model_id,
 }
 
 Status ModelManager::GetCurShape(const uint32_t model_id, std::vector<int64_t> &batch_info, int32_t &dynamic_type) {
-  std::shared_ptr<DavinciModel> davinci_model = GetModel(model_id);
-  GE_CHECK_NOTNULL(davinci_model);
+  auto davinci_model = GetModel(model_id);
+  GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID,
+                         "GetCurShape Failed, Invalid Model ID %u!", model_id);
   davinci_model->GetCurShape(batch_info, dynamic_type);
   return SUCCESS;
 }
@@ -1022,22 +994,12 @@ Status ModelManager::GetModelAttr(uint32_t model_id, std::vector<string> &dynami
   }
 
   std::shared_ptr<DavinciModel> davinci_model = GetModel(model_id);
-  GE_CHECK_NOTNULL(davinci_model);
+  GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID,
+                         "GetModelAttr Failed, Invalid Model ID %u!", model_id);
   davinci_model->GetModelAttr(dynamic_output_shape_info);
   return SUCCESS;
 }
 
-Status ModelManager::GetInputOutputDescInfoForZeroCopy(const uint32_t model_id, vector<InputOutputDescInfo> &input_desc,
-                                                       vector<InputOutputDescInfo> &output_desc,
-                                                       std::vector<uint32_t> &inputFormats,
-                                                       std::vector<uint32_t> &outputFormats) {
-  std::shared_ptr<DavinciModel> davinci_model = GetModel(model_id);
-  GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, "GetInputOutputDescInfo Failed, Invalid model id %u!",
-                         model_id);
-
-  return davinci_model->GetInputOutputDescInfoForZeroCopy(input_desc, output_desc, inputFormats, outputFormats);
-}
-
 ///
 /// @ingroup ge
 /// @brief Get AIPP info
@@ -1046,25 +1008,27 @@ Status ModelManager::GetInputOutputDescInfoForZeroCopy(const uint32_t model_id,
 /// @param [out] aipp_info
 /// @return execute result
 ///
-Status ModelManager::GetAIPPInfo(const uint32_t model_id, uint32_t index, AippConfigInfo &aipp_info) {
+Status ModelManager::GetAippInfo(const uint32_t model_id, uint32_t index, AippConfigInfo &aipp_info) {
   std::shared_ptr<DavinciModel> davinci_model = GetModel(model_id);
   GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID,
-                         "GetAIPPInfo failed, invalid model_id is %u.",
-                         model_id);
-
-  return davinci_model->GetAIPPInfo(index, aipp_info);
+      "GetAIPPInfo failed, invalid model_id is %u.", model_id);
+  return davinci_model->GetAippInfo(index, aipp_info);
 }
 
 Status ModelManager::GetAippType(uint32_t model_id, uint32_t index, InputAippType &type, size_t &aipp_index) {
   std::shared_ptr<DavinciModel> davinci_model = GetModel(model_id);
   GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID,
-                         "GetAIPPInfo failed, invalid model_id is %u.",
-                         model_id);
-
+      "GetAIPPInfo failed, invalid model_id is %u.", model_id);
   return davinci_model->GetAippType(index, type, aipp_index);
 }
 
 Status ModelManager::GenSessionId(uint64_t &session_id) {
+  const uint64_t kSessionTimeMask = 0xffffffffffff0000;
+  const uint64_t kSessionPidMask  = 0x000000000000ff00;
+  const uint64_t kSessionBiasMask = 0x00000000000000ff;
+
+  const uint64_t kMaskPerOffset = 8;
+
   std::lock_guard<std::mutex> lock(session_id_create_mutex_);
 
   mmTimeval tv;
@@ -1072,12 +1036,14 @@ Status ModelManager::GenSessionId(uint64_t &session_id) {
     GELOGE(INTERNAL_ERROR, "Failed to get current time.");
     return INTERNAL_ERROR;
   }
-  session_id = static_cast<uint64_t>(tv.tv_sec * 1000000 + tv.tv_usec);  // 1000000us
+  uint64_t timestamp = static_cast<uint64_t>(tv.tv_sec * kTimeSpecMiro + tv.tv_usec);  // 1000000us
+
+  static uint32_t pid = mmGetPid();
 
   session_id_bias_++;
-  // max bais 100.
-  session_id_bias_ = session_id_bias_ % 100;
-  session_id = session_id * 100 + session_id_bias_;
+
+  session_id = ((timestamp<<kMaskPerOffset<<kMaskPerOffset) & kSessionTimeMask) +
+               ((pid<<kMaskPerOffset) & kSessionPidMask) + (session_id_bias_ & kSessionBiasMask);
 
   GELOGD("Generate new session id: %lu.", session_id);
   return SUCCESS;
@@ -1086,15 +1052,22 @@ Status ModelManager::GenSessionId(uint64_t &session_id) {
 Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model, shared_ptr<ModelListener> listener,
                                       void *dev_ptr, size_t mem_size, void *weight_ptr, size_t weight_size) {
   GE_CHK_BOOL_RET_STATUS(model.key.empty() || mmAccess2(model.key.c_str(), M_F_OK) == EN_OK,
-	                 ACL_ERROR_GE_PARAM_INVALID,
-                         "input key file path %s is invalid, %s", model.key.c_str(), strerror(errno));
+      ACL_ERROR_GE_PARAM_INVALID, "input key file path %s is invalid, %s", model.key.c_str(), strerror(errno));
   GenModelId(&model_id);
 
   shared_ptr<DavinciModel> davinci_model = nullptr;
   mmTimespec timespec = mmGetTickCount();
 
   ModelHelper model_helper;
-  Status ret = model_helper.LoadModel(model);
+  Status ret = model_helper.LoadRootModel(model);
+  if (model_helper.GetModelType()) {
+    bool is_shape_unknown = false;
+    GE_CHK_STATUS_RET(model_helper.GetGeRootModel()->CheckIsUnknownShape(is_shape_unknown),
+                      "CheckIsUnknownShape failed, model id:%u", model_id);
+    if (is_shape_unknown || GetContext().GetHostExecFlag()) {
+      return DoLoadHybridModelOnline(model_id, model_helper.GetGeRootModel(), listener);
+    }
+  }
   if (ret != SUCCESS) {
     GELOGE(ret, "load model failed.");
     return ret;
@@ -1108,8 +1081,8 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model
       GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Make shared failed");
       return ACL_ERROR_GE_MEMORY_ALLOCATION;
     } catch (...) {
-      GELOGE(INTERNAL_ERROR, "Make shared failed since other exception raise");
-      return INTERNAL_ERROR;
+      GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Make shared failed since other exception raise");
+      return ACL_ERROR_GE_MEMORY_ALLOCATION;
     }
     ret = davinci_model->Assign(ge_model);
     if (ret != SUCCESS) {
@@ -1121,7 +1094,7 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model
     int32_t device_id = 0;
     rtError_t rt_ret = rtGetDevice(&device_id);
     if (rt_ret != RT_ERROR_NONE || device_id < 0) {
-      GELOGE(RT_FAILED, "Call rtGetDevice failed, ret = 0x%X, device_id = %d.", rt_ret, device_id);
+      GELOGE(rt_ret, "Call rtGetDevice failed, ret = 0x%X, device_id = %d.", rt_ret, device_id);
       return RT_ERROR_TO_GE_STATUS(rt_ret);
     }
     davinci_model->SetDeviceId(device_id);
@@ -1148,7 +1121,7 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model
 
     GELOGI("Parse model %u success.", model_id);
 
-    davinci_model->SetProfileTime(MODEL_LOAD_START, (timespec.tv_sec * 1000 * 1000 * 1000 +
+    davinci_model->SetProfileTime(MODEL_LOAD_START, (timespec.tv_sec * kTimeSpecNano +
                                                      timespec.tv_nsec));  // 1000 ^ 3 converts second to nanosecond
     davinci_model->SetProfileTime(MODEL_LOAD_END);
 
@@ -1252,14 +1225,16 @@ Status ModelManager::ExecuteModel(uint32_t model_id, rtStream_t stream, bool asy
   }
 
   std::shared_ptr<DavinciModel> davinci_model = GetModel(model_id);
-  GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, "Invalid model id %u.", model_id);
+  GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, ACL_ERROR_GE_EXEC_MODEL_ID_INVALID,
+                         "Invalid model id %u, check whether model has been loaded or not.", model_id);
 
   if (davinci_model->NeedDestroyAicpuKernel()) {
     GELOGI("Start to destroy specified aicpu kernel.");
     // Zero copy is enabled by default, no need to judge.
     uint64_t session_id_davinci = davinci_model->GetSessionId();
     uint32_t model_id_davinci = davinci_model->GetModelId();
-    Status status = DestroyAicpuKernel(session_id_davinci, model_id_davinci);
+    uint32_t sub_model_id = davinci_model->SubModelId();
+    Status status = DestroyAicpuKernel(session_id_davinci, model_id_davinci, sub_model_id);
     if (status != SUCCESS) {
       GELOGW("Destroy specified aicpu kernel failed, session id is %lu, model id is %u.", session_id_davinci,
              model_id_davinci);
@@ -1275,11 +1250,11 @@ Status ModelManager::ExecuteModel(uint32_t model_id, rtStream_t stream, bool asy
 }
 
 Status ModelManager::CreateAicpuSession(uint64_t session_id) {
-  std::lock_guard<std::mutex> lock(sess_ids_mutex_);
+  std::lock_guard<std::recursive_mutex> lock(map_mutex_);
   auto it = sess_ids_.find(session_id);
   // never been created by any model
   if (it == sess_ids_.end()) {
-    Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_SESSION_CREATE, session_id, 0);
+    Status ret = KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType::FWK_ADPT_SESSION_CREATE, session_id, 0, 0);
     if (ret == SUCCESS) {
       (void)sess_ids_.insert(session_id);
       GELOGI("The session: %lu create success.", session_id);
@@ -1289,13 +1264,13 @@ Status ModelManager::CreateAicpuSession(uint64_t session_id) {
   return SUCCESS;
 }
 
-Status ModelManager::LoadCustAicpuSo(const OpDescPtr &op_desc, const string &so_name) {
-  GELOGI("LoadCustAicpuSo in, op name %s, so name %s", op_desc->GetName().c_str(), so_name.c_str());
+Status ModelManager::LoadCustAicpuSo(const OpDescPtr &op_desc, const string &so_name, bool &loaded) {
+  GELOGD("LoadCustAicpuSo in, op name %s, so name %s", op_desc->GetName().c_str(), so_name.c_str());
   std::lock_guard<std::mutex> lock(cust_aicpu_mutex_);
   CustAICPUKernelPtr aicpu_kernel = op_desc->TryGetExtAttr(OP_EXTATTR_CUSTAICPU_KERNEL, CustAICPUKernelPtr());
   if (aicpu_kernel == nullptr) {
-    GELOGE(INTERNAL_ERROR, "cust aicpu op %s can't find kernel!", op_desc->GetName().c_str());
-    return INTERNAL_ERROR;
+    GELOGI("cust aicpu op %s has no corresponding kernel!", op_desc->GetName().c_str());
+    return SUCCESS;
   }
 
   // get current context
@@ -1313,18 +1288,24 @@ Status ModelManager::LoadCustAicpuSo(const OpDescPtr &op_desc, const string &so_
     std::map<string, CustAICPUKernelPtr> new_so_name;
     new_so_name.insert({so_name, aicpu_kernel});
     cust_aicpu_so_[resource_id] = new_so_name;
-    GELOGI("LoadCustAicpuSo new aicpu so resource id %lu", resource_id);
+    loaded = false;
+    GELOGD("LoadCustAicpuSo new aicpu so name %s, resource id %lu", so_name.c_str(), resource_id);
     return SUCCESS;
   }
   auto it_so_name = it->second.find(so_name);
   if (it_so_name == it->second.end()) {
     it->second.insert({so_name, aicpu_kernel});
-    GELOGI("LoadCustAicpuSo add aicpu so resource id %lu", resource_id);
+    loaded = false;
+    GELOGD("LoadCustAicpuSo add aicpu so name %s, resource id %lu", so_name.c_str(), resource_id);
+    return SUCCESS;
   }
+  loaded = true;
+  GELOGD("LoadCustAicpuSo so name %s has been loaded.", so_name.c_str());
   return SUCCESS;
 }
 
 Status ModelManager::LaunchKernelCustAicpuSo(const string &kernel_name) {
+  GELOGD("Aicpu kernel launch task in, kernel name %s.", kernel_name.c_str());
   std::lock_guard<std::mutex> lock(cust_aicpu_mutex_);
   if (cust_aicpu_so_.size() == 0) return SUCCESS;
   // get current context
@@ -1488,8 +1469,7 @@ void ModelManager::GenModelId(uint32_t *id) {
   if (id == nullptr) {
     return;
   }
-
-  std::lock_guard<std::mutex> lock(map_mutex_);
+  std::lock_guard<std::recursive_mutex> lock(map_mutex_);
   *id = ++max_model_id_;
 }
 
@@ -1561,4 +1541,205 @@ Status ModelManager::EnableExceptionDump(const std::map<string, string> &options
   return SUCCESS;
 }
 
+Status ModelManager::LaunchKernelCheckAicpuOp(std::vector<std::string> &aicpu_optype_list,
+                                              std::vector<std::string> &aicpu_tf_optype_list) {
+  std::string kernel_name = "checkOpType";
+  GELOGI("LaunchKernelCheckAicpuOpType in, kernel name %s", kernel_name.c_str());
+  std::lock_guard<std::mutex> lock(cust_aicpu_mutex_);
+  std::vector<SysOpInfo> req_aicpu_op_info_list;
+  std::vector<SysOpInfo> res_aicpu_op_info_list;
+  std::vector<ReturnCode> res_ret_code_list;
+
+  if (aicpu_optype_list.empty() && aicpu_tf_optype_list.empty()) {
+    GELOGI("No need to check aicpu op type.");
+    return SUCCESS;
+  }
+
+  vector<void *> allocated_mem;
+  rtError_t status;
+  rtStream_t stream = nullptr;
+  void *args = nullptr;
+
+  void *d_req_op_list = nullptr;
+  void *d_res_op_list = nullptr;
+  void *d_ret_code_list = nullptr;
+
+  size_t aicpu_op_nums = aicpu_optype_list.size();
+  size_t tf_op_nums = aicpu_tf_optype_list.size();
+  size_t op_nums = aicpu_op_nums + tf_op_nums;
+  std::function<void()> callback = [&]() {
+    for (auto mem : allocated_mem) {
+      GE_CHK_RT(rtFree(mem));
+    }
+  };
+  GE_MAKE_GUARD(release, callback);
+  // malloc sysOpInfoList in SysOpCheckInfo
+  status = rtMalloc(&d_req_op_list, op_nums * sizeof(SysOpInfo), RT_MEMORY_HBM);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status);
+    return RT_ERROR_TO_GE_STATUS(status);
+  }
+  allocated_mem.push_back(d_req_op_list);
+
+  // malloc sysOpInfoList in SysOpCheckResp
+  status = rtMalloc(&d_res_op_list, op_nums * sizeof(SysOpInfo), RT_MEMORY_HBM);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status);
+    return RT_ERROR_TO_GE_STATUS(status);
+  }
+  allocated_mem.push_back(d_res_op_list);
+
+  // malloc returnCodeList in SysOpCheckResp
+  status = rtMalloc(&d_ret_code_list, op_nums * sizeof(ReturnCode), RT_MEMORY_HBM);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status);
+    return RT_ERROR_TO_GE_STATUS(status);
+  }
+  allocated_mem.push_back(d_ret_code_list);
+
+  for (const auto &op_type : aicpu_optype_list) {
+    SysOpInfo op_info;
+    // malloc op_type name in SysOpInfo
+    void *d_op_type_name = nullptr;
+    status = rtMalloc(&d_op_type_name, op_type.length(), RT_MEMORY_HBM);
+    if (status != RT_ERROR_NONE) {
+      GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status);
+      return RT_ERROR_TO_GE_STATUS(status);
+    }
+    allocated_mem.push_back(d_op_type_name);
+    GE_CHK_RT(rtMemcpy(d_op_type_name, op_type.length(), op_type.c_str(), op_type.length(), RT_MEMCPY_HOST_TO_DEVICE));
+    op_info.opType = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_op_type_name));
+    op_info.opLen = op_type.length();
+    op_info.kernelsType = CPU_KERNEL;
+    req_aicpu_op_info_list.emplace_back(op_info);
+  }
+
+  for (const auto &op_type : aicpu_tf_optype_list) {
+    SysOpInfo op_info;
+    // malloc op_type name in SysOpInfo
+    void *d_op_type_name = nullptr;
+    status = rtMalloc(&d_op_type_name, op_type.size(), RT_MEMORY_HBM);
+    if (status != RT_ERROR_NONE) {
+      GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status);
+      return RT_ERROR_TO_GE_STATUS(status);
+    }
+    allocated_mem.push_back(d_op_type_name);
+    GE_CHK_RT(rtMemcpy(d_op_type_name, op_type.size(), op_type.c_str(), op_type.size(), RT_MEMCPY_HOST_TO_DEVICE));
+    op_info.opType = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_op_type_name));
+    op_info.opLen = op_type.size();
+    op_info.kernelsType = TF_KERNEL;
+    req_aicpu_op_info_list.emplace_back(op_info);
+  }
+  GELOGI("Check aicpu op all attr size: %zu, real attr size: %zu.", op_nums, req_aicpu_op_info_list.size());
+  GE_CHK_RT(rtMemcpy(d_req_op_list, sizeof(SysOpInfo) * req_aicpu_op_info_list.size(), req_aicpu_op_info_list.data(),
+                     sizeof(SysOpInfo) * req_aicpu_op_info_list.size(), RT_MEMCPY_HOST_TO_DEVICE));
+
+  SysOpCheckInfo op_check_info_req = { 0 };
+  SysOpCheckResp op_check_info_res = { 0 };
+  op_check_info_req.opListNum = op_nums;
+  op_check_info_req.offSetLen = sizeof(SysOpCheckInfo);
+  op_check_info_req.sysOpInfoList = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_req_op_list));
+
+  op_check_info_res.opListNum = 0;
+  op_check_info_res.isWithoutJson = 0;
+  op_check_info_res.returnCodeList = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_ret_code_list));
+  op_check_info_res.sysOpInfoList = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(d_res_op_list));
+
+  uint32_t args_size = sizeof(SysOpCheckInfo) + sizeof(SysOpCheckResp);
+  status = rtMalloc(&args, args_size, RT_MEMORY_HBM);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt failed, status: 0x%x", status);
+    return RT_ERROR_TO_GE_STATUS(status);
+  }
+  allocated_mem.push_back(args);
+  GE_CHK_RT(rtMemcpy(args, sizeof(SysOpCheckInfo), reinterpret_cast<void *>(&op_check_info_req), sizeof(SysOpCheckInfo),
+                     RT_MEMCPY_HOST_TO_DEVICE));
+  GE_CHK_RT(rtMemcpy(
+    reinterpret_cast<void *>(static_cast<uintptr_t>(static_cast<uint64_t>(reinterpret_cast<uintptr_t>(args)) +
+    op_check_info_req.offSetLen)), sizeof(SysOpCheckResp), reinterpret_cast<void *>(&op_check_info_res),
+    sizeof(SysOpCheckResp), RT_MEMCPY_HOST_TO_DEVICE));
+  GE_CHK_RT(rtStreamCreate(&stream, 0));
+  GE_CHK_RT(rtCpuKernelLaunch(nullptr, kernel_name.c_str(), 1, args, args_size, nullptr, stream));
+
+  status = rtStreamSynchronize(stream);
+  if (status != RT_ERROR_NONE) {
+    GELOGE(RT_FAILED, "Call rt stream sync failed, status: 0x%x", status);
+    GE_CHK_RT(rtStreamDestroy(stream));
+    return RT_ERROR_TO_GE_STATUS(status);
+  }
+
+  // Check the response
+  SysOpCheckResp *d_op_check_info_res =
+    reinterpret_cast<SysOpCheckResp *>(reinterpret_cast<void *>(static_cast<uintptr_t>(static_cast<uint64_t>(
+    reinterpret_cast<uintptr_t>(args)) + op_check_info_req.offSetLen)));
+  (void)memset_s(&op_check_info_res, sizeof(SysOpCheckResp), 0, sizeof(SysOpCheckResp));
+  GE_CHK_RT(rtMemcpy(&op_check_info_res, sizeof(SysOpCheckResp), d_op_check_info_res, sizeof(SysOpCheckResp),
+                     RT_MEMCPY_DEVICE_TO_HOST));
+
+  if (op_check_info_res.isWithoutJson) {
+    GELOGI("No need to check aicpu in this scenoria.");
+    GE_CHK_RT(rtStreamDestroy(stream));
+    return SUCCESS;
+  }
+  uint64_t res_op_nums = op_check_info_res.opListNum;
+  GELOGI("Check aicpu type, is without json: %d, res op num: %lu.", op_check_info_res.isWithoutJson, res_op_nums);
+  if (res_op_nums != 0) {
+    res_ret_code_list.clear();
+    res_ret_code_list.resize(res_op_nums);
+    res_aicpu_op_info_list.clear();
+    res_aicpu_op_info_list.resize(res_op_nums);
+    GE_CHK_RT(rtMemcpy(res_ret_code_list.data(), sizeof(ReturnCode) * res_op_nums,
+                       reinterpret_cast<void *>(static_cast<uintptr_t>(op_check_info_res.returnCodeList)),
+                       sizeof(ReturnCode) * res_op_nums, RT_MEMCPY_DEVICE_TO_HOST));
+    GE_CHK_RT(rtMemcpy(res_aicpu_op_info_list.data(), sizeof(SysOpInfo) * res_op_nums,
+                       reinterpret_cast<void *>(static_cast<uintptr_t>(op_check_info_res.sysOpInfoList)),
+                       sizeof(SysOpInfo) * res_op_nums, RT_MEMCPY_DEVICE_TO_HOST));
+    if (res_ret_code_list.size() != res_aicpu_op_info_list.size() || res_ret_code_list.size() != res_op_nums) {
+      GELOGE(FAILED, "Number of retcode is not equal to number of op type.");
+      GE_CHK_RT(rtStreamDestroy(stream));
+      return FAILED;
+    }
+    std::string fail_reason;
+    for (uint32_t i = 0; i < res_op_nums; i++) {
+      ReturnCode ret_code = res_ret_code_list.at(i);
+      SysOpInfo aicpu_info = res_aicpu_op_info_list.at(i);
+      GELOGI("Not support aicpu op type: %lu, kernel_type:%d, opLen:%d, ret_code:%d", aicpu_info.opType,
+             aicpu_info.kernelsType, aicpu_info.opLen, ret_code);
+      std::vector<char> op_name;
+      op_name.clear();
+      op_name.resize(kOpNameMaxSize);
+      GE_CHK_RT(rtMemcpy(op_name.data(), aicpu_info.opLen, reinterpret_cast<void *>(aicpu_info.opType),
+                         aicpu_info.opLen, RT_MEMCPY_DEVICE_TO_HOST));
+      std::string kernel_type =
+          (static_cast<OpKernelType>(aicpu_info.kernelsType) == TF_KERNEL) ? "TF_KERNEL" : "CPU_KERNEL";
+      string op_name_str(op_name.data());
+      fail_reason += "op_type: " + op_name_str + " kernel_type: " + kernel_type +
+                     "  ret code:" + std::to_string(static_cast<int>(ret_code)) +
+                     "<0: op_type, 1: format, 2: datatype> \n";
+    }
+    fail_reason += "not support.";
+    GELOGE(FAILED, "Check aicpu op_type failed. details: %s", fail_reason.c_str());
+    GE_CHK_RT(rtStreamDestroy(stream));
+    return FAILED;
+  }
+
+  GE_CHK_RT(rtStreamDestroy(stream));
+  GELOGI("Cpu kernel launch check optype task success.");
+  return SUCCESS;
+}
+
+Status ModelManager::CheckAicpuOpList(GeModelPtr ge_model) {
+  std::vector<std::string> aicpu_optype_list;
+  std::vector<std::string> aicpu_tf_optype_list;
+  bool aicpu_need_check = ge::AttrUtils::GetListStr(ge_model, "needCheckCpu", aicpu_optype_list);
+  bool tf_need_check = ge::AttrUtils::GetListStr(ge_model, "needCheckTf", aicpu_tf_optype_list);
+  if (!aicpu_need_check && !tf_need_check) {
+    GELOGI("Graph:%s No need to check aicpu optype.", ge_model->GetGraph().GetName().c_str());
+    return SUCCESS;
+  }
+  GE_CHK_STATUS_RET(LaunchKernelCheckAicpuOp(aicpu_optype_list, aicpu_tf_optype_list),
+                    "Launch check aicpu op type failed.");
+  return SUCCESS;
+}
+
 }  // namespace ge
diff --git a/ge/graph/load/new_model_manager/model_manager.h b/ge/graph/load/new_model_manager/model_manager.h
index e3780d5b..aa0753b1 100755
--- a/ge/graph/load/new_model_manager/model_manager.h
+++ b/ge/graph/load/new_model_manager/model_manager.h
@@ -126,14 +126,14 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
   ///
   /// @ingroup domi_ome
   /// @brief Get cur_dynamic_dims for all input.
-  /// @param [in] vector<vector<uint64_t>> &user_real_input_dims: dims info of all user_inputs.
+  /// @param [in] vector<vector<int64_t>> &user_real_input_dims: dims info of all user_inputs.
   /// @param [in] vector<pair<string, vector<int64_t>>> &user_input_dims: key:name. value:dynamic dims from option.
-  /// @param [out] vector<uint64_t> &cur_dynamic_dims: real dims gather, where the index of -1.
+  /// @param [out] vector<int32_t> &cur_dynamic_dims: real dims gather, where the index of -1.
   /// @return 0: SUCCESS / others: INTERNAL_ERROR
   ///
   Status GetCurDynamicDims(const vector<vector<int64_t>> &user_real_input_dims,
                            const vector<pair<string, vector<int64_t>>> &user_input_dims,
-                           vector<int64_t> &cur_dynamic_dims);
+                           vector<int32_t> &cur_dynamic_dims);
 
   ///
   /// @ingroup domi_ome
@@ -169,8 +169,6 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
   /// @brief comment handle function
   ///
   ge::Status HandleCommand(const Command &command);
-  static ge::Status HandleAclProfilingCommand(const Command &command);
-  static ge::Status HandleProfileCommand(const Command &command);
   static ge::Status HandleDumpCommand(const Command &command);
   static ge::Status HandleProfModelSubscribeCommand(const Command &command);
   static ge::Status HandleProfModelUnsubscribeCommand(const Command &command);
@@ -241,24 +239,10 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
   /// @param [out] aipp_info
   /// @return execute result
   ///
-  ge::Status GetAIPPInfo(const uint32_t model_id, uint32_t index, AippConfigInfo &aipp_info);
+  ge::Status GetAippInfo(const uint32_t model_id, uint32_t index, AippConfigInfo &aipp_info);
 
   ge::Status GetAippType(uint32_t model_id, uint32_t index, InputAippType &type, size_t &aipp_index);
 
-  ///
-  /// @ingroup domi_ome
-  /// @brief set model input and output size zero copy
-  /// @param [in] model_id  model id
-  /// @param [out] input_shape   input tensor
-  /// @param [out] output_shape  output tensor
-  /// @return SUCCESS          success
-  /// @return PARAM_INVALID    parameter invalid
-  ///
-  ge::Status GetInputOutputDescInfoForZeroCopy(const uint32_t model_id, std::vector<InputOutputDescInfo> &input_desc,
-                                               std::vector<InputOutputDescInfo> &output_desc,
-                                               std::vector<uint32_t> &inputFormats,
-                                               std::vector<uint32_t> &outputFormats);
-
   ge::Status GetCurShape(const uint32_t model_id, std::vector<int64_t> &batch_info, int32_t &dynamic_type);
 
   ge::Status GetModelAttr(uint32_t model_id, std::vector<string> &dynamic_output_shape_info);
@@ -275,7 +259,8 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
 
   std::shared_ptr<hybrid::HybridDavinciModel> GetHybridModel(uint32_t id);
 
-  ge::Status KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, uint64_t session_id, uint32_t model_id);
+  ge::Status KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, uint64_t session_id, uint32_t model_id,
+                            uint32_t sub_model_id);
 
   ge::Status CreateAicpuSession(uint64_t session_id);
 
@@ -283,13 +268,13 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
 
   void DestroyAicpuSession(uint64_t session_id);
 
-  ge::Status DestroyAicpuKernel(uint64_t session_id, uint32_t model_id);
+  ge::Status DestroyAicpuKernel(uint64_t session_id, uint32_t model_id, uint32_t sub_model_id);
 
-  ge::Status CreateAicpuKernel(uint64_t session_id, uint32_t model_id, uint64_t kernel_id);
+  ge::Status CreateAicpuKernel(uint64_t session_id, uint32_t model_id, uint32_t sub_model_id, uint64_t kernel_id);
 
   ge::Status DestroyAicpuSessionForInfer(uint32_t model_id);
 
-  ge::Status LoadCustAicpuSo(const OpDescPtr &op_desc, const string &so_name);
+  ge::Status LoadCustAicpuSo(const OpDescPtr &op_desc, const string &so_name, bool &loaded);
 
   ge::Status LaunchCustAicpuSo();
 
@@ -297,6 +282,11 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
 
   ge::Status LaunchKernelCustAicpuSo(const string &kernel_name);
 
+  ge::Status LaunchKernelCheckAicpuOp(std::vector<std::string> &aicpu_optype_list,
+                                      std::vector<std::string> &aicpu_tf_optype_list);
+
+  ge::Status CheckAicpuOpList(GeModelPtr ge_model);
+
   ge::Status GetOrigInputInfo(uint32_t model_id, uint32_t index, OriginInputInfo &orig_input_info);
 
   ge::Status GenSessionId(uint64_t &session_id);
@@ -355,8 +345,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
   std::map<uint32_t, std::shared_ptr<hybrid::HybridDavinciModel>> hybrid_model_map_;
   std::map<std::string, std::vector<uint64_t>> model_aicpu_kernel_;
   uint32_t max_model_id_;
-  std::mutex map_mutex_;
-  std::mutex sess_ids_mutex_;
+  std::recursive_mutex map_mutex_;
   std::mutex session_id_create_mutex_;
   static::std::mutex exeception_infos_mutex_;
   uint64_t session_id_bias_;
diff --git a/ge/graph/load/new_model_manager/model_utils.cc b/ge/graph/load/new_model_manager/model_utils.cc
index 34fb7ff3..22a657ad 100755
--- a/ge/graph/load/new_model_manager/model_utils.cc
+++ b/ge/graph/load/new_model_manager/model_utils.cc
@@ -61,7 +61,7 @@ vector<int64_t> ModelUtils::GetInputSize(ConstOpDescPtr op_desc) {
       GELOGI("Get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i);
       continue);
 
-    GELOGI("[IMAS]GetInputSize op: %s, index: %zu, size:%ld", op_desc->GetName().c_str(), i, tensor_size);
+    GELOGI("GetInputSize op: %s, index: %zu, size:%ld", op_desc->GetName().c_str(), i, tensor_size);
     v_input_size.push_back(tensor_size);
   }
 
@@ -96,7 +96,7 @@ vector<int64_t> ModelUtils::GetOutputSize(ConstOpDescPtr op_desc) {
       GELOGI("Get size from TensorDesc failed, op : %s, output index : %zu", op_desc->GetName().c_str(), i);
       continue);
 
-    GELOGI("[IMAS]GetOutputSize op: %s, index: %zu, size:%ld", op_desc->GetName().c_str(), i, tensor_size);
+    GELOGI("GetOutputSize op: %s, index: %zu, size:%ld", op_desc->GetName().c_str(), i, tensor_size);
     v_output_size.push_back(tensor_size);
   }
 
diff --git a/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc b/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc
index b09a4fce..7b18a9a3 100644
--- a/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc
+++ b/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc
@@ -24,10 +24,6 @@
 #include "graph/load/new_model_manager/model_utils.h"
 
 namespace ge {
-namespace {
-const uint32_t kMaxTaskOfStream = 200;
-}
-
 std::mutex HcclTaskInfo::hccl_follow_stream_mutex_;
 
 HcclTaskInfo::~HcclTaskInfo() {
@@ -59,40 +55,40 @@ Status HcclTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_m
   GELOGI("HcclTaskInfo Init, op_index is: %u", op_index);
 
   // Get HCCL op
-  op_desc_ = davinci_model->GetOpByIndex(op_index);
-  GE_CHECK_NOTNULL(op_desc_);
+  const auto op_desc = davinci_model_->GetOpByIndex(op_index);
+  GE_CHECK_NOTNULL(op_desc);
 
   // Create the kernel hccl infos
-  CreateKernelHcclInfo(op_desc_);
+  CreateKernelHcclInfo(op_desc);
 
   // Initialize the hccl_type of all kernel hccl info
   HcomOmeUtil::GetHcclType(task_def, kernel_hccl_infos_);
 
   // Only in Horovod scenario should get the inputName and GeShape
-  ret = HcomOmeUtil::GetHorovodInputs(op_desc_, kernel_hccl_infos_);
+  ret = HcomOmeUtil::GetHorovodInputs(op_desc, kernel_hccl_infos_);
   if (ret != SUCCESS) {
     GELOGE(ret, "davinci_model: GetHorovodInputs fail! domi error: %u", ret);
     return ret;
   }
-  Status dmrt = HcomOmeUtil::GetHcclDataType(op_desc_, kernel_hccl_infos_);
+  Status dmrt = HcomOmeUtil::GetHcclDataType(op_desc, kernel_hccl_infos_);
   if (dmrt != SUCCESS) {
     GELOGE(dmrt, "davinci_model: GetHcomDataType fail! domi error: %u", dmrt);
     return dmrt;
   }
-  dmrt = HcomOmeUtil::GetHcclCount(op_desc_, kernel_hccl_infos_);
+  dmrt = HcomOmeUtil::GetHcclCount(op_desc, kernel_hccl_infos_);
   if (dmrt != SUCCESS) {
     GELOGE(dmrt, "davinci_model: GetHcomCount fail! domi error: %u", dmrt);
     return dmrt;
   }
   // Only HCOMBROADCAST and HVDCALLBACKBROADCAST need to get the rootId
-  dmrt = HcomOmeUtil::GetAllRootId(op_desc_, kernel_hccl_infos_);
+  dmrt = HcomOmeUtil::GetAllRootId(op_desc, kernel_hccl_infos_);
   if (dmrt != SUCCESS) {
     GELOGE(dmrt, "davinci_model: Get rootId fail! domi error: %u", dmrt);
     return dmrt;
   }
 
   // GE's new process: hccl declares the number of streams required, creates a stream by GE, and sends it to hccl
-  ret = SetFollowStream(op_desc_, davinci_model);
+  ret = SetFollowStream(op_desc, davinci_model);
   if (ret != SUCCESS) {
     GELOGE(ret, "SetStream Fail.");
     return ret;
@@ -100,21 +96,22 @@ Status HcclTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_m
 
   if (davinci_model_->IsKnownNode()) {
     args_ = davinci_model_->GetCurrentArgsAddr(args_offset_);
-    GELOGI("Known node %s args addr %p, offset %u.", op_desc_->GetName().c_str(), args_, args_offset_);
+    GELOGI("Known node %s args addr %p, offset %u.", op_desc->GetName().c_str(), args_, args_offset_);
   }
 
-  ret = SetAddrs(op_desc_, kernel_hccl_infos_);
+  ret = SetAddrs(op_desc, kernel_hccl_infos_);
   if (ret != SUCCESS) {
     GELOGE(ret, "Setaddrs Fail.");
     return ret;
   }
   // GE's new process: hccl declares the need for Workspace size, and GE allocates Workspace
-  ret = SetWorkspace(op_desc_, kernel_hccl_infos_);
+  ret = SetWorkspace(op_desc, kernel_hccl_infos_);
   if (ret != SUCCESS) {
     GELOGE(ret, "SetWorkspace Fail.");
     return ret;
   }
 
+  SetIoAddrs(op_desc);
   GELOGI("HcclTaskInfo Init Success");
   return SUCCESS;
 }
@@ -144,7 +141,9 @@ Status HcclTaskInfo::SetFollowStream(const ge::ConstOpDescPtr &op_desc, DavinciM
     } else {
       GELOGI("need to reuse follow stream and create new follow stream.");
       size_t created_stream_num = follow_stream_usage.size();
-      hccl_stream_list_ = follow_stream_usage;
+      for (const auto &stream : follow_stream_usage) {
+        hccl_stream_list_.emplace_back(stream);
+      }
       ret = CreateStream(hccl_stream_num - created_stream_num, davinci_model, main_stream_id);
       if (ret != SUCCESS) {
         GELOGE(RT_FAILED, "Create hccl stream failed.");
@@ -229,20 +228,19 @@ Status HcclTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel *
   return SUCCESS;
 }
 
-Status HcclTaskInfo::UpdateArgs() {
-  GELOGI("HcclTaskInfo::UpdateArgs in.");
+void HcclTaskInfo::SetIoAddrs(const OpDescPtr &op_desc) {
   const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam();
-  input_data_addrs_ = ModelUtils::GetInputDataAddrs(rts_param, op_desc_);
-  output_data_addrs_ = ModelUtils::GetOutputDataAddrs(rts_param, op_desc_);
-  workspace_data_addrs_ = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc_);
-
-  vector<void *> io_addrs;
-  io_addrs.insert(io_addrs.end(), input_data_addrs_.begin(), input_data_addrs_.end());
-  io_addrs.insert(io_addrs.end(), output_data_addrs_.begin(), output_data_addrs_.end());
-  io_addrs.insert(io_addrs.end(), workspace_data_addrs_.begin(), workspace_data_addrs_.end());
-
-  davinci_model_->SetTotalIOAddrs(io_addrs);
+  const auto input_data_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc);
+  const auto output_data_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc);
+  const auto workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc);
+  io_addrs_.insert(io_addrs_.end(), input_data_addrs.begin(), input_data_addrs.end());
+  io_addrs_.insert(io_addrs_.end(), output_data_addrs.begin(), output_data_addrs.end());
+  io_addrs_.insert(io_addrs_.end(), workspace_data_addrs.begin(), workspace_data_addrs.end());
+}
 
+Status HcclTaskInfo::UpdateArgs() {
+  GELOGI("HcclTaskInfo::UpdateArgs in.");
+  davinci_model_->SetTotalIOAddrs(io_addrs_);
   GELOGI("HcclTaskInfo::UpdateArgs success.");
   return SUCCESS;
 }
@@ -261,9 +259,11 @@ Status HcclTaskInfo::SetAddrs(const std::shared_ptr<OpDesc> &op_desc,
   HcclReduceOp op_type = HCCL_REDUCE_SUM;
   GE_CHECK_NOTNULL(davinci_model_);
   GELOGI("Calc opType[%s] input address before. Node name[%s]", op_desc->GetType().c_str(), op_desc->GetName().c_str());
+  vector<void *> input_data_addrs;
+  vector<void *> output_data_addrs;
   if (!davinci_model_->IsKnownNode()) {
-    input_data_addrs_ = ModelUtils::GetInputDataAddrs(davinci_model_->GetRuntimeParam(), op_desc);
-    output_data_addrs_ = ModelUtils::GetOutputDataAddrs(davinci_model_->GetRuntimeParam(), op_desc);
+    input_data_addrs = ModelUtils::GetInputDataAddrs(davinci_model_->GetRuntimeParam(), op_desc);
+    output_data_addrs = ModelUtils::GetOutputDataAddrs(davinci_model_->GetRuntimeParam(), op_desc);
   }
   void *input_data_addr = nullptr;
   void *output_data_addr = nullptr;
@@ -275,13 +275,14 @@ Status HcclTaskInfo::SetAddrs(const std::shared_ptr<OpDesc> &op_desc,
       output_data_addr = reinterpret_cast<void *>(reinterpret_cast<uint64_t *>(args_) + op_desc->GetInputsSize() + i);
       GELOGI("Hccl task info known input addr %p, output addr %p.", input_data_addr, output_data_addr);
     } else {
-      input_data_addr = input_data_addrs_.empty() ? nullptr : input_data_addrs_[i];
-      output_data_addr = output_data_addrs_.empty() ? nullptr : output_data_addrs_[i];
+      input_data_addr = input_data_addrs.empty() ? nullptr : input_data_addrs[i];
+      output_data_addr = output_data_addrs.empty() ? nullptr : output_data_addrs[i];
     }
     kernel_hccl_infos[i].inputDataAddr = input_data_addr;
-    if (hccl_type == HCOMALLGATHER || hccl_type == HCOMRECEIVE || hccl_type == HVDCALLBACKALLGATHER || hccl_type == HCOMREDUCE) {
+    if (hccl_type == HCOMALLGATHER || hccl_type == HCOMRECEIVE || hccl_type == HVDCALLBACKALLGATHER) {
       kernel_hccl_infos[i].outputDataAddr = output_data_addr;
-    } else if (hccl_type == HCOMALLREDUCE || hccl_type == HCOMREDUCESCATTER || hccl_type == HVDCALLBACKALLREDUCE) {
+    } else if (hccl_type == HCOMALLREDUCE ||
+               hccl_type == HCOMREDUCESCATTER || hccl_type == HVDCALLBACKALLREDUCE || hccl_type == HCOMREDUCE) {
       GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclOperationType(op_desc, op_type),
                         "davinci_model: GetHcomOperationType fail!");
       kernel_hccl_infos[i].outputDataAddr = output_data_addr;
@@ -365,8 +366,8 @@ Status HcclTaskInfo::SetWorkspace(const std::shared_ptr<OpDesc> &op_desc,
         workspace_addr = reinterpret_cast<void *>(reinterpret_cast<uint64_t *>(args_) + op_desc->GetInputsSize() +
                                                   op_desc->GetOutputsSize());
       } else {
-        workspace_data_addrs_ = ModelUtils::GetWorkspaceDataAddrs(davinci_model_->GetRuntimeParam(), op_desc);
-        workspace_addr = workspace_data_addrs_.empty() ? nullptr : workspace_data_addrs_[0];
+        const auto workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(davinci_model_->GetRuntimeParam(), op_desc);
+        workspace_addr = workspace_data_addrs.empty() ? nullptr : workspace_data_addrs[0];
       }
     }
   }
diff --git a/ge/graph/load/new_model_manager/task_info/hccl_task_info.h b/ge/graph/load/new_model_manager/task_info/hccl_task_info.h
index f7ce3468..777f5bbf 100644
--- a/ge/graph/load/new_model_manager/task_info/hccl_task_info.h
+++ b/ge/graph/load/new_model_manager/task_info/hccl_task_info.h
@@ -35,7 +35,6 @@ class HcclTaskInfo : public TaskInfo {
         ops_kernel_store_(nullptr),
         private_def_(nullptr),
         private_def_len_(0),
-        op_desc_(nullptr),
         args_(nullptr),
         args_offset_(0) {}
 
@@ -52,7 +51,7 @@ class HcclTaskInfo : public TaskInfo {
   Status UpdateArgs() override;
 
  private:
-  ge::Status SetAddrs(const std::string &hccl_type, const std::shared_ptr<OpDesc> &op);
+  void SetIoAddrs(const OpDescPtr &op_desc);
 
   Status SetAddrs(const std::shared_ptr<OpDesc> &op_desc, std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos);
 
@@ -76,10 +75,7 @@ class HcclTaskInfo : public TaskInfo {
   uint32_t private_def_len_;
   static std::mutex hccl_follow_stream_mutex_;
   vector<GETaskKernelHcclInfo> kernel_hccl_infos_;
-  vector<void *> input_data_addrs_;
-  vector<void *> output_data_addrs_;
-  vector<void *> workspace_data_addrs_;
-  OpDescPtr op_desc_;
+  vector<void *> io_addrs_;
   void *args_;
   uint32_t args_offset_;
 };
diff --git a/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc b/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc
index e5574e47..98d9cb78 100644
--- a/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc
+++ b/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc
@@ -30,11 +30,7 @@
 namespace ge {
 Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) {
   GELOGI("KernelExTaskInfo Init Start.");
-  if (davinci_model == nullptr) {
-    GELOGE(PARAM_INVALID, "davinci_model is null!");
-    return PARAM_INVALID;
-  }
-
+  GE_CHECK_NOTNULL(davinci_model);
   davinci_model_ = davinci_model;
   Status ret = SetStream(task_def.stream_id(), davinci_model_->GetStreamList());
   if (ret != SUCCESS) {
@@ -51,7 +47,6 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin
     GELOGE(INTERNAL_ERROR, "Init aicpu task info error, index is out of range!");
     return INTERNAL_ERROR;
   }
-  op_desc_ = op_desc;
 
   // 2. Reconstruct kernelExDef.args to STR_FWK_OP_KERNEL
   STR_FWK_OP_KERNEL fwk_op_kernel = {0};
@@ -79,32 +74,27 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin
                     return RT_ERROR_TO_GE_STATUS(rt_ret);)
   }
 
-  GELOGI("Node[%s] type[%s] kernel_ext_info size=%zu, ext_info_addr_=%p", op_desc_->GetName().c_str(),
-         op_desc_->GetType().c_str(), ext_info.size(), ext_info_addr_);
+  GELOGI("Node[%s] type[%s] kernel_ext_info size=%zu, ext_info_addr_=%p", op_desc->GetName().c_str(),
+         op_desc->GetType().c_str(), ext_info.size(), ext_info_addr_);
 
   // 2.1 get loop cond variable for tensor array write
-  uint64_t step_id_addr = 0;
-  OpDescPtr step_id_node = davinci_model_->GetVariableOp(NODE_NAME_GLOBAL_STEP);
-  if (step_id_node != nullptr) {
-    vector<void *> v_step_id_addr = ModelUtils::GetOutputDataAddrs(rts_param, step_id_node);
-    if (!v_step_id_addr.empty()) {
-      step_id_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(v_step_id_addr[0]));
-    }
-  }
+  uint64_t step_id_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(davinci_model_->GetGlobalStep()));
 
   auto session_id = davinci_model_->GetSessionId();
   fwk_op_kernel.fwkKernelBase.fwk_kernel.sessionID = session_id;
 
   // 2.2 Collect aicpu kernel
   uint64_t kernel_id = fwk_op_kernel.fwkKernelBase.fwk_kernel.kernelID;
-  GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuKernel(session_id, davinci_model->Id(), kernel_id) != SUCCESS,
+  GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuKernel(session_id, davinci_model->Id(),
+                                                                 davinci_model->SubModelId(), kernel_id) != SUCCESS,
                   GELOGE(FAILED, "CreateAicpuKernel error.");
                   return FAILED;)
   // 2.3 Create session
   GE_CHECK_NOTNULL(ModelManager::GetInstance());
-  GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuSession(session_id) != SUCCESS,
-                  GELOGE(FAILED, "CreateAicpuSession error. session id: %lu", session_id);
-                  return FAILED;)
+  ret = ModelManager::GetInstance()->CreateAicpuSession(session_id);
+  GE_IF_BOOL_EXEC(ret != SUCCESS,
+                  GELOGE(ret, "CreateAicpuSession error. session id: %lu", session_id);
+                  return ret;)
 
   kernel_buf_size_ = sizeof(STR_FWK_OP_KERNEL);
   if (davinci_model_->IsKnownNode()) {
@@ -132,6 +122,8 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin
     GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMemcpy error, ret: Ox%X", rt_ret);
                     return RT_ERROR_TO_GE_STATUS(rt_ret);)
 
+    SetIoAddrs(op_desc);
+    InitDumpTask(input_output_addr, op_desc);
     GELOGI("KernelExTaskInfo knonw node Init Success.");
     return SUCCESS;
   }
@@ -166,11 +158,7 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin
     GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMemcpy to input_output_addr_ error: 0x%X", rt_ret);
                     return RT_ERROR_TO_GE_STATUS(rt_ret);)
 
-    if (davinci_model_->GetDumpProperties().IsLayerNeedDump(davinci_model_->Name(), davinci_model_->OmName(),
-                                                            op_desc->GetName())) {
-      dump_flag_ = RT_KERNEL_DUMPFLAG;
-      dump_args_ = input_output_addr_;
-    }
+    InitDumpTask(input_output_addr_, op_desc);
     if (davinci_model_->GetOpDugReg()) {
       GELOGI("Op debug is open in kernel ex task info");
       dump_args_ = input_output_addr_;
@@ -195,11 +183,19 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin
                   return RT_ERROR_TO_GE_STATUS(rt_ret);)
 
   davinci_model_->SetZeroCopyAddr(op_desc, io_addrs, io_addrs.data(), input_output_addr_, addrs_size, 0);
-
+  SetIoAddrs(op_desc);
   GELOGI("KernelExTaskInfo Init Success. session id: %lu", session_id);
   return SUCCESS;
 }
 
+void KernelExTaskInfo::InitDumpTask(void *addr, const OpDescPtr &op_desc) {
+  if (davinci_model_->GetDumpProperties().IsLayerNeedDump(davinci_model_->Name(), davinci_model_->OmName(),
+                                                          op_desc->GetName())) {
+    dump_flag_ = RT_KERNEL_DUMPFLAG;
+    dump_args_ = input_output_addr_;
+  }
+}
+
 Status KernelExTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) {
   auto kernel_ex_def = task_def.kernel_ex();
   uint32_t op_index = kernel_ex_def.op_index();
@@ -236,36 +232,38 @@ Status KernelExTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciMod
   return SUCCESS;
 }
 
-Status KernelExTaskInfo::UpdateArgs() {
-  GELOGI("KernelExTaskInfo::UpdateArgs in.");
+void KernelExTaskInfo::SetIoAddrs(const OpDescPtr &op_desc) {
   const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam();
-  vector<void *> input_data_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc_);
-  vector<void *> output_data_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc_);
-  vector<void *> io_addrs;
-  if (!op_desc_->HasAttr(ATTR_DYNAMIC_SHAPE_FIXED_ADDR)) {
-    io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end());
-    io_addrs.insert(io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end());
+  vector<void *> input_data_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc);
+  vector<void *> output_data_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc);
+  if (!op_desc->HasAttr(ATTR_DYNAMIC_SHAPE_FIXED_ADDR)) {
+    io_addrs_.insert(io_addrs_.end(), input_data_addrs.begin(), input_data_addrs.end());
+    io_addrs_.insert(io_addrs_.end(), output_data_addrs.begin(), output_data_addrs.end());
   } else {
     string peer_input_name;
-    if (AttrUtils::GetStr(op_desc_, ATTR_DYNAMIC_SHAPE_FIXED_ADDR, peer_input_name)) {
+    if (AttrUtils::GetStr(op_desc, ATTR_DYNAMIC_SHAPE_FIXED_ADDR, peer_input_name)) {
       uint32_t output_index = davinci_model_->GetFixedAddrOutputIndex(peer_input_name);
       if (output_index > output_data_addrs.size()) {
         GELOGE(FAILED, "The output data addr size[%zu] and output index[%u] are inconsistent.",
                output_data_addrs.size(), output_index);
-        return FAILED;
+        return;
       }
-      io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end());
+      io_addrs_.insert(io_addrs_.end(), input_data_addrs.begin(), input_data_addrs.end());
       for (size_t i = 0; i < output_data_addrs.size(); ++i) {
         if (i == output_index) {
           void *fixed_addr = davinci_model_->GetCurrentFixedAddr(fixed_addr_offset_);
-          io_addrs.emplace_back(fixed_addr);
+          io_addrs_.emplace_back(fixed_addr);
           continue;
         }
-        io_addrs.emplace_back(output_data_addrs[i]);
+        io_addrs_.emplace_back(output_data_addrs[i]);
       }
     }
   }
-  davinci_model_->SetTotalIOAddrs(io_addrs);
+}
+
+Status KernelExTaskInfo::UpdateArgs() {
+  GELOGI("KernelExTaskInfo::UpdateArgs in.");
+  davinci_model_->SetTotalIOAddrs(io_addrs_);
   GELOGI("KernelExTaskInfo::UpdateArgs success.");
   return SUCCESS;
 }
diff --git a/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.h b/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.h
index e4d3e6fd..f6873c6c 100644
--- a/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.h
+++ b/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.h
@@ -59,6 +59,9 @@ class KernelExTaskInfo : public TaskInfo {
   };
  private:
   Status CopyTaskInfo(const domi::KernelExDef &kernel_def, const RuntimeParam &rts_param, const OpDescPtr &op_desc);
+  void SetIoAddrs(const OpDescPtr &op_desc);
+
+  void InitDumpTask(void *addr, const OpDescPtr &op_desc);
 
   uint32_t task_id_;
   uint32_t stream_id_;
@@ -69,7 +72,7 @@ class KernelExTaskInfo : public TaskInfo {
   void *input_output_addr_;
   void *ext_info_addr_;
   void *dump_args_;
-  OpDescPtr op_desc_ = nullptr;
+  vector<void *> io_addrs_;
   uint32_t args_offset_ = 0;
   int64_t fixed_addr_offset_ = 0;
 };
diff --git a/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc b/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc
index 04607c02..83bf2779 100755
--- a/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc
+++ b/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc
@@ -43,12 +43,16 @@ const char *kIsLastNode = "is_last_node";
 const char *kIsFirstNode = "is_first_node";
 const int64_t kCloseSkt = 100;
 const uint32_t kAddrLen = sizeof(void *);
+const int kBaseInt = 10;
+const int kStrtolFail = 0;
+const int kArgsInputDesc = 0;
+const int kArgsInputAddr = 1;
+const int kArgsOutputDesc = 2;
+const int kArgsOutputAddr = 3;
+const int kArgsAttrHandle = 4;
 }  // namespace
 
 namespace ge {
-KernelTaskInfo::SuperKernelTaskInfo KernelTaskInfo::skt_info_ = {
-    0, 0, 0, 0, nullptr, nullptr, {}, {}, {}, {}, {}, RT_KERNEL_DEFAULT, kInvalidGroupKey, 0, nullptr};
-
 Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) {
   GE_CHECK_NOTNULL(davinci_model);
   davinci_model_ = davinci_model;
@@ -66,7 +70,7 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci
   // get opcontext stored in model
   const domi::KernelContext &context = kernel_def.context();
   // get kernel_type
-  kernel_type_ = static_cast<cce::ccKernelType>(context.kernel_type());
+  kernel_type_ = static_cast<ccKernelType>(context.kernel_type());
   // get opdesc
   op_desc_ = davinci_model_->GetOpByIndex(context.op_index());
   GE_CHECK_NOTNULL(op_desc_);
@@ -83,20 +87,18 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci
                   fusion_op_info_.op_index = context.op_index(); fusion_op_info_.original_op_names = original_op_names;
                   fusion_op_info_.op_name = op_desc_->GetName());
 
-  string session_graph_model_id;
-  davinci_model_->GetUniqueId(op_desc_, session_graph_model_id);
-  // get bin_file_key
-  const char *bin_file_key = davinci_model_->GetRegisterStub(op_desc_->GetName(), session_graph_model_id);
   // new aicpu kernel(rtCpuKernelLaunch) no need to check function
-  if (kernel_type_ == cce::ccKernelType::CCE_AI_CORE) {
-    rtError_t rt_ret;
-    rt_ret = rtGetFunctionByName(const_cast<char *>(kernel_def.stub_func().c_str()), &stub_func_);
+  if (kernel_type_ == ccKernelType::CCE_AI_CORE) {
+    rtError_t rt_ret = rtGetFunctionByName(const_cast<char *>(kernel_def.stub_func().c_str()), &stub_func_);
     GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "execute rtGetFunctionByName failed. stub_func: %s",
                                                     kernel_def.stub_func().c_str());
                     return RT_ERROR_TO_GE_STATUS(rt_ret););
-  } else if (kernel_type_ == cce::ccKernelType::TE) {
-    rtError_t rt_ret;
-    rt_ret = rtGetFunctionByName(bin_file_key, &stub_func_);
+  } else if (kernel_type_ == ccKernelType::TE) {
+    // get bin_file_key
+    string session_graph_model_id;
+    davinci_model_->GetUniqueId(op_desc_, session_graph_model_id);
+    const char *bin_file_key = davinci_model_->GetRegisterStub(op_desc_->GetName(), session_graph_model_id);
+    rtError_t rt_ret = rtGetFunctionByName(bin_file_key, &stub_func_);
     GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE,
                     GELOGE(RT_FAILED, "execute rtGetFunctionByName failed. bin_file_key: %s", bin_file_key);
                     return RT_ERROR_TO_GE_STATUS(rt_ret););
@@ -111,7 +113,7 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci
     ctx_.opIndex2[i] = context.origin_op_index(i);
   }
   ctx_.opCount = context.origin_op_index_size();
-  if (kernel_type_ == cce::ccKernelType::TE) {
+  if (kernel_type_ == ccKernelType::TE) {
     ctx_.opIndex = context.op_index();
     uint16_t *args_offset_tmp = reinterpret_cast<uint16_t *>(const_cast<char *>(context.args_offset().data()));
     if (context.args_offset().size() / sizeof(uint16_t) < 1) {
@@ -120,9 +122,9 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci
     }
 
     ret = InitTVMTask(args_offset_tmp[0], kernel_def);
-  } else if (kernel_type_ == cce::ccKernelType::CUSTOMIZED) {
+  } else if (kernel_type_ == ccKernelType::CUSTOMIZED) {
     ret = InitAICPUCustomTask(context.op_index(), kernel_def);
-  } else if (kernel_type_ == cce::ccKernelType::AI_CPU || kernel_type_ == cce::ccKernelType::CUST_AI_CPU) {
+  } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) {
     ret = InitAicpuTask(context.op_index(), kernel_def);
   } else {
     if (kernel_def.args().empty() || args_size_ == 0) {
@@ -132,6 +134,7 @@ Status KernelTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci
     ret = InitCceTask(kernel_def);
   }
 
+  SetIoAddrs(op_desc_);
   GELOGD("KernelTaskInfo init finish, result=%u.", ret);
   return ret;
 }
@@ -143,9 +146,10 @@ Status KernelTaskInfo::SaveSKTDumpInfo() {
     return SUCCESS;
   }
   // all op in super kernel share one taskid and streamid
-  for (size_t i = 0; i < skt_info_.op_desc_list.size(); i++) {
-    davinci_model_->SaveDumpTask(skt_info_.last_task_id, skt_info_.last_stream_id, skt_info_.op_desc_list[i],
-                                 skt_info_.dump_args_list[i]);
+  const SuperKernelTaskInfo &skt_info = davinci_model_->GetSuperKernelTaskInfo();
+  for (size_t i = 0; i < skt_info.op_desc_list.size(); i++) {
+    davinci_model_->SaveDumpTask(skt_info.last_task_id, skt_info.last_stream_id, skt_info.op_desc_list[i],
+                                 skt_info.dump_args_list[i]);
   }
   return SUCCESS;
 }
@@ -159,9 +163,10 @@ void KernelTaskInfo::UpdateSKTTaskId() {
       GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
       return;
     }
-    skt_info_.last_task_id = task_id;
-    skt_info_.last_stream_id = stream_id;
-    skt_id_ = skt_info_.last_task_id;
+    SuperKernelTaskInfo &skt_info = davinci_model_->GetSuperKernelTaskInfo();
+    skt_info.last_task_id = task_id;
+    skt_info.last_stream_id = stream_id;
+    skt_id_ = skt_info.last_task_id;
 
     GELOGI("UpdateTaskId:UpdateSKTTaskId [%u],stream id [%u]", task_id, stream_id);
   }
@@ -186,23 +191,25 @@ Status KernelTaskInfo::SKTFinalize() {
   UpdateSKTTaskId();
   GE_CHK_STATUS_RET(SaveSKTDumpInfo(), "skt save dump info failed");
   GELOGI("SuperKernel Distribute [skt_id:%u]", skt_id_);
-  skt_info_.kernel_list.clear();
-  skt_info_.arg_list.clear();
-  skt_info_.dump_flag_list.clear();
-  skt_info_.op_desc_list.clear();
-  skt_info_.dump_args_list.clear();
-  skt_info_.last_stream = nullptr;
-  skt_info_.last_block_dim = 0;
-  skt_info_.last_sm_desc = sm_desc_;
-  skt_info_.last_group_key = kInvalidGroupKey;
-  skt_info_.last_dump_flag = RT_KERNEL_DEFAULT;
-  skt_info_.last_dump_args = 0;
-  skt_info_.last_op = nullptr;
+  SuperKernelTaskInfo &skt_info = davinci_model_->GetSuperKernelTaskInfo();
+  skt_info.kernel_list.clear();
+  skt_info.arg_list.clear();
+  skt_info.dump_flag_list.clear();
+  skt_info.op_desc_list.clear();
+  skt_info.dump_args_list.clear();
+  skt_info.last_stream = nullptr;
+  skt_info.last_block_dim = 0;
+  skt_info.last_sm_desc = sm_desc_;
+  skt_info.last_group_key = kInvalidGroupKey;
+  skt_info.last_dump_flag = RT_KERNEL_DEFAULT;
+  skt_info.last_dump_args = 0;
+  skt_info.last_op = nullptr;
   return SUCCESS;
 }
 
 uint32_t KernelTaskInfo::GetDumpFlag() {
-  for (auto flag : skt_info_.dump_flag_list) {
+  const SuperKernelTaskInfo &skt_info = davinci_model_->GetSuperKernelTaskInfo();
+  for (auto flag : skt_info.dump_flag_list) {
     if (flag == RT_KERNEL_DUMPFLAG) {
       return RT_KERNEL_DUMPFLAG;
     }
@@ -211,19 +218,20 @@ uint32_t KernelTaskInfo::GetDumpFlag() {
 }
 
 Status KernelTaskInfo::SuperKernelLaunch() {
-  if (skt_info_.kernel_list.empty()) {
+  const SuperKernelTaskInfo &skt_info = davinci_model_->GetSuperKernelTaskInfo();
+  if (skt_info.kernel_list.empty()) {
     GELOGI("SuperKernelLaunch: Skt_kernel_list has no task, just return");
     return SUCCESS;
   }
   rtError_t rt_ret;
-  auto &skt_kernel_list = skt_info_.kernel_list;
-  auto &skt_arg_list = skt_info_.arg_list;
+  auto &skt_kernel_list = skt_info.kernel_list;
+  auto &skt_arg_list = skt_info.arg_list;
   GELOGI("SuperKernelLaunch: Skt_kernel_list size[%zu] skt_arg_list[%zu]", skt_kernel_list.size(), skt_arg_list.size());
   if (skt_kernel_list.size() == kSKTSingleSize && skt_arg_list.size() == kSKTSingleSize) {
-    rt_ret = rtKernelLaunchWithFlag(skt_info_.kernel_list[0], static_cast<uint32_t>(skt_info_.last_block_dim),
-                                    skt_info_.arg_list[0], skt_info_.last_args_size,
-                                    static_cast<rtSmDesc_t *>(skt_info_.last_sm_desc), skt_info_.last_stream,
-                                    skt_info_.last_dump_flag);
+    rt_ret = rtKernelLaunchWithFlag(skt_info.kernel_list[0], static_cast<uint32_t>(skt_info.last_block_dim),
+                                    skt_info.arg_list[0], skt_info.last_args_size,
+                                    static_cast<rtSmDesc_t *>(skt_info.last_sm_desc), skt_info.last_stream,
+                                    skt_info.last_dump_flag);
     if (rt_ret != RT_ERROR_NONE) {
       GELOGE(RT_FAILED, "SuperKernelLaunch: Call rt api failed, ret: 0x%X", rt_ret);
       return RT_ERROR_TO_GE_STATUS(rt_ret);
@@ -242,14 +250,14 @@ Status KernelTaskInfo::SuperKernelLaunch() {
   }
   // Call the fuse API
   std::unique_ptr<skt::SuperKernel> superKernel = nullptr;
-  ge_ret = factory->FuseKernels(skt_kernel_list, skt_arg_list, skt_info_.last_block_dim, superKernel);
+  ge_ret = factory->FuseKernels(skt_kernel_list, skt_arg_list, skt_info.last_block_dim, superKernel);
   if (ge_ret != SUCCESS) {
     GELOGE(ge_ret, "SuperKernelLaunch: fuse call failed");
     return ge_ret;
   }
   // Launch a super kernel
   skt_dump_flag_ = GetDumpFlag();
-  ge_ret = superKernel->Launch(skt_info_.last_stream, skt_dump_flag_);
+  ge_ret = superKernel->Launch(skt_info.last_stream, skt_dump_flag_);
   if (ge_ret != SUCCESS) {
     GELOGE(ge_ret, "SuperKernelLaunch: launch failed");
     return ge_ret;
@@ -264,23 +272,26 @@ Status KernelTaskInfo::SuperKernelLaunch() {
 }
 
 Status KernelTaskInfo::SaveSuperKernelInfo() {
-  skt_info_.kernel_list.push_back(stub_func_);
-  skt_info_.arg_list.push_back(args_);
-  skt_info_.last_stream = stream_;
-  skt_info_.last_block_dim = block_dim_;
-  skt_info_.last_args_size = args_size_;
-  skt_info_.last_sm_desc = sm_desc_;
-  skt_info_.last_dump_flag = dump_flag_;
-  skt_info_.dump_flag_list.push_back(dump_flag_);
-  skt_info_.op_desc_list.push_back(op_desc_);
-  skt_info_.dump_args_list.push_back(reinterpret_cast<uintptr_t>(skt_dump_args_));
-  skt_info_.last_group_key = group_key_;
-  skt_info_.last_dump_args = reinterpret_cast<uintptr_t>(skt_dump_args_);
-  skt_info_.last_op = op_desc_;
+  SuperKernelTaskInfo &skt_info = davinci_model_->GetSuperKernelTaskInfo();
+  skt_info.kernel_list.push_back(stub_func_);
+  skt_info.arg_list.push_back(args_);
+  skt_info.last_stream = stream_;
+  skt_info.last_block_dim = block_dim_;
+  skt_info.last_args_size = args_size_;
+  skt_info.last_sm_desc = sm_desc_;
+  skt_info.last_dump_flag = dump_flag_;
+  skt_info.dump_flag_list.push_back(dump_flag_);
+  skt_info.op_desc_list.push_back(op_desc_);
+  skt_info.dump_args_list.push_back(reinterpret_cast<uintptr_t>(skt_dump_args_));
+  skt_info.last_group_key = group_key_;
+  skt_info.last_dump_args = reinterpret_cast<uintptr_t>(skt_dump_args_);
+  skt_info.last_op = op_desc_;
   // last node in a stream, just launch
   if (IsMarkedLastNode()) {
     return SuperKernelLaunch();
   }
+
+  GELOGI("Save Current task [block_dim:%u, size:%zu].", block_dim_, skt_info.kernel_list.size());
   return SUCCESS;
 }
 
@@ -317,8 +328,9 @@ bool KernelTaskInfo::IsMarkedFirstNode() {
 // then may be saved to skt task list; else
 // call skt launch those saved tasks before
 bool KernelTaskInfo::FirstCallSKTLaunchCheck() {
-  return ((block_dim_ != skt_info_.last_block_dim) || (stream_ != skt_info_.last_stream) ||
-          (has_group_key_ && (group_key_ != skt_info_.last_group_key)));
+  const SuperKernelTaskInfo &skt_info = davinci_model_->GetSuperKernelTaskInfo();
+  return ((block_dim_ != skt_info.last_block_dim) || (stream_ != skt_info.last_stream) ||
+          (has_group_key_ && (group_key_ != skt_info.last_group_key)));
 }
 
 // current task has group_id or has n ATTR_N_BATCH_SPLIT then save it to skt task list; else
@@ -357,7 +369,6 @@ Status KernelTaskInfo::SuperKernelDistribute() {
       GELOGE(ret, "Call SuperKernelLaunch failed!");
       return ret;
     }
-    GELOGI("Save Current task [block_dim:%u, size:%zu].", block_dim_, skt_info_.kernel_list.size());
   }
   return SUCCESS;
 }
@@ -365,15 +376,19 @@ Status KernelTaskInfo::SuperKernelDistribute() {
 Status KernelTaskInfo::Distribute() {
   GELOGD("KernelTaskInfo Distribute Start.");
   if (davinci_model_->IsKnownNode()) {
-    args_ = davinci_model_->GetCurrentArgsAddr(args_offset_);
+    if (kernel_type_ == ccKernelType::TE) {
+      args_ = davinci_model_->GetCurrentArgsAddr(args_offset_);
+    } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) {
+      args_ = davinci_model_->GetCurrentHybridArgsAddr(hybrid_args_offset_);
+    }
     GELOGI("Known node %s args addr %p, offset %u.", op_desc_->GetName().c_str(), args_, args_offset_);
   }
   rtError_t rt_ret = RT_ERROR_NONE;
   char skt_enable_env[MMPA_MAX_PATH] = { 0x00 };
   INT32 res = mmGetEnv("SKT_ENABLE", skt_enable_env, MMPA_MAX_PATH);
-  int64_t env_flag = (res == EN_OK) ? strtol(skt_enable_env, nullptr, 10) : 0;
+  int64_t env_flag = (res == EN_OK) ? strtol(skt_enable_env, nullptr, kBaseInt) : kStrtolFail;
   bool call_skt = ((env_flag != 0) || is_l1_fusion_enable_);
-  if (kernel_type_ == cce::ccKernelType::AI_CPU || kernel_type_ == cce::ccKernelType::CUST_AI_CPU) {
+  if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) {
     GELOGI("distribute task info kernel_type %d, flag %d", kernel_type_, dump_flag_);
     // blockDim is reserved parameter, set to 1
     rt_ret = rtCpuKernelLaunchWithFlag(reinterpret_cast<const void *>(so_name_.c_str()),
@@ -382,10 +397,11 @@ Status KernelTaskInfo::Distribute() {
     call_save_dump_ = true;
   } else {
     /* default: not skt launch */
+    const SuperKernelTaskInfo &skt_info = davinci_model_->GetSuperKernelTaskInfo();
     GELOGD(
         "KernelTaskInfo Distribute Start, sktenable:%d taskid:%u sktid:%u last_sktid:%u stubfunc_name:%s "
         "stubfunc:%p blockdim:%u stream:%p",
-        call_skt, task_id_, skt_id_, skt_info_.last_task_id, stub_func_name_.c_str(), stub_func_, block_dim_, stream_);
+        call_skt, task_id_, skt_id_, skt_info.last_task_id, stub_func_name_.c_str(), stub_func_, block_dim_, stream_);
     // l1 fusion enable and env flag open (kCloseSkt for skt debug)
     bool open_dump = false;
     auto all_dump_model = davinci_model_->GetDumpProperties().GetAllDumpModel();
@@ -413,44 +429,45 @@ Status KernelTaskInfo::Distribute() {
       "KernelTaskInfo Distribute Success. sktenable:%d taskid:%d sktid:%d stubfunc_name:%s stubfunc:%p "
       "blockdim:%d stream:%p",
       call_skt, task_id_, skt_id_, stub_func_name_.c_str(), stub_func_, block_dim_, stream_);
+  op_desc_.reset(); // Not hold OpDesc after distribute.
   return SUCCESS;
 }
 
-Status KernelTaskInfo::UpdateArgs() {
-  GELOGI("KernelTaskInfo::UpdateArgs in.");
+void KernelTaskInfo::SetIoAddrs(const OpDescPtr &op_desc) {
   const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam();
-  vector<void *> input_data_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc_);
-  vector<void *> output_data_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc_);
-  vector<void *> workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc_);
+  vector<void *> input_data_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc);
+  vector<void *> output_data_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc);
 
-  vector<void *> io_addrs;
-  if (!op_desc_->HasAttr(ATTR_DYNAMIC_SHAPE_FIXED_ADDR)) {
-    io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end());
-    io_addrs.insert(io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end());
-    io_addrs.insert(io_addrs.end(), workspace_data_addrs.begin(), workspace_data_addrs.end());
-  } else {
-    string peer_input_name;
-    if (AttrUtils::GetStr(op_desc_, ATTR_DYNAMIC_SHAPE_FIXED_ADDR, peer_input_name)) {
-      uint32_t output_index = davinci_model_->GetFixedAddrOutputIndex(peer_input_name);
-      if (output_index > output_data_addrs.size()) {
-        GELOGE(FAILED, "The output data addr size[%zu] and output index[%u] are inconsistent.",
-               output_data_addrs.size(), output_index);
-        return FAILED;
-      }
-      io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end());
-      for (size_t i = 0; i < output_data_addrs.size(); ++i) {
-        if (i == output_index) {
-          void *fixed_addr = davinci_model_->GetCurrentFixedAddr(fixed_addr_offset_);
-          io_addrs.emplace_back(fixed_addr);
-          continue;
-        }
-        io_addrs.emplace_back(output_data_addrs[i]);
-      }
-      io_addrs.insert(io_addrs.end(), workspace_data_addrs.begin(), workspace_data_addrs.end());
+  io_addrs_.insert(io_addrs_.end(), input_data_addrs.begin(), input_data_addrs.end());
+  io_addrs_.insert(io_addrs_.end(), output_data_addrs.begin(), output_data_addrs.end());
+  if (kernel_type_ == ccKernelType::TE) {
+    vector<void *> workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc);
+    io_addrs_.insert(io_addrs_.end(), workspace_data_addrs.begin(), workspace_data_addrs.end());
+  }
+}
+
+Status KernelTaskInfo::UpdateArgs() {
+  GELOGI("KernelTaskInfo::UpdateArgs in.");
+  if (kernel_type_ == ccKernelType::TE) {
+    davinci_model_->SetTotalIOAddrs(io_addrs_);
+  } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) {
+    vector<void *> io_addrs = io_addrs_;
+    davinci_model_->UpdateKnownZeroCopyAddr(io_addrs);
+    uintptr_t io_addr = reinterpret_cast<uintptr_t>(args_addr.get()) + sizeof(aicpu::AicpuParamHead);
+    auto addrs_size = sizeof(uint64_t) * io_addrs.size();
+    errno_t sec_ret = memcpy_s(reinterpret_cast<void *>(io_addr), addrs_size, io_addrs.data(), addrs_size);
+    if (sec_ret != EOK) {
+      GELOGE(FAILED, "memcpy failed, ret: %d", sec_ret);
+      return FAILED;
+    }
+    // copy args to device
+    rtError_t rt_ret = rtMemcpy(args_, args_size_, args_addr.get(), args_size_, RT_MEMCPY_HOST_TO_DEVICE);
+    if (rt_ret != RT_ERROR_NONE) {
+      GELOGE(RT_FAILED, "Call rt api(rtMemcpy) failed, ret: 0x%X", rt_ret);
+      return RT_ERROR_TO_GE_STATUS(rt_ret);
     }
   }
 
-  davinci_model_->SetTotalIOAddrs(io_addrs);
   GELOGI("KernelTaskInfo::UpdateArgs success.");
   return SUCCESS;
 }
@@ -526,33 +543,18 @@ Status KernelTaskInfo::UpdateL2Data(const domi::KernelDef &kernel_def) {
 }
 
 Status KernelTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) {
-  domi::KernelDef kernel_def = task_def.kernel();
-  uint32_t args_size = kernel_def.args_size();
-  args_offset_ = davinci_model->GetTotalArgsSize();
-  davinci_model->SetTotalArgsSize(args_size);
-  GELOGI("kernel task name , args_size %u, args_offset %u", args_size, args_offset_);
-
-  // get opcontext stored in model
+  const domi::KernelDef &kernel_def = task_def.kernel();
   const domi::KernelContext &context = kernel_def.context();
-  // get opdesc
-  op_desc_ = davinci_model->GetOpByIndex(context.op_index());
-  GE_CHECK_NOTNULL(op_desc_);
-  // alloc fixed addr
-  string peer_input_name;
-  if (AttrUtils::GetStr(op_desc_, ATTR_DYNAMIC_SHAPE_FIXED_ADDR, peer_input_name) && !peer_input_name.empty()) {
-    uint32_t output_index = davinci_model->GetFixedAddrOutputIndex(peer_input_name);
-    if (output_index > op_desc_->GetOutputsSize()) {
-      GELOGE(FAILED, "The output size[%zu] and output index[%u] are inconsistent.", op_desc_->GetOutputsSize(),
-             output_index);
-      return FAILED;
-    }
-    fixed_addr_offset_ = davinci_model->GetFixedAddrsSize(peer_input_name);
-    auto tensor_desc = op_desc_->GetOutputDesc(output_index);
-    int64_t tensor_size = 0;
-    GE_CHK_STATUS(TensorUtils::GetSize(tensor_desc, tensor_size));
-    davinci_model->SetTotalFixedAddrsSize(peer_input_name, tensor_size);
-    GELOGI("Calculate stream switch task args , tensor size is %ld, fixed addr offset %ld", tensor_size,
-           fixed_addr_offset_);
+  kernel_type_ = static_cast<ccKernelType>(context.kernel_type());
+  if (kernel_type_ == ccKernelType::TE) {
+    uint32_t args_size = kernel_def.args_size();
+    args_offset_ = davinci_model->GetTotalArgsSize();
+    davinci_model->SetTotalArgsSize(args_size);
+    GELOGI("kernel task name , args_size %u, args_offset %u", args_size, args_offset_);
+  } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) {
+    hybrid_args_offset_ = davinci_model->GetHybridArgsSize();
+    davinci_model->SetHybridArgsSize(kernel_def.args_size());
+    GELOGI("aicpu kernel task name , args_size %u, args_offset %u", kernel_def.args_size(), hybrid_args_offset_);
   }
   return SUCCESS;
 }
@@ -564,6 +566,8 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne
   OpDescPtr op_desc = davinci_model_->GetOpByIndex(ctx_.opIndex);
   GE_CHECK_NOTNULL(op_desc);
   if (davinci_model_->IsKnownNode()) {
+    args_ = davinci_model_->GetCurrentArgsAddr(args_offset_);
+    InitDumpTask(offset);
     return SUCCESS;
   }
 
@@ -628,15 +632,7 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne
     return FAILED;
   }
   skt_dump_args_ = static_cast<char *>(args_) + offset;
-  if (davinci_model_->GetDumpProperties().IsLayerNeedDump(davinci_model_->Name(), davinci_model_->OmName(),
-                                                          op_desc->GetName())) {
-    if (IsL1FusionOp(op_desc)) {
-      dump_flag_ = RT_FUSION_KERNEL_DUMPFLAG;
-    } else {
-      dump_flag_ = RT_KERNEL_DUMPFLAG;
-    }
-    dump_args_ = static_cast<char *>(args_) + offset;
-  }
+  InitDumpTask(offset);
 
   GE_CHK_BOOL_TRUE_EXEC_INFO(davinci_model_->GetOpDugReg(), dump_args_ = static_cast<char *>(args_) + offset,
                              "Op debug is open in TVM task info");
@@ -749,15 +745,15 @@ Status KernelTaskInfo::InitAICPUCustomTask(uint32_t op_index, const domi::Kernel
       return FAILED;
     }
   }
-  *(reinterpret_cast<uint64_t *>(args + ctx_.argsOffset[0])) =
+  *(reinterpret_cast<uint64_t *>(args + ctx_.argsOffset[kArgsInputDesc])) =
       static_cast<uint64_t>(reinterpret_cast<uintptr_t>(custom_info_.input_descs));  // arg 0
-  *(reinterpret_cast<uint64_t *>(args + ctx_.argsOffset[1])) =
+  *(reinterpret_cast<uint64_t *>(args + ctx_.argsOffset[kArgsInputAddr])) =
       static_cast<uint64_t>(reinterpret_cast<uintptr_t>(custom_info_.input_addrs));  // arg 1
-  *(reinterpret_cast<uint64_t *>(args + ctx_.argsOffset[2])) =
+  *(reinterpret_cast<uint64_t *>(args + ctx_.argsOffset[kArgsOutputDesc])) =
       static_cast<uint64_t>(reinterpret_cast<uintptr_t>(custom_info_.output_descs));  // arg 2
-  *(reinterpret_cast<uint64_t *>(args + ctx_.argsOffset[3])) =
+  *(reinterpret_cast<uint64_t *>(args + ctx_.argsOffset[kArgsOutputAddr])) =
       static_cast<uint64_t>(reinterpret_cast<uintptr_t>(custom_info_.output_addrs));  // arg 3
-  *(reinterpret_cast<uint64_t *>(args + ctx_.argsOffset[4])) =
+  *(reinterpret_cast<uint64_t *>(args + ctx_.argsOffset[kArgsAttrHandle])) =
       static_cast<uint64_t>(reinterpret_cast<uintptr_t>(custom_info_.attr_handle));  // arg 4
 
   rt_ret = rtMalloc(&args_, args_size_, RT_MEMORY_HBM);
@@ -800,7 +796,6 @@ Status KernelTaskInfo::InitCceTask(const domi::KernelDef &kernel_def) {
       GELOGE(FAILED, "flowtable is null.");
       return FAILED;
     }
-    flowtable_size_ = flowtable.size();
   }
 
   // get smDesc stored in model
@@ -865,21 +860,23 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k
   GELOGI("Do InitAicpuTask");
   so_name_ = kernel_def.so_name();
   kernel_name_ = kernel_def.kernel_name();
-  GELOGI("node[%s] test so name %s, kernel name %s", op_desc_->GetName().c_str(), so_name_.c_str(),
-         kernel_name_.c_str());
 
   OpDescPtr op_desc = davinci_model_->GetOpByIndex(op_index);
   if (op_desc == nullptr) {
     GELOGE(INTERNAL_ERROR, "index is out of range, index: %u", op_index);
     return INTERNAL_ERROR;
   }
+  GELOGI("node[%s] test so name %s, kernel name %s", op_desc->GetName().c_str(), so_name_.c_str(),
+         kernel_name_.c_str());
 
-  if (kernel_type_ == cce::ccKernelType::CUST_AI_CPU) {
-    GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc, so_name_), "launch cust aicpu so failed");
+  if (kernel_type_ == ccKernelType::CUST_AI_CPU) {
+    bool loaded = false;
+    GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc, so_name_, loaded),
+                      "launch cust aicpu so failed");
   }
 
   // copy args to new host memory
-  std::unique_ptr<uint8_t[]> args_addr(new (std::nothrow) uint8_t[args_size_]);
+  args_addr = std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[args_size_]);
   GE_PRINT_DYNAMIC_MEMORY(new, "cce task physical memory.", sizeof(uint8_t) * args_size_)
   errno_t sec_ret = memcpy_s(args_addr.get(), args_size_, kernel_def.args().data(), args_size_);
   if (sec_ret != EOK) {
@@ -887,8 +884,25 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k
     return FAILED;
   }
 
-  const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam();
+  auto aicpu_param_head = reinterpret_cast<aicpu::AicpuParamHead *>(args_addr.get());
+  const auto &ext_info = kernel_def.kernel_ext_info();
+  auto init_ret = InitAicpuTaskExtInfo(ext_info);
+  if (init_ret != SUCCESS) {
+    GELOGE(init_ret, "Init aicpu task ext info failed, ext_info size=%zu", ext_info.size());
+    return init_ret;
+  }
+  GELOGI("Node[%s] type[%s] kernel_ext_info size=%zu, aicpu_ext_info_addr_=%p", op_desc->GetName().c_str(),
+         op_desc->GetType().c_str(), ext_info.size(), aicpu_ext_info_addr_);
 
+  aicpu_param_head->extInfoAddr = reinterpret_cast<uintptr_t>(aicpu_ext_info_addr_);
+  aicpu_param_head->extInfoLength = static_cast<uintptr_t>(ext_info.size());
+
+  if (davinci_model_->IsKnownNode()) {
+    args_ = davinci_model_->GetCurrentHybridArgsAddr(hybrid_args_offset_);
+    InitDumpTask(sizeof(aicpu::AicpuParamHead));
+    return SUCCESS;
+  }
+  const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam();
   vector<void *> input_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc);
   vector<void *> output_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc);
   vector<void *> io_addrs;
@@ -905,19 +919,6 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k
     }
   }
 
-  auto aicpu_param_head = reinterpret_cast<aicpu::AicpuParamHead *>(args_addr.get());
-  const auto &ext_info = kernel_def.kernel_ext_info();
-  auto init_ret = InitAicpuTaskExtInfo(ext_info);
-  if (init_ret != SUCCESS) {
-    GELOGE(init_ret, "Init aicpu task ext info failed, ext_info size=%zu", ext_info.size());
-    return init_ret;
-  }
-  GELOGI("Node[%s] type[%s] kernel_ext_info size=%zu, aicpu_ext_info_addr_=%p", op_desc_->GetName().c_str(),
-         op_desc_->GetType().c_str(), ext_info.size(), aicpu_ext_info_addr_);
-
-  aicpu_param_head->extInfoAddr = reinterpret_cast<uintptr_t>(aicpu_ext_info_addr_);
-  aicpu_param_head->extInfoLength = static_cast<uintptr_t>(ext_info.size());
-
   // malloc device memory for args
   rtError_t rt_ret = rtMalloc(static_cast<void **>(&args_), args_size_, RT_MEMORY_HBM);
   if (rt_ret != RT_ERROR_NONE) {
@@ -932,21 +933,12 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k
     GELOGE(RT_FAILED, "Call rt api(rtMemcpy) failed, ret: 0x%X", rt_ret);
     return RT_ERROR_TO_GE_STATUS(rt_ret);
   }
-
-  if (davinci_model_->GetDumpProperties().IsLayerNeedDump(davinci_model_->Name(), davinci_model_->OmName(),
-                                                          op_desc->GetName())) {
-    if (IsL1FusionOp(op_desc)) {
-      dump_flag_ = RT_FUSION_KERNEL_DUMPFLAG;
-    } else {
-      dump_flag_ = RT_KERNEL_DUMPFLAG;
-    }
-    dump_args_ = static_cast<char *>(args_) + sizeof(aicpu::AicpuParamHead);
-  }
+  InitDumpTask(sizeof(aicpu::AicpuParamHead));
   if (davinci_model_->GetOpDugReg()) {
     GELOGI("Op debug is open in aicpu task info");
     dump_args_ = static_cast<char *>(args_) + sizeof(aicpu::AicpuParamHead);
   }
-  if (kernel_type_ == cce::ccKernelType::CUST_AI_CPU) {
+  if (kernel_type_ == ccKernelType::CUST_AI_CPU) {
     dump_flag_ |= RT_KERNEL_CUSTOM_AICPU;
   }
 
@@ -955,6 +947,18 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k
   return SUCCESS;
 }
 
+void KernelTaskInfo::InitDumpTask(uint32_t offset) {
+  if (davinci_model_->GetDumpProperties().IsLayerNeedDump(davinci_model_->Name(), davinci_model_->OmName(),
+                                                          op_desc_->GetName())) {
+    if (IsL1FusionOp(op_desc_)) {
+      dump_flag_ = RT_FUSION_KERNEL_DUMPFLAG;
+    } else {
+      dump_flag_ = RT_KERNEL_DUMPFLAG;
+    }
+    dump_args_ = static_cast<char *>(args_) + offset;
+  }
+}
+
 Status KernelTaskInfo::InitAicpuTaskExtInfo(const std::string &ext_info) {
   if (ext_info.empty()) {
     return SUCCESS;
@@ -1076,7 +1080,7 @@ Status KernelTaskInfo::StoreInputOutputTensor(const std::vector<void *> &input_d
 
 Status KernelTaskInfo::SetContext(const domi::KernelDef &kernel_def) {
   const domi::KernelContext &context = kernel_def.context();
-  ctx_.kernelType = static_cast<cce::ccKernelType>(context.kernel_type());
+  ctx_.kernelType = static_cast<ccKernelType>(context.kernel_type());
   ctx_.opId = context.op_id();
   ctx_.kernelFuncId = context.kernel_func_id();
   ctx_.isFlowtable = context.is_flowtable();
@@ -1161,10 +1165,10 @@ Status KernelTaskInfo::CceUpdateKernelArgs(const domi::KernelContext &context, u
     GELOGE(GE_PLGMGR_SO_NOT_EXIST, "Failed in dlopen %s! ", error);
     return FAILED;
   }
-  cce::ccStatus_t cc_ret;
+  ccStatus_t cc_ret;
   std::string update_kernel_args = "ccUpdateKernelArgs";
-  auto cceUpdateKernelArgs = (cce::ccStatus_t(*)(cce::ccOpContext &, uint64_t, uint64_t, uint64_t, void *, uint64_t,
-                                                 void *))mmDlsym(handle, const_cast<char *>(update_kernel_args.c_str()));
+  auto cceUpdateKernelArgs = (ccStatus_t(*)(ccOpContext &, uint64_t, uint64_t,
+      uint64_t, void *, uint64_t, void *))mmDlsym(handle, const_cast<char *>(update_kernel_args.c_str()));
   if (cceUpdateKernelArgs == nullptr) {
     GELOGE(FAILED, "Failed to invoke function ccUpdateKernelArgs");
     if (mmDlclose(handle) != 0) {
@@ -1189,7 +1193,7 @@ Status KernelTaskInfo::CceUpdateKernelArgs(const domi::KernelContext &context, u
     GELOGW("Failed to close handle %s", error);
     return FAILED;
   }
-  if (cc_ret != cce::CC_STATUS_SUCCESS) {
+  if (cc_ret != CC_STATUS_SUCCESS) {
     GELOGE(CCE_FAILED, "Call cce api failed, ret: 0x%X", cc_ret);
     return CCE_FAILED;
   }
diff --git a/ge/graph/load/new_model_manager/task_info/kernel_task_info.h b/ge/graph/load/new_model_manager/task_info/kernel_task_info.h
index f2945b0b..cea25320 100644
--- a/ge/graph/load/new_model_manager/task_info/kernel_task_info.h
+++ b/ge/graph/load/new_model_manager/task_info/kernel_task_info.h
@@ -38,12 +38,11 @@ class KernelTaskInfo : public TaskInfo {
         flowtable_(nullptr),
         block_dim_(0),
         args_size_(0),
-        flowtable_size_(0),
         task_id_(0),
         stream_id_(0),
         so_name_(""),
         kernel_name_(""),
-        kernel_type_(cce::ccKernelType::CCE_AI_CORE),
+        kernel_type_(ccKernelType::CCE_AI_CORE),
         dump_flag_(RT_KERNEL_DEFAULT),
         dump_args_(nullptr),
         op_desc_(nullptr),
@@ -75,7 +74,7 @@ class KernelTaskInfo : public TaskInfo {
 
   Status Release() override;
 
-  cce::ccOpContext *GetCtx() override { return &ctx_; }
+  ccOpContext *GetCtx() override { return &ctx_; }
 
   FusionOpInfo *GetFusionOpInfo() override { return &fusion_op_info_; }
 
@@ -92,7 +91,7 @@ class KernelTaskInfo : public TaskInfo {
 
   bool CallSaveDumpInfo() override  { return call_save_dump_; };
 
-  cce::ccOpContext ctx_;
+  ccOpContext ctx_;
   FusionOpInfo fusion_op_info_;
 
  private:
@@ -128,6 +127,8 @@ class KernelTaskInfo : public TaskInfo {
 
   Status SuperKernelDistribute();
   bool IsL1FusionOp(const OpDescPtr &op_desc);
+  void SetIoAddrs(const OpDescPtr &op_desc);
+  void InitDumpTask(uint32_t offset);
 
   // For super kernel
   Status SaveSKTDumpInfo();
@@ -148,18 +149,20 @@ class KernelTaskInfo : public TaskInfo {
   void *flowtable_;
   uint32_t block_dim_;
   uint32_t args_size_;
-  uint32_t flowtable_size_;
   uint32_t task_id_;
   uint32_t stream_id_;
   std::string so_name_;
   std::string kernel_name_;
-  cce::ccKernelType kernel_type_;
+  ccKernelType kernel_type_;
   uint32_t dump_flag_;
   void *dump_args_;
-  OpDescPtr op_desc_;
+  OpDescPtr op_desc_;   // Clear after distribute.
+  vector<void *> io_addrs_;
   DavinciModel *davinci_model_;
   uint32_t args_offset_ = 0;
+  uint32_t hybrid_args_offset_ = 0;
   int64_t fixed_addr_offset_ = 0;
+  std::unique_ptr<uint8_t[]> args_addr = nullptr;
   bool call_save_dump_ = false;
 
   // aicpu ext_info device mem
@@ -184,25 +187,6 @@ class KernelTaskInfo : public TaskInfo {
     void *output_addrs = nullptr;
     void *attr_handle = nullptr;
   } custom_info_;
-
-  // For super kernel
-  static struct SuperKernelTaskInfo {
-    uint32_t last_block_dim;
-    uint32_t last_args_size;
-    uint32_t last_task_id;
-    uint32_t last_stream_id;
-    void *last_stream;
-    void *last_sm_desc;
-    std::vector<void *> kernel_list;
-    std::vector<void *> arg_list;
-    std::vector<uint32_t> dump_flag_list;
-    std::vector<OpDescPtr> op_desc_list;
-    std::vector<uintptr_t> dump_args_list;
-    uint32_t last_dump_flag;
-    int64_t last_group_key;
-    uintptr_t last_dump_args;
-    OpDescPtr last_op;
-  } skt_info_;
 };
 }  // namespace ge
 #endif  // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_KERNEL_TASK_INFO_H_
diff --git a/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc b/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc
index aa37dd07..fa320d81 100755
--- a/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc
+++ b/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc
@@ -30,14 +30,13 @@ Status MemcpyAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *da
     return ret;
   }
 
-  memcpy_async_ = task_def.memcpy_async();
-  count_ = memcpy_async_.count();
-  kind_ = memcpy_async_.kind();
-  dst_max_ = memcpy_async_.dst_max();
-  OpDescPtr op_desc = davinci_model_->GetOpByIndex(memcpy_async_.op_index());
-  op_desc_ = op_desc;
+  const domi::MemcpyAsyncDef &memcpy_async = task_def.memcpy_async();
+  count_ = memcpy_async.count();
+  kind_ = memcpy_async.kind();
+  dst_max_ = memcpy_async.dst_max();
+  OpDescPtr op_desc = davinci_model_->GetOpByIndex(memcpy_async.op_index());
   if (op_desc == nullptr) {
-    GELOGE(INTERNAL_ERROR, "Task op index:%u out of range", memcpy_async_.op_index());
+    GELOGE(INTERNAL_ERROR, "Task op index:%u out of range", memcpy_async.op_index());
     return INTERNAL_ERROR;
   }
 
@@ -46,13 +45,14 @@ Status MemcpyAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *da
     dst_ = reinterpret_cast<uint8_t *>(reinterpret_cast<uintptr_t>(src_) + sizeof(void *));
     // for zero copy
     kind_ = RT_MEMCPY_ADDR_DEVICE_TO_DEVICE;
+    GE_CHK_STATUS_RET(SetIoAddrs(op_desc, memcpy_async), "Set addrs failed");
     GELOGI("MemcpyAsyncTaskInfo op name %s, src_ %p, dst_ %p, args_offset %u.",
            op_desc->GetName().c_str(), src_, dst_, args_offset_);
     return SUCCESS;
   }
 
   const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam();
-  ret = ModelUtils::GetRtAddress(rts_param, memcpy_async_.src(), src_);
+  ret = ModelUtils::GetRtAddress(rts_param, memcpy_async.src(), src_);
   if (ret != SUCCESS) {
     return ret;
   }
@@ -61,23 +61,23 @@ Status MemcpyAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *da
   vector<int64_t> memory_type_list;
   (void)AttrUtils::GetListInt(op_desc, ATTR_NAME_OUTPUT_MEM_TYPE_LIST, memory_type_list);
   if (!memory_type_list.empty() && memory_type_list[0] == RT_MEMORY_TS_4G) {  // TS Feature, Just one.
-    uint64_t mem_offset = memcpy_async_.dst() - rts_param.logic_mem_base;
-    dst_ = static_cast<uint8_t *>(rts_param.ts_mem_mall->Acquire(mem_offset, memcpy_async_.dst_max()));
+    uint64_t mem_offset = memcpy_async.dst() - rts_param.logic_mem_base;
+    dst_ = static_cast<uint8_t *>(rts_param.ts_mem_mall->Acquire(mem_offset, memcpy_async.dst_max()));
     if (dst_ == nullptr) {
       return FAILED;
     }
   } else {
-    ret = ModelUtils::GetRtAddress(rts_param, memcpy_async_.dst(), dst_);
+    ret = ModelUtils::GetRtAddress(rts_param, memcpy_async.dst(), dst_);
     if (ret != SUCCESS) {
       return ret;
     }
   }
 
-  GELOGI("MemcpyAsyncTaskInfo Init Success, logic[0x%lx, 0x%lx], src:%p, dst:%p, max:%lu, count:%lu",
-         memcpy_async_.src(), memcpy_async_.dst(), src_, dst_, dst_max_, count_);
-
   davinci_model_->DisableZeroCopy(src_);
   davinci_model_->DisableZeroCopy(dst_);
+  GE_CHK_STATUS_RET(SetIoAddrs(op_desc, memcpy_async), "Set addrs failed");
+  GELOGI("MemcpyAsyncTaskInfo Init Success, logic[0x%lx, 0x%lx], src:%p, dst:%p, max:%lu, count:%lu",
+         memcpy_async.src(), memcpy_async.dst(), src_, dst_, dst_max_, count_);
   return SUCCESS;
 }
 
@@ -115,29 +115,33 @@ Status MemcpyAsyncTaskInfo::CalculateArgs(const domi::TaskDef &task_def, Davinci
   return SUCCESS;
 }
 
-Status MemcpyAsyncTaskInfo::UpdateArgs() {
-  GELOGI("MemcpyAsyncTaskInfo::UpdateArgs in.");
-  GE_CHECK_NOTNULL(davinci_model_);
-  Status ret = ModelUtils::GetRtAddress(davinci_model_->GetRuntimeParam(), memcpy_async_.src(), src_);
-  if (ret != SUCCESS) {
-    return ret;
-  }
-
-  ret = ModelUtils::GetRtAddress(davinci_model_->GetRuntimeParam(), memcpy_async_.dst(), dst_);
+Status MemcpyAsyncTaskInfo::SetIoAddrs(const OpDescPtr &op_desc, const domi::MemcpyAsyncDef &memcpy_async) {
+  uint8_t *src = nullptr;
+  Status ret = ModelUtils::GetRtAddress(davinci_model_->GetRuntimeParam(), memcpy_async.src(), src);
   if (ret != SUCCESS) {
     return ret;
   }
+  io_addrs_.emplace_back(reinterpret_cast<void *>(src));
 
-  vector<void *> io_addrs;
-  io_addrs.emplace_back(reinterpret_cast<void *>(src_));
-  if (op_desc_->HasAttr(ATTR_DYNAMIC_SHAPE_FIXED_ADDR)) {
+  if (op_desc->HasAttr(ATTR_DYNAMIC_SHAPE_FIXED_ADDR)) {
     void *fixed_addr = davinci_model_->GetCurrentFixedAddr(fixed_addr_offset_);
-    io_addrs.emplace_back(fixed_addr);
+    io_addrs_.emplace_back(fixed_addr);
   } else {
-    io_addrs.emplace_back(reinterpret_cast<void *>(dst_));
+    uint8_t *dst = nullptr;
+    ret = ModelUtils::GetRtAddress(davinci_model_->GetRuntimeParam(), memcpy_async.dst(), dst);
+    if (ret != SUCCESS) {
+      return ret;
+    }
+    io_addrs_.emplace_back(reinterpret_cast<void *>(dst));
   }
-  davinci_model_->SetTotalIOAddrs(io_addrs);
 
+  return SUCCESS;
+}
+
+Status MemcpyAsyncTaskInfo::UpdateArgs() {
+  GELOGI("MemcpyAsyncTaskInfo::UpdateArgs in.");
+  GE_CHECK_NOTNULL(davinci_model_);
+  davinci_model_->SetTotalIOAddrs(io_addrs_);
   GELOGI("MemcpyAsyncTaskInfo::UpdateArgs success.");
   return SUCCESS;
 }
diff --git a/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.h b/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.h
index 7e74ab6f..43b5ba13 100755
--- a/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.h
+++ b/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.h
@@ -39,16 +39,17 @@ class MemcpyAsyncTaskInfo : public TaskInfo {
   Status CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) override;
 
  private:
+  Status SetIoAddrs(const OpDescPtr &op_desc, const domi::MemcpyAsyncDef &memcpy_async);
+
   uint8_t *dst_;
   uint64_t dst_max_;
   uint8_t *src_;
   uint64_t count_;
   uint32_t kind_;
-  OpDescPtr op_desc_;
+  vector<void *> io_addrs_;
   int64_t fixed_addr_offset_;
   DavinciModel *davinci_model_ = nullptr;
   uint32_t args_offset_ = 0;
-  domi::MemcpyAsyncDef memcpy_async_;
 };
 }  // namespace ge
 #endif  // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_MEMCPY_ASYNC_TASK_INFO_H_
diff --git a/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.h b/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.h
index 89642cf8..a72d7de2 100755
--- a/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.h
+++ b/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.h
@@ -41,7 +41,7 @@ class StreamSwitchTaskInfo : public TaskInfo {
 
   Status CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) override;
  private:
-  void SetInputAndValuePtr(DavinciModel *davinci_model, const vector<void *> &input_data_addrs);
+  void SetInputAndValuePtr(DavinciModel *davinci_model, const std::vector<void *> &input_data_addrs);
   void *input_ptr_;
   rtCondition_t cond_;
   void *value_ptr_;
@@ -49,7 +49,7 @@ class StreamSwitchTaskInfo : public TaskInfo {
   uint32_t true_stream_id_;
   rtSwitchDataType_t data_type_;
   static const uint32_t kInputNum = 2;
-  vector<int64_t> fixed_addr_offset_;
+  std::vector<int64_t> fixed_addr_offset_;
 };
 }  // namespace ge
 #endif  // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_STREAM_SWITCH_TASK_INFO_H_
diff --git a/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc b/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc
index 63f29f84..65dca3b3 100644
--- a/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc
+++ b/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc
@@ -25,10 +25,11 @@ Status SuperKernel::Launch(rtStream_t stream, uint32_t dump_flag) {
   const void *args[] = {this->GetNavTablePtr(),
                         reinterpret_cast<const void *>(static_cast<uintptr_t>(this->GetNavTableSize()))};
 
-  rtError_t rt_ret = rtMalloc((void **)&(device_args_addr_), sizeof(args), RT_MEMORY_HBM);
-  GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMalloc failied. error: 0x%X", rt_ret); return
-                  RT_ERROR_TO_GE_STATUS(rt_ret);)
-  rt_ret = rtMemcpy((void *)device_args_addr_, sizeof(args), (void *)args, sizeof(args), RT_MEMCPY_HOST_TO_DEVICE);
+  rtError_t rt_ret = rtMalloc(reinterpret_cast<void **>(&device_args_addr_), sizeof(args), RT_MEMORY_HBM);
+  GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMalloc failied. error: 0x%X", rt_ret);
+                  return RT_ERROR_TO_GE_STATUS(rt_ret);)
+  rt_ret = rtMemcpy(reinterpret_cast<void *>(device_args_addr_), sizeof(args), reinterpret_cast<void *>(args),
+                    sizeof(args), RT_MEMCPY_HOST_TO_DEVICE);
   GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMemcpy failied. error: 0x%X", rt_ret);
                   return RT_ERROR_TO_GE_STATUS(rt_ret);)
   rt_ret = rtKernelLaunchWithFlag((void *const)func_stub_, block_dim_, device_args_addr_, sizeof(args), NULL, stream,
diff --git a/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc b/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc
index 69f7b159..4e22cd7c 100644
--- a/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc
+++ b/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc
@@ -19,6 +19,8 @@
 
 namespace ge {
 namespace skt {
+const size_t kFusedKernelMinimumSize = 2;
+const size_t kFusedKernelSizeUnit = 2;
 SuperKernelFactory &SuperKernelFactory::GetInstance() {
   static SuperKernelFactory factory;
   return factory;
@@ -79,17 +81,17 @@ Status SuperKernelFactory::FuseKernels(const std::vector<void *> &stub_func_list
     return FAILED;
   }
 
-  if (super_kernel_size < 2) {
+  if (super_kernel_size < kFusedKernelMinimumSize) {
     GELOGW(
       "SKT: the number of kernels being fused must be greater than or "
       "equal to 2");
     return FAILED;
   }
   GELOGI("SKT: superkernel start fuse, superkernel size %zu.", stub_func_list.size());
-  const size_t nav_table_len = 2 * stub_func_list.size();
+  const size_t nav_table_len = kFusedKernelSizeUnit * stub_func_list.size();
   std::unique_ptr<uint64_t[]> nav_table(new(std::nothrow) uint64_t[nav_table_len]);
   GE_CHECK_NOTNULL(nav_table);
-  uint64_t nav_table_size = 2 * stub_func_list.size() * sizeof(int64_t);
+  uint64_t nav_table_size = kFusedKernelSizeUnit * stub_func_list.size() * sizeof(int64_t);
 
   rtError_t rt_ret;
   void *hbm_nav_table_addr = nullptr;
@@ -101,21 +103,21 @@ Status SuperKernelFactory::FuseKernels(const std::vector<void *> &stub_func_list
     GELOGD("SKT: fuseKernels subFunc %p, device func address %p", stub_func_list[i], sub_device_func);
     // store two uint64_t address
     // address divided by 4 because of 32bits encoding, call offset will *4 when calculating
-    nav_table[i * 2] = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(sub_device_func)) / 4;
-    GELOGD("SKT: CALL offet %lu", nav_table[i * 2]);
-    nav_table[i * 2 + 1] = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(args_addr_list[i]));
-    GELOGD("SKT: fuseKernels args base address %lu", nav_table[i * 2 + 1]);
+    nav_table[i * kFusedKernelSizeUnit] = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(sub_device_func)) / 4;
+    GELOGD("SKT: CALL offet %lu", nav_table[i * kFusedKernelSizeUnit]);
+    nav_table[i * kFusedKernelSizeUnit + 1] = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(args_addr_list[i]));
+    GELOGD("SKT: fuseKernels args base address %lu", nav_table[i * kFusedKernelSizeUnit + 1]);
   }
-  rt_ret = rtMalloc((void **)&hbm_nav_table_addr, nav_table_size, RT_MEMORY_HBM);
+  rt_ret = rtMalloc(reinterpret_cast<void **>(&hbm_nav_table_addr), nav_table_size, RT_MEMORY_HBM);
   GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMalloc failed. error: 0x%X", rt_ret);
                   return RT_ERROR_TO_GE_STATUS(rt_ret);)
-  rt_ret =
-    rtMemcpy((void *)hbm_nav_table_addr, nav_table_size, (void *)nav_table.get(), nav_table_size, RT_MEMCPY_HOST_TO_DEVICE);
+  rt_ret = rtMemcpy(reinterpret_cast<void *>(hbm_nav_table_addr), nav_table_size,
+                    reinterpret_cast<void *>(nav_table.get()), nav_table_size, RT_MEMCPY_HOST_TO_DEVICE);
   GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(RT_FAILED, "rtMemcpy failed. error: 0x%X", rt_ret);
                   GE_CHK_RT(rtFree(hbm_nav_table_addr)); return RT_ERROR_TO_GE_STATUS(rt_ret);)
   // Create the necessary metadata for the super kernel
-  h = std::unique_ptr<skt::SuperKernel>(
-      new SuperKernel(this->func_stub_, hbm_nav_table_addr, nav_table_size, block_dim));
+  h =
+    std::unique_ptr<skt::SuperKernel>(new SuperKernel(this->func_stub_, hbm_nav_table_addr, nav_table_size, block_dim));
   return SUCCESS;
 }
 }  // namespace skt
diff --git a/ge/graph/load/new_model_manager/task_info/task_info.h b/ge/graph/load/new_model_manager/task_info/task_info.h
index d296d29e..26f22564 100644
--- a/ge/graph/load/new_model_manager/task_info/task_info.h
+++ b/ge/graph/load/new_model_manager/task_info/task_info.h
@@ -20,7 +20,7 @@
 #include <vector>
 
 #include "cce/customize.h"
-#include "cce/taskdown_common.hpp"
+#include "framework/common/taskdown_common.h"
 #include "framework/common/ge_inner_error_codes.h"
 #include "graph/load/new_model_manager/ts_mem_mall.h"
 #include "graph/load/new_model_manager/task_info/task_info_factory.h"
@@ -63,8 +63,8 @@ struct RuntimeParam {
 };
 
 typedef struct FusionOpInfo {
-  vector<string> original_op_names;
-  string op_name;
+  std::vector<std::string> original_op_names;
+  std::string op_name;
   uint32_t op_index;
   uint32_t stream_id;
 } FusionOpInfo;
@@ -87,7 +87,7 @@ class TaskInfo {
 
   virtual Status Release() { return SUCCESS; }
 
-  virtual cce::ccOpContext *GetCtx() { return nullptr; }
+  virtual ccOpContext *GetCtx() { return nullptr; }
 
   virtual uint32_t GetTaskID() { return 0xFFFFFFFF; }
 
diff --git a/ge/graph/load/new_model_manager/ts_mem_mall.h b/ge/graph/load/new_model_manager/ts_mem_mall.h
index 42ad3957..64a64930 100644
--- a/ge/graph/load/new_model_manager/ts_mem_mall.h
+++ b/ge/graph/load/new_model_manager/ts_mem_mall.h
@@ -25,7 +25,7 @@
 #include "framework/common/debug/ge_log.h"
 
 namespace {
-constexpr uint32_t kMaxTsMemBlock = 2 * 1024 * 1024;   // Max block 2M
+constexpr uint32_t kMaxTsMemBlock = 2097152;   // Max block 2M 2 * 1024 * 1024
 constexpr uint32_t kTsMemAligment = 64;   // Malloc for 64 bits align
 constexpr uint32_t kTsMemAlignMask = kTsMemAligment - 1;
 }
diff --git a/ge/graph/load/new_model_manager/zero_copy_offset.cc b/ge/graph/load/new_model_manager/zero_copy_offset.cc
index 970b292c..f27d862d 100644
--- a/ge/graph/load/new_model_manager/zero_copy_offset.cc
+++ b/ge/graph/load/new_model_manager/zero_copy_offset.cc
@@ -35,6 +35,7 @@ Status ZeroCopyOffset::InitInputDataInfo(int64_t output_size, void *virtual_addr
   GELOGI("[ZCPY] Start to InitInputDataInfo of %s, total_data_size is %ld, virtual_addr is %p",
          op_desc->GetName().c_str(), output_size, virtual_addr);
   basic_addr_ = virtual_addr;
+  op_name_ = op_desc->GetName();
   (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset_);
   (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset_);
   GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(), return PARAM_INVALID,
@@ -82,6 +83,7 @@ Status ZeroCopyOffset::InitOutputDataInfo(const vector<int64_t> &input_size_list
   GELOGD("Tensor data size: GetSize=%ld, GetTensorSizeInBytes=%ld", input_size_list[idx], size);
 
   basic_addr_ = virtual_addr_list[idx];
+  op_name_ = op_desc->GetName();
   (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset_);
   (void)ge::AttrUtils::GetListInt(op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset_);
   GE_CHK_BOOL_EXEC(zero_copy_basic_offset_.size() == zero_copy_relative_offset_.size(), return PARAM_INVALID,
@@ -181,22 +183,18 @@ void ZeroCopyOffset::SetOutputOutsideAddrs(const int64_t &input_offset, const bo
   addr_count_ = out_count;
 }
 
-bool ZeroCopyOffset::SetOutsideAddrsValue(ZeroCopyTask &zero_copy_task, void *outside_addr, void *args, size_t offset) {
+void ZeroCopyOffset::SetOutsideAddrsValue(ZeroCopyTask &zero_copy_task, void *outside_addr, void *args, size_t offset) {
   const auto addr_val = reinterpret_cast<uintptr_t>(outside_addr);
-  bool set_batch_label_flag = false;
   for (uint32_t out_count = 0; out_count < GetAddrCount(); ++out_count) {
-    auto &addrs_mapping_list = GetOutsideAddrs();
-    auto args_addrs = addrs_mapping_list[out_count].find(outside_addr);
-    if (args_addrs != addrs_mapping_list[out_count].end()) {
+    auto args_addrs = outside_addrs_[out_count].find(outside_addr);
+    if (args_addrs != outside_addrs_[out_count].end()) {
       GE_CHK_STATUS(zero_copy_task.SetTaskArgsOffset(addr_val, offset), "Input args invalid.");
       void *args_val = static_cast<uint8_t *>(args) + offset;
       args_addrs->second.push_back(args_val);
       GELOGD("[ZCPY] set copy input: virtual_addr: 0x%lx, task_addr: %p, args: %p, offset: %zu.", addr_val, args_val,
              args, offset);
-      set_batch_label_flag = true;
     }
   }
-  return set_batch_label_flag;
 }
 
 }  // namespace ge
diff --git a/ge/graph/load/new_model_manager/zero_copy_offset.h b/ge/graph/load/new_model_manager/zero_copy_offset.h
index 025d1b14..66fcd887 100644
--- a/ge/graph/load/new_model_manager/zero_copy_offset.h
+++ b/ge/graph/load/new_model_manager/zero_copy_offset.h
@@ -51,7 +51,7 @@ class ZeroCopyOffset {
                             const OpDescPtr &op_desc, const size_t &idx, bool &fusion_flag);
   void SetOutputOutsideAddrs(const int64_t &input_offset, const bool &fusion_flag, void *addr,
                              std::vector<void *> &tensor_addrs);
-  bool SetOutsideAddrsValue(ZeroCopyTask &zero_copy_task, void *outside_addr, void *args, size_t offset);
+  void SetOutsideAddrsValue(ZeroCopyTask &zero_copy_task, void *outside_addr, void *args, size_t offset);
 
   // basic_addr of l2-fusion
   void *GetBasicAddr() const { return basic_addr_; }
@@ -65,10 +65,13 @@ class ZeroCopyOffset {
   // data_size of Data/Netoutput
   int64_t GetDataSize() const { return data_size_; }
   // value of *outside_addrs_ from davinci_model
-  std::vector<std::map<const void *, std::vector<void *>>> &GetOutsideAddrs() { return outside_addrs_; }
+  const std::vector<std::map<const void *, std::vector<void *>>> &GetOutsideAddrs() { return outside_addrs_; }
+  // name of op
+  std::string GetOpName() const { return op_name_; }
 
  private:
   void *basic_addr_ = nullptr;
+  std::string op_name_;
   uint32_t data_count_ = 0;
   std::vector<std::pair<int64_t, void *>> data_info_;
   vector<int64_t> relative_offset_;
@@ -80,4 +83,4 @@ class ZeroCopyOffset {
   std::vector<int64_t> zero_copy_relative_offset_;
 };
 }  // namespace ge
-#endif  // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_ZERO_COPY_OFFSET_H_
\ No newline at end of file
+#endif  // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_ZERO_COPY_OFFSET_H_
diff --git a/ge/graph/load/new_model_manager/zero_copy_task.cc b/ge/graph/load/new_model_manager/zero_copy_task.cc
index 9b42d563..b938f14b 100755
--- a/ge/graph/load/new_model_manager/zero_copy_task.cc
+++ b/ge/graph/load/new_model_manager/zero_copy_task.cc
@@ -22,8 +22,6 @@
 #include "common/ge_compiler_options.h"
 
 namespace ge {
-const char *const kDefaultBatchLable = "Batch_default";
-
 ZeroCopyTask::ZeroCopyTask(const string &name, uint8_t *args, size_t size)
     : name_(name), args_addr_(args), args_size_(size), is_updated_(false) {}
 
@@ -66,72 +64,27 @@ void ZeroCopyTask::SetOriginalArgs(const void *info, size_t size) {
   const uint8_t *data = static_cast<const uint8_t *>(info);
   args_info_.assign(data, data + size);
 
-  GELOGI("[ZCPY] %s set info from virtual_addr: %p, args_addr: %p, args size: %zu, info size: %zu", name_.c_str(), info,
+  GELOGI("[ZCPY] %s set original args info: %p, args_addr: %p, args size: %zu, info size: %zu", name_.c_str(), info,
          args_addr_, args_size_, size);
 }
 
 /**
  * @ingroup ge
- * @brief Check is dynamic batch node.
- * @param [in] addr: virtual address value from Op.
- * @param [in] data: data buffer from user.
- * @param [in] batch_addrs: dynamic batch addr info.
- * @param [in] batch_label: batch label.
- * @return: true / false
- */
-bool ZeroCopyTask::CheckDynamicBatch(const map<string, set<uintptr_t>> &batch_addrs, const string &batch_label,
-                                     uintptr_t addr) {
-  // Used for dynamic batch / resolution scene
-  set<uintptr_t> dynamic_input_addrs;
-  auto dynamic_input_iter = batch_addrs.find(batch_label);
-  if (dynamic_input_iter != batch_addrs.end()) {
-    dynamic_input_addrs = dynamic_input_iter->second;
-  }
-
-  set<uintptr_t> fix_input_addrs;
-  auto fix_input_iter = batch_addrs.find(kDefaultBatchLable);
-  if (fix_input_iter != batch_addrs.end()) {
-    fix_input_addrs = fix_input_iter->second;
-  }
-
-  if (fix_input_addrs.empty()) {
-    if (!dynamic_input_addrs.empty() && dynamic_input_addrs.find(addr) == dynamic_input_addrs.end()) {
-      return false;
-    }
-  } else {
-    if (!dynamic_input_addrs.empty() && dynamic_input_addrs.find(addr) == dynamic_input_addrs.end() &&
-        fix_input_addrs.find(addr) == fix_input_addrs.end()) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
-/**
- * @ingroup ge
  * @brief Set user data addr to Task param.
  * @param [in] addr: virtual address value from Op.
  * @param [in] buffer_addr: real_data_buffer_addr from user.
- * @param [in] batch_addrs: dynamic batch addr info.
- * @param [in] batch_label: batch label.
  * @return: void
  */
-Status ZeroCopyTask::UpdateTaskParam(uintptr_t addr, void *buffer_addr, const map<string, set<uintptr_t>> &batch_addrs,
-                                     const string &batch_label) {
+Status ZeroCopyTask::UpdateTaskParam(uintptr_t addr, void *buffer_addr) {
   auto iter = task_addr_offset_.find(addr);
   if (iter != task_addr_offset_.end()) {
     auto &cur_pair = *iter;
     uint8_t *args_info = args_info_.data();
     for (auto offset : cur_pair.second) {
-      if (!CheckDynamicBatch(batch_addrs, batch_label, reinterpret_cast<uintptr_t>(args_addr_ + offset))) {
-        continue;
-      }
-
       auto dst_addr = static_cast<uint8_t *>(buffer_addr);
       GELOGI("[ZCPY] %s update task, args_addr: %p, size: %zu, offset: %zu, virtual_addr: 0x%lx, user_data_addr: %p",
              name_.c_str(), args_addr_, args_size_, offset, addr, buffer_addr);
-      *(uintptr_t *)(args_info + offset) = reinterpret_cast<uintptr_t>(dst_addr);
+      *reinterpret_cast<uintptr_t *>(args_info + offset)= reinterpret_cast<uintptr_t>(dst_addr);
       is_updated_ = true;
     }
   }
diff --git a/ge/graph/load/new_model_manager/zero_copy_task.h b/ge/graph/load/new_model_manager/zero_copy_task.h
index d0bb2b6d..efabc814 100644
--- a/ge/graph/load/new_model_manager/zero_copy_task.h
+++ b/ge/graph/load/new_model_manager/zero_copy_task.h
@@ -67,12 +67,9 @@ class ZeroCopyTask {
    * @brief Set user data addr to Task param.
    * @param [in] addr: virtual address value from Op.
    * @param [in] buffer_addr: data buffer_addr from user.
-   * @param [in] batch_addrs: dynamic batch addr info.
-   * @param [in] batch_label: batch label.
    * @return: 0 SUCCESS / others FAILED
    */
-  ge::Status UpdateTaskParam(uintptr_t addr, void *buffer_addr, const map<string, set<uintptr_t>> &batch_addrs,
-                             const string &batch_label);
+  ge::Status UpdateTaskParam(uintptr_t addr, void *buffer_addr);
 
   /**
    * @ingroup ge
@@ -91,9 +88,6 @@ class ZeroCopyTask {
     return batch_label_;
   }
 
- protected:
-  bool CheckDynamicBatch(const map<string, set<uintptr_t>> &batch_addrs, const string &batch_label, uintptr_t addr);
-
  private:
   const string name_;
 
diff --git a/ge/graph/manager/graph_caching_allocator.cc b/ge/graph/manager/graph_caching_allocator.cc
index 4ba39ca8..d6027a08 100644
--- a/ge/graph/manager/graph_caching_allocator.cc
+++ b/ge/graph/manager/graph_caching_allocator.cc
@@ -25,13 +25,13 @@
 
 namespace ge {
 const size_t bin_ranges[kNumBins] = {kRoundBlockSize * kKByteSize,
-                                     8 * kMByteSize,
-                                     32 * kMByteSize,
-                                     128 * kMByteSize,
+                                     kBinSizeUnit8 * kMByteSize,
+                                     kBinSizeUnit32 * kMByteSize,
+                                     kBinSizeUnit128 * kMByteSize,
                                      kGByteSize,
-                                     4 * kGByteSize,
-                                     16 * kGByteSize,
-                                     26 * kGByteSize};
+                                     kBinSizeUnit4 * kGByteSize,
+                                     kBinSizeUnit16 * kGByteSize,
+                                     kBinSizeUnit26 * kGByteSize};
 
 static bool BlockComparator(const Block *left, const Block *right) {
   if (left->size != right->size) {
diff --git a/ge/graph/manager/graph_caching_allocator.h b/ge/graph/manager/graph_caching_allocator.h
index dc4af753..e024d5cd 100644
--- a/ge/graph/manager/graph_caching_allocator.h
+++ b/ge/graph/manager/graph_caching_allocator.h
@@ -34,10 +34,17 @@
 
 namespace ge {
 constexpr size_t kRoundBlockSize = 512;         // all block sizes are rounded to at least 512 bytes
+constexpr size_t kBinSizeUnit4 = 4;
+constexpr size_t kBinSizeUnit8 = 8;
+constexpr size_t kBinSizeUnit16 = 16;
+constexpr size_t kBinSizeUnit26 = 26;
+constexpr size_t kBinSizeUnit32 = 32;
+constexpr size_t kBinSizeUnit128 = 128;
+
 constexpr double kSplitThreshold = 0.75;         // split when malloc size <= small block size * kSpliThreshold
 constexpr size_t kKByteSize = 1024;
-constexpr size_t kMByteSize = 1024 * 1024;
-constexpr size_t kGByteSize = 1024 * 1024 * 1024;
+constexpr size_t kMByteSize = 1048576;   // 1024 * 1024
+constexpr size_t kGByteSize = 1073741824;   // 1024 * 1024 * 1024
 
 static const uint32_t kNumBins = 8;
 
diff --git a/ge/graph/manager/graph_manager.cc b/ge/graph/manager/graph_manager.cc
index 87070e79..b0d412dc 100755
--- a/ge/graph/manager/graph_manager.cc
+++ b/ge/graph/manager/graph_manager.cc
@@ -23,31 +23,23 @@
 #include <sstream>
 #include <string>
 #include <thread>
-#include <utility>
 
-#include "common/ge/ge_util.h"
 #include "common/math/math_util.h"
 #include "common/thread_pool.h"
-#include "common/util.h"
-#include "external/graph/types.h"
-#include "framework/common/debug/ge_log.h"
-#include "framework/common/ge_inner_error_codes.h"
-#include "framework/common/ge_types.h"
 #include "analyzer/analyzer.h"
 #include "graph/common/ge_call_wrapper.h"
 #include "graph/common/local_context.h"
 #include "graph/common/transop_util.h"
-#include "graph/debug/ge_attr_define.h"
 #include "graph/ge_context.h"
 #include "graph/ge_global_options.h"
-#include "graph/ge_local_context.h"
-#include "graph/manager/graph_mem_allocator.h"
 #include "graph/manager/util/rt_context_util.h"
 #include "graph/partition/dynamic_shape_partition.h"
 #include "graph/passes/enter_pass.h"
 #include "graph/partition/stage_partition.h"
 #include "graph/passes/addn_pass.h"
 #include "graph/passes/bitcast_pass.h"
+#include "graph/passes/assign_remove_pass.h"
+#include "graph/passes/inplace_support_check_pass.h"
 #include "graph/passes/atomic_addr_clean_pass.h"
 #include "graph/passes/attach_stream_label_pass.h"
 #include "graph/passes/cast_remove_pass.h"
@@ -61,13 +53,13 @@
 #include "graph/passes/dimension_adjust_pass.h"
 #include "graph/passes/dimension_compute_pass.h"
 #include "graph/passes/flow_ctrl_pass.h"
-#include "graph/passes/hccl_group_pass.h"
-#include "graph/passes/hccl_memcpy_pass.h"
+#include "graph/passes/fuse_data_nodes_with_common_input_pass.h"
 #include "graph/passes/identity_pass.h"
 #include "graph/passes/input_output_connection_identify_pass.h"
 #include "graph/passes/iterator_op_pass.h"
 #include "graph/passes/link_gen_mask_nodes_pass.h"
 #include "graph/passes/mark_graph_unknown_status_pass.h"
+#include "graph/passes/dynamic_single_op_reset_shape_pass.h"
 #include "graph/passes/merge_pass.h"
 #include "graph/passes/merge_input_memcpy_pass.h"
 #include "graph/passes/merge_to_stream_merge_pass.h"
@@ -76,7 +68,7 @@
 #include "graph/passes/permute_pass.h"
 #include "graph/passes/prune_pass.h"
 #include "graph/passes/ref_identity_delete_op_pass.h"
-#include "graph/passes/replace_with_empty_const_pass.h"
+#include "graph/passes/remove_same_const_pass.h"
 #include "graph/passes/reshape_recovery_pass.h"
 #include "graph/passes/reshape_remove_pass.h"
 #include "graph/passes/same_transdata_breadth_fusion_pass.h"
@@ -86,13 +78,12 @@
 #include "graph/passes/switch_logic_remove_pass.h"
 #include "graph/passes/switch_to_stream_switch_pass.h"
 #include "graph/passes/transop_breadth_fusion_pass.h"
-#include "graph/passes/transop_depth_fusion_pass.h"
 #include "graph/passes/transop_nearby_allreduce_fusion_pass.h"
 #include "graph/passes/transop_symmetry_elimination_pass.h"
 #include "graph/passes/transop_without_reshape_fusion_pass.h"
 #include "graph/passes/transpose_transdata_pass.h"
+#include "graph/passes/useless_control_out_remove_pass.h"
 #include "graph/passes/variable_op_pass.h"
-#include "graph/passes/variable_prepare_op_pass.h"
 #include "graph/passes/variable_ref_delete_op_pass.h"
 #include "graph/passes/variable_ref_useless_control_out_delete_pass.h"
 #include "graph/passes/end_of_sequence_add_control_pass.h"
@@ -103,15 +94,13 @@
 #include "graph/passes/memcpy_addr_async_pass.h"
 #include "graph/build/label_allocator.h"
 #include "graph/utils/tensor_adapter.h"
-#include "graph/utils/type_utils.h"
-#include "graph/graph_util.h"
-#include "graph/types.h"
 #include "inc/pass_manager.h"
 #include "init/gelib.h"
 #include "ir_build/atc_ir_common.h"
 #include "graph/common/local_context.h"
 #include "graph/common/omg_util.h"
 #include "common/formats/utils/formats_trans_utils.h"
+#include "register/custom_pass_helper.h"
 
 namespace {
 const char *const kSummary = "Summary";
@@ -124,15 +113,9 @@ const char *const kCheckPointForGetVar = "CheckPointGraphForGetVar";
 const char *const kCheckPointGraph = "checkpoint_graph";
 const char *const kVectorEngine = "VectorEngine";
 const char *const kAIcoreEngine = "AIcoreEngine";
-const char *const kOffOptimize = "off_optimize";
 const int32_t kDynamicDimsTypeIsGetNext = 0;
 const int32_t kDynamicDimsTypeIsData = 1;
-const int64_t kInvalidDynaimcDimsType = -1;
-const char *const kSubstrOfGetNextNosinkName = "IteratorGetNext";
-const char *const kShapeDataName = "ascend_mbatch_shape_data";
 const char *const kGetNextName = "IteratorV2";
-const char *const kExtAttrDataNodes = "data_nodes";
-const char *const kExtAttrGetNextNoSink = "getnext_no_sink";
 
 bool IsTailingOptimization() {
   string is_tailing_optimization_option;
@@ -534,11 +517,18 @@ Status GraphManager::CopySubGraphAndMarkFusion(const ComputeGraphPtr &compute_gr
 }
 
 Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_graph,
-                                                      Graph2SubGraphInfoList &sub_graph_map,
-                                                      uint64_t session_id) {
+                                                      Graph2SubGraphInfoList &sub_graph_map, uint64_t session_id) {
   GE_CHECK_NOTNULL(compute_graph);
   // use default 16 multi thread
-  const uint32_t thread_num = 16;
+  uint32_t thread_num = 16;
+
+  char *env = std::getenv("THREAD_MULTI_NUM");
+  if (env != nullptr) {
+    thread_num = atoi(env);
+    GEEVENT("OptimizeSubGraphWithMultiThreads thread num: %u", thread_num);
+  }
+
+
   ThreadPool executor(thread_num);
   std::vector<std::future<Status>> vector_future;
   const auto &root_subgraph_list = sub_graph_map[compute_graph];
@@ -550,14 +540,15 @@ Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_gr
       (void) AttrUtils::SetStr(subgraph->GetSubGraph(), ATTR_NAME_OP_COMPILE_STRATEGY, op_compile_strategy);
     }
     std::future<Status> f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this,
-                                            compute_graph->GetGraphID(), subgraph, compute_graph, session_id, GetThreadLocalContext());
+                                            compute_graph->GetGraphID(), subgraph,
+                                            compute_graph->GetName(), session_id,
+                                            GetThreadLocalContext());
     if (!f.valid()) {
       GELOGE(FAILED, "Future is invalid");
       return FAILED;
     }
     vector_future.emplace_back(std::move(f));
   }
-
   for (auto &function_graph : compute_graph->GetAllSubgraphs()) {
     auto subgraph_list = sub_graph_map[function_graph];
     for (const auto &subgraph : subgraph_list) {
@@ -565,7 +556,8 @@ Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_gr
         (void) AttrUtils::SetStr(subgraph->GetSubGraph(), ATTR_NAME_OP_COMPILE_STRATEGY, op_compile_strategy);
       }
       std::future<Status> f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this,
-                                              compute_graph->GetGraphID(), subgraph, compute_graph, session_id,
+                                              compute_graph->GetGraphID(), subgraph,
+                                              compute_graph->GetName(), session_id,
                                               GetThreadLocalContext());
       if (!f.valid()) {
         GELOGE(FAILED, "Future is invalid");
@@ -650,63 +642,25 @@ Status GraphManager::ReplaceSubgraphWithOriGraph(const ComputeGraphPtr &compute_
 
 Status GraphManager::SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_graph, GraphPartitioner &partitioner) {
   GE_CHECK_NOTNULL(compute_graph);
-  auto sub_graph_map = partitioner.GetSubGraphMap();
-  std::string buffer_optimize;
-  graphStatus graph_status = ge::GetContext().GetOption(BUFFER_OPTIMIZE, buffer_optimize);
-  bool need_lx_fusion = (graph_status == GRAPH_SUCCESS) && (buffer_optimize != kOffOptimize);
-  if (options_.build_mode.empty() && need_lx_fusion) {
-    GELOGI("Enter normal mode with buffer_optimize:%s.", buffer_optimize.c_str());
-    /// 1. Copy subgraph for buffer optimize while lx fusion failed.
-    /// 2. Set graph with attr "lx_fusion" for fusion optimize.
-    std::unordered_map<std::string, ComputeGraphPtr> copy_graphs;
-    GE_TIMESTAMP_START(CopySubGraphAndMarkFusion);
-    Status ret = CopySubGraphAndMarkFusion(compute_graph, sub_graph_map, copy_graphs);
-    GE_TIMESTAMP_EVENT_END(CopySubGraphAndMarkFusion, "SetSubgraph:CopySubGraphAndMarkFusion");
-    if (ret != SUCCESS) {
-      GELOGE(ret, "CopySubGraphAndMarkFusion failed.");
-      return ret;
-    }
-
-    // Multiply optimize subgraph with lx fusion
-    ret = OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id);
-    if (ret != SUCCESS) {
-      GELOGE(ret, "Multiply optimize subgraph with lx fusion failed.");
-      return ret;
-    }
-
-    // Check whether all subgraph lx fusion success
-    GE_TIMESTAMP_START(CheckAllFusionOptimizeSuccess);
-    if (CheckAllFusionOptimizeSuccess(compute_graph, sub_graph_map)) {
-      GE_TIMESTAMP_EVENT_END(CheckAllFusionOptimizeSuccess, "SetSubgraph:CheckAllFusionOptimizeSuccess");
-      return SUCCESS;
-    }
-
-    // Replace subgraph with original graph for lx buffer
-    ret = ReplaceSubgraphWithOriGraph(compute_graph, sub_graph_map, copy_graphs);
-    if (ret != SUCCESS) {
-      GELOGE(ret, "Replace subgraph with original graph failed.");
-      return ret;
-    }
+  PassManager pass_for_dynamic_shape_reset_optimize;
+  GE_CHK_STATUS_RET(pass_for_dynamic_shape_reset_optimize.AddPass(
+    "SetSubgraph::AfterSetSubgraph::DynamicSingleOpResetShapePass", new (std::nothrow) DynamicSingleOpResetShapePass))
+  GE_TIMESTAMP_START(pass_for_dynamic_shape_reset_optimize);
+  Status ret = pass_for_dynamic_shape_reset_optimize.Run(compute_graph);
+  GE_TIMESTAMP_END(pass_for_dynamic_shape_reset_optimize, "SetSubgraph::AfterSetSubgraph");
+  if (ret != SUCCESS && ret != NOT_CHANGED) {
+    GELOGE(ret, "Run passes when optimize subgraph failed");
+    return ret;
+  }
 
-    // Multiply optimize subgraph with lx buffer
-    ret = OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id);
-    if (ret != SUCCESS) {
-      GELOGE(ret, "Multiply optimize subgraph with lx buffer failed.");
-      return ret;
-    }
-  } else {
-    /// Multiply optimize subgraph:
-    /// 1. run lx buffer while build_mode is normal and buffer_optimize is empty or "off_optimize";
-    /// 2. run lx fusion or buffer according build_mode and build_step in fe.
-    GELOGD("Directly optimize subgraph with build mode:%s, and step:%s, buffer_optimize:%s.",
-           options_.build_mode.c_str(),
-           options_.build_step.c_str(),
-           buffer_optimize.c_str());
-    Status ret = OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id);
-    if (ret != SUCCESS) {
-      GELOGE(ret, "Multiply optimize subgraph with lx buffer");
-      return ret;
-    }
+  auto sub_graph_map = partitioner.GetSubGraphMap();
+  GELOGD("Directly optimize subgraph with build mode:%s, and step:%s.",
+         options_.build_mode.c_str(),
+         options_.build_step.c_str());
+  ret = OptimizeSubGraphWithMultiThreads(compute_graph, sub_graph_map, session_id);
+  if (ret != SUCCESS) {
+    GELOGE(ret, "Multiply optimize subgraph failed");
+    return ret;
   }
   return SUCCESS;
 }
@@ -726,7 +680,7 @@ Status GraphManager::PreRunOptimizeOriginalGraph(const GraphNodePtr &graph_node,
   CompilerStages &stages = GetCompilerStages(graph_node->GetGraphId());
   GM_RUN_AND_DUMP_PERF("OptimizeGraphPrepare", stages.optimizer.OptimizeOriginalGraphForQuantize, compute_graph);
   GM_RUN_AND_DUMP_PERF("HandleSummaryOp", stages.optimizer.HandleSummaryOp, compute_graph);
-  GM_RUN_AND_DUMP_PERF("Prepare", stages.preparer.PrepareDynShape, graph_node->GetGraph(), inputs, compute_graph,
+  GM_RUN_AND_DUMP_PERF("Prepare", stages.preparer.PrepareDynShape, graph_node, inputs, compute_graph,
                        session_id);
   GM_RUN_AND_DUMP_PERF("OptimizeOriginalGraph", stages.optimizer.OptimizeOriginalGraph, compute_graph);
 
@@ -771,6 +725,9 @@ Status GraphManager::PreRunAfterOptimizeSubGraph(const GraphNodePtr &graph_node,
                                                  GeRootModelPtr &ge_root_model, uint64_t session_id) {
   GE_CHECK_NOTNULL(graph_node);
   GE_CHECK_NOTNULL(compute_graph);
+
+  CompilerStages &stages = GetCompilerStages(graph_node->GetGraphId());
+  GM_RUN_AND_DUMP_PERF("OptimizeWholeGraph", stages.optimizer.OptimizeWholeGraph, compute_graph);
   GM_RUN_AND_DUMP_PERF("Optimize2", OptimizeStage2, compute_graph);
   GM_RUN_AND_DUMP_PERF("OptimizeGraphBeforeBuildForRts",
                        GetCompilerStages(graph_node->GetGraphId()).optimizer.OptimizeGraphBeforeBuildForRts,
@@ -805,10 +762,24 @@ Status GraphManager::SetRtContext(rtContext_t rt_context, rtCtxMode_t mode, uint
   return SUCCESS;
 }
 
+Status GraphManager::RunCustomPass(const GraphNodePtr &graph_node) {
+  ConstGraphPtr const_graph = graph_node->GetGraph();
+  auto comp_graph = GraphUtils::GetComputeGraph(*const_graph);
+  GE_DUMP(comp_graph, "RunCustomPassBegin");
+
+  GE_TIMESTAMP_START(RunCustomPass);
+  GraphPtr graph = std::const_pointer_cast<Graph>(const_graph);
+  GE_CHK_STATUS_RET(CustomPassHelper::Instance().Run(graph), "Graph[%s] run custom pass fail.",
+                    comp_graph->GetName().c_str());
+  GE_TIMESTAMP_END(RunCustomPass, "GraphBuilder::RunCustomPass");
+  return SUCCESS;
+}
+
 Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vector<GeTensor> &inputs,
                             GeRootModelPtr &ge_root_model, uint64_t session_id) {
   GE_CHECK_NOTNULL(graph_node);
   GE_CHECK_NOTNULL(graph_node->GetGraph());
+  GE_CHK_STATUS_RET_NOLOG(RunCustomPass(graph_node));
   auto compute_graph = GraphUtils::GetComputeGraph(*graph_node->GetGraph());
   GE_CHECK_NOTNULL(compute_graph);
   compute_graph->SetSessionID(session_id);
@@ -1212,7 +1183,7 @@ Status GraphManager::BuildGraphForUnregisteredOp(const GraphId &graph_id, const
   auto compute_graph = GraphUtils::GetComputeGraph(*graph_node->GetGraph());
   GE_CHECK_NOTNULL(compute_graph);
 
-  GM_RUN_AND_DUMP_PERF("Prepare", GetCompilerStages(graph_id).preparer.PrepareDynShape, graph_node->GetGraph(), inputs,
+  GM_RUN_AND_DUMP_PERF("Prepare", GetCompilerStages(graph_id).preparer.PrepareDynShape, graph_node, inputs,
                        compute_graph, session_id);
 
   for (auto &node : compute_graph->GetAllNodes()) {
@@ -2134,6 +2105,24 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) {
       after_merge_passes.AddPass("OptimizeStage1_1::SwitchDataEdgesBypass", new (std::nothrow) SwitchDataEdgesBypass));
   GE_CHK_STATUS_RET(
       after_merge_passes.AddPass("OptimizeStage1_1::ConstantFuseSamePass", new (std::nothrow) ConstantFuseSamePass));
+  /*
+   * Do CSE before FuseDataNodesWithCommonInputPass to resolve the scene in bertlarge as following:
+   *            const
+   *    /        |        \
+   * cast1      cast2     cast3
+   *    \         |         /
+   *             case
+   * the node `const` is the fused const node after ConstantFuseSamePass
+   * the nodes `cast1`, `cast2` and 'cast3' will be fused by CSE.
+   * in order to eliminate hard code in FuseDataNodesWithCommonInputPass,
+   * we do CSE before FuseDataNodesWithCommonInputPass
+   * But it is a temp solution, this CSE will be deleted after change pass from graph pass to node pass
+   */
+  GE_CHK_STATUS_RET(after_merge_passes.AddPass("OptimizeStage1_1::CSEBeforeFuseDataNodesWithCommonInputPass",
+                                               new (std::nothrow) CommonSubexpressionEliminationPass));
+  // FuseDataNodesWithCommonInputPass: fuse same data with common input in same graph
+  GE_CHK_STATUS_RET(after_merge_passes.AddPass("OptimizeStage1_1::FuseDataNodesWithCommonInputPass",
+                                               new (std::nothrow) FuseDataNodesWithCommonInputPass));
   GE_CHK_STATUS_RET(after_merge_passes.AddPass("OptimizeStage1_1::CommonSubexpressionEliminationPass",
                                                new (std::nothrow) CommonSubexpressionEliminationPass));
   GE_CHK_STATUS_RET(after_merge_passes.AddPass("OptimizeStage1_1::PermutePass", new (std::nothrow) PermutePass))
@@ -2186,6 +2175,7 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) {
   TransposeTransDataPass transpose_transdata_pass;
   TransOpSymmetryEliminationPass symmetry_elimination_pass;
   DimensionComputePass dimension_compute_pass;
+  UselessControlOutRemovePass useless_control_out_remove_pass;
   names_to_passes.emplace_back("EnterPass", &enter_pass);
   names_to_passes.emplace_back("AddNPass", &addn_pass);
   names_to_passes.emplace_back("SwitchDeadBranchElimination", &switch_dead_branch_elimination);
@@ -2199,6 +2189,7 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) {
   names_to_passes.emplace_back("DimensionComputePass", &dimension_compute_pass);
   names_to_passes.emplace_back("ConstantFoldingPass", &constant_folding_pass);
   names_to_passes.emplace_back("DimensionAdjustPass", &dimension_adjust_pass);
+  names_to_passes.emplace_back("UselessControlOutRemovePass", &useless_control_out_remove_pass);
   GE_TIMESTAMP_START(names_to_passes);
   ret = GEPass(compute_graph).Run(names_to_passes);
   GE_TIMESTAMP_END(names_to_passes, "GraphManager::OptimizeStage1_2");
@@ -2239,6 +2230,8 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) {
   GE_CHK_STATUS_RET(graph_pass.AddPass("OptimizeStage1_3::VariableRefUselessControlOutDeletePass",
                                        new (std::nothrow) VariableRefUselessControlOutDeletePass))
   GE_CHK_STATUS_RET(graph_pass.AddPass("OptimizeStage1_3::ReshapeRecoveryPass", new (std::nothrow) ReshapeRecoveryPass))
+  GE_CHK_STATUS_RET(
+      graph_pass.AddPass("OptimizeStage1_3::RemoveSameConstPass", new (std::nothrow) RemoveSameConstPass))
   if (options_.train_graph_flag) {
     // Priority: The GlobalStepInsertPass should work before graph partitioner.
     // Reason: Make sure that the var "global_step" can be partitioned to known sub graph and allocated memory
@@ -2252,12 +2245,12 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) {
     GELOGE(ret, "Run passes when OptimizeStage1_3 failed, ret:%u.", ret);
     return ret;
   }
-  NamesToPass identity_remove_pass;
-  GE_TIMESTAMP_START(identity_remove_pass);
+  NamesToPass node_pass;
+  GE_TIMESTAMP_START(node_pass);
   IdentityPass identity_force_pass(false);  // after SwitchToStreamSwitchPass
-  identity_remove_pass.emplace_back("IdentityPass", &identity_force_pass);
-  ret = GEPass(compute_graph).Run(identity_remove_pass);
-  GE_TIMESTAMP_END(identity_remove_pass, "GraphPrepare::IdentityRemovePass");
+  node_pass.emplace_back("IdentityPass", &identity_force_pass);
+  ret = GEPass(compute_graph).Run(node_pass);
+  GE_TIMESTAMP_END(node_pass, "GraphPrepare::node_pass");
   if (ret != SUCCESS) {
     GELOGE(ret, "Run identity remove pass for preprocess failed, ret:%u.", ret);
     return ret;
@@ -2287,10 +2280,16 @@ Status GraphManager::OptimizeStage2(ge::ComputeGraphPtr &compute_graph) {
   ReshapeRemovePass reshape_remove_pass;
   CondRemovePass condition_remove_pass;
   BitcastPass bitcast_pass;
+  AssignRemovePass assign_remove_pass;
+  InplaceSupportCheckPass inplace_support_check_pass;
   names_to_passes.emplace_back("ConstantFoldingPass", &constant_folding_pass);
   names_to_passes.emplace_back("ReshapeRemovePass", &reshape_remove_pass);
   names_to_passes.emplace_back("CondRemovePass", &condition_remove_pass);
   names_to_passes.emplace_back("BitcastPass", &bitcast_pass);
+  if (GetContext().GetHostExecFlag()) {
+    names_to_passes.emplace_back("AssignRemovePass", &assign_remove_pass);
+    names_to_passes.emplace_back("InplaceSupportCheckPass", &inplace_support_check_pass);
+  }
   GE_TIMESTAMP_START(names_to_passes);
   ret = GEPass(compute_graph).Run(names_to_passes);
   GE_TIMESTAMP_END(names_to_passes, "OptimizeStage2::MergedGraphNameToPasses");
@@ -2469,6 +2468,13 @@ Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const Gra
       continue;
     }
     auto model_id = model->GetModelId();
+    // unload model not release
+    bool is_unknown_shape = false;
+    GE_CHK_STATUS_RET(model->CheckIsUnknownShape(is_unknown_shape));
+    if (is_unknown_shape) {
+      GELOGD("model_id[%u] graph_id[%u] is unknown model, not release memory", model_id, graph_id);
+      continue;
+    }
     // not loaded,no need unload
     if (!it.second->GetLoadFlag()) {
       GELOGI("CheckAndReleaseMemory graph[%u] has not been loaded.", graph_id);
@@ -2486,7 +2492,7 @@ Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const Gra
       GELOGE(RT_FAILED, "[GraphManager:] rtSetDevice failed, modelId=%u, graphId=%u.", model_id, graph_id);
       continue;
     }
-    result = GraphLoader::DestroyAicpuKernel(session_id, model_id);
+    result = GraphLoader::DestroyAicpuKernel(session_id, model_id, 0);
     if (result != SUCCESS) {
       GELOGW("[GraphManager:] destroy aicpu kernel failed when dynamic memory, modelId=%u, graphId=%u.", model_id,
              graph_id);
@@ -2509,13 +2515,13 @@ Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const Gra
 
 Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager, GraphId root_graph_id,
                                                      const SubGraphInfoPtr &sub_graph_info_ptr,
-                                                     const ComputeGraphPtr &compute_graph, uint64_t session_id,
+                                                     const std::string &root_graph_name,
+                                                     uint64_t session_id,
                                                      const GEThreadLocalContext &ge_context) {
   if (sub_graph_info_ptr != nullptr && graph_manager != nullptr) {
     GetContext().SetSessionId(session_id);
     GetThreadLocalContext() = ge_context;
     graph_manager->UpdateLocalOmgContext(root_graph_id);
-
     ComputeGraphPtr compute_graph_tmp = sub_graph_info_ptr->GetSubGraph();
     const std::string &engine_name = sub_graph_info_ptr->GetEngineName();
     GELOGD("ProcessSubGraphWithMultiThreads start, graph name is %s, engine_name is %s, thread id is %lu",
@@ -2523,9 +2529,17 @@ Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager
            pthread_self());
     GE_DUMP(compute_graph_tmp, "OptimizeSubGraphBefore");
     GE_CHECK_NOTNULL(compute_graph_tmp);
+    if (!AttrUtils::SetInt(*compute_graph_tmp, ATTR_NAME_ROOT_GRAPH_ID, root_graph_id)) {
+      GELOGE(FAILED, "Failed to set attr ATTR_NAME_ROOT_GRAPH_ID for subgraph, graph_id: %u.", root_graph_id);
+      return FAILED;
+    }
+    if (!AttrUtils::SetStr(*compute_graph_tmp, ATTR_NAME_ROOT_GRAPH_NAME, root_graph_name)) {
+      GELOGE(FAILED, "Failed to set attr ATTR_NAME_ROOT_GRAPH_NAME for subgraph, \
+             root_graph_name: %s.", root_graph_name.c_str());
+      return FAILED;
+    }
     compute_graph_tmp->SetSessionID(session_id);
     Status ret = graph_manager->GetCompilerStages(root_graph_id).optimizer.OptimizeSubGraph(compute_graph_tmp,
-                                                                                            compute_graph,
                                                                                             engine_name);
     if (ret != SUCCESS) {
       GELOGE(ret, "SubGraph optimize Failed %s", engine_name.c_str());
@@ -2688,9 +2702,7 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) {
     }
 
     // it will not execute graph preprocess, optimize, parition, build if the graph has built successful.
-
     GELOGI("Start for run graph async.");
-
     GeRootModelPtr ge_root_model = nullptr;
     if (graph_manager->IsGraphNeedBuild(graph_node)) {
       if (graph_node->GetBuildFlag()) {
@@ -2775,8 +2787,10 @@ Status GraphManager::ParseInputsDims(const std::vector<InputTensorInfo> &input_t
   if (!GetLocalOmgContext().dynamic_node_type.empty()) {
     vector<NodePtr> data_nodes;
     vector<NodePtr> getnext_nosink_nodes;
-    data_nodes = compute_graph_->TryGetExtAttr(kExtAttrDataNodes, data_nodes);
-    getnext_nosink_nodes = compute_graph_->TryGetExtAttr(kExtAttrGetNextNoSink, getnext_nosink_nodes);
+    data_nodes = GetLocalOmgContext().data_nodes;
+    getnext_nosink_nodes = GetLocalOmgContext().getnext_nosink_nodes;
+    GELOGD("Data nodes count is %zu, getnext nosink nodes count is %zu.", data_nodes.size(),
+           getnext_nosink_nodes.size());
     if (GetLocalOmgContext().dynamic_node_type == DATA) {
       if (getnext_nosink_nodes.empty()) {
         // just data or data+getnext_sink
diff --git a/ge/graph/manager/graph_manager.h b/ge/graph/manager/graph_manager.h
index feca02fc..32de7eac 100644
--- a/ge/graph/manager/graph_manager.h
+++ b/ge/graph/manager/graph_manager.h
@@ -219,12 +219,14 @@ class GraphManager {
 
   static Status ProcessSubGraphWithMultiThreads(GraphManager *graph_manager, GraphId root_graph_id,
                                                 const SubGraphInfoPtr &sub_graph_info_ptr,
-                                                const ComputeGraphPtr &compute_graph, uint64_t session_id,
+                                                const std::string &root_graph_name,
+                                                uint64_t session_id,
                                                 const GEThreadLocalContext &ge_context);
   Status ParseInputsDims(const std::vector<InputTensorInfo> &input_tensor);
   void ParseInputsDimsForData(const std::vector<InputTensorInfo> &input_tensor);
   Status ParseInputsDimsForGetNexNosinkAndData(const vector<NodePtr> &dynamic_nodes,
                                                const std::vector<InputTensorInfo> &input_tensor);
+  Status RunCustomPass(const GraphNodePtr &graph_node);
   Status PreRun(const GraphNodePtr &graph_node, const std::vector<GeTensor> &inputs, GeRootModelPtr &ge_root_model,
                 uint64_t session_id = INVALID_SESSION_ID);
 
diff --git a/ge/graph/manager/graph_mem_allocator.cc b/ge/graph/manager/graph_mem_allocator.cc
index b832986b..f3037299 100755
--- a/ge/graph/manager/graph_mem_allocator.cc
+++ b/ge/graph/manager/graph_mem_allocator.cc
@@ -16,13 +16,10 @@
 
 #include "graph/manager/graph_mem_allocator.h"
 
-#include <set>
 #include <string>
-
-#include "framework/common/debug/ge_log.h"
 #include "graph/manager/graph_caching_allocator.h"
 #include "graph/manager/rdma_pool_allocator.h"
-
+#include "graph/manager/host_mem_allocator.h"
 namespace ge {
 void MemoryAllocator::Initialize(uint32_t device_id) {
   GELOGI("MemoryAllocator::Initialize");
@@ -193,6 +190,10 @@ Status MemManager::Initialize(const std::vector<rtMemType_t> &memory_type) {
     GELOGE(ge::INTERNAL_ERROR, "Create RdmaAllocator failed.");
     return ge::INTERNAL_ERROR;
   }
+  if (InitAllocator(memory_type, host_allocator_map_) != SUCCESS) {
+    GELOGE(ge::INTERNAL_ERROR, "Create HostMemAllocator failed.");
+    return ge::INTERNAL_ERROR;
+  }
   return SUCCESS;
 }
 
@@ -214,6 +215,7 @@ void MemManager::Finalize() noexcept {
   // caching and rdma allocator use memory allocator, so finalize them first
   FinalizeAllocatorMap(caching_allocator_map_);
   FinalizeAllocatorMap(rdma_allocator_map_);
+  FinalizeAllocatorMap(host_allocator_map_);
   FinalizeAllocatorMap(memory_allocator_map_);
 }
 
@@ -242,4 +244,7 @@ CachingAllocator &MemManager::CachingInstance(rtMemType_t memory_type) {
 RdmaPoolAllocator &MemManager::RdmaPoolInstance(rtMemType_t memory_type) {
   return Instance().GetAllocator(memory_type, rdma_allocator_map_);
 }
+HostMemAllocator &MemManager::HostMemInstance(rtMemType_t memory_type) {
+  return Instance().GetAllocator(memory_type, host_allocator_map_);
+}
 }  // namespace ge
diff --git a/ge/graph/manager/graph_mem_allocator.h b/ge/graph/manager/graph_mem_allocator.h
index 2723ae5c..bd75dbb9 100644
--- a/ge/graph/manager/graph_mem_allocator.h
+++ b/ge/graph/manager/graph_mem_allocator.h
@@ -139,7 +139,7 @@ class MemoryAllocator {
 using MemoryAllocatorPtr = std::shared_ptr<MemoryAllocator>;
 class CachingAllocator;
 class RdmaPoolAllocator;
-
+class HostMemAllocator;
 class MemManager {
  public:
   MemManager();
@@ -148,6 +148,7 @@ class MemManager {
   static MemoryAllocator *Instance(rtMemType_t memory_type);
   CachingAllocator &CachingInstance(rtMemType_t memory_type);
   RdmaPoolAllocator &RdmaPoolInstance(rtMemType_t memory_type);
+  HostMemAllocator &HostMemInstance(rtMemType_t memory_type);
   MemManager(const MemManager &) = delete;
   MemManager &operator=(const MemManager &) = delete;
   ///
@@ -235,6 +236,7 @@ class MemManager {
   std::map<rtMemType_t, MemoryAllocator *> memory_allocator_map_;
   std::map<rtMemType_t, CachingAllocator *> caching_allocator_map_;
   std::map<rtMemType_t, RdmaPoolAllocator *> rdma_allocator_map_;
+  std::map<rtMemType_t, HostMemAllocator *> host_allocator_map_;
   std::recursive_mutex allocator_mutex_;
 };
 }  // namespace ge
diff --git a/ge/graph/manager/graph_var_manager.cc b/ge/graph/manager/graph_var_manager.cc
index be7d4eb2..821de257 100755
--- a/ge/graph/manager/graph_var_manager.cc
+++ b/ge/graph/manager/graph_var_manager.cc
@@ -183,51 +183,32 @@ ge::Status VarResource::GetBroadCastInfo(uint32_t graph_id, const string &var_na
 }
 
 ge::Status VarResource::SyncVarData2BroadCast(uint32_t graph_id, const std::string &var_name,
-                                              const ge::ConstOpDescPtr &var_op_desc, uint8_t *base_ptr) {
-  if (var_op_desc == nullptr) {
-    GELOGE(FAILED, "[SyncVarData2BroadCast] var opdesc is null!");
-    return FAILED;
-  }
+                                              const GeTensorDesc &var_tensor_desc, uint8_t *base_ptr) {
   GE_CHECK_NOTNULL(base_ptr);
   GELOGI("SyncVarData2BroadCast graph_id: %u, var_name: %s.", graph_id, var_name.c_str());
 
   VarBroadCastInfo var_broadcast_info = var_broad_cast_info_[graph_id][var_name];
   uint8_t *dst_addr = base_ptr + var_broadcast_info.input_offset;
-  ge::GeTensorDesc var_tensor_desc = var_op_desc->GetOutputDesc(0);
 
   return ge::TransVarDataUtils::SyncVarData2BroadCast(var_name, var_tensor_desc, dst_addr,
                                                       var_broadcast_info.input_size, session_id_);
 }
 
 ge::Status VarResource::SyncBroadCastData2Var(uint32_t graph_id, const std::string &var_name,
-                                              const ge::ConstOpDescPtr &var_op_desc, uint8_t *base_ptr) {
+                                              const GeTensorDesc &var_tensor_desc, uint8_t *base_ptr) {
   GELOGI("SyncBroadCastData2Var var_name: %s", var_name.c_str());
-  GE_CHECK_NOTNULL(var_op_desc);
-  string var_is_broadcast;
-  bool is_broadcast = AttrUtils::GetStr(var_op_desc, VAR_ATTR_VAR_IS_BROADCAST, var_is_broadcast);
-  if (!is_broadcast) {
-    return SUCCESS;
-  }
 
   VarBroadCastInfo var_broadcast_info = var_broad_cast_info_[graph_id][var_name];
   // subgraph base_ptr could be nullptr, task it as base 0
   uint8_t *dst_addr = base_ptr + var_broadcast_info.output_offset;
-  ge::GeTensorDesc var_tensor_desc = var_op_desc->GetOutputDesc(0);
 
   return ge::TransVarDataUtils::SyncBroadCastData2Var(dst_addr, var_broadcast_info.output_size, var_name,
                                                       var_tensor_desc, session_id_);
 }
 
 ge::Status VarResource::SyncVarData(uint32_t graph_id, const std::string &var_name,
-                                    const ge::ConstOpDescPtr &var_op_desc, uint8_t *base_ptr) {
-  GE_CHECK_NOTNULL(var_op_desc);
-  string var_is_broadcast;
-  bool is_broadcast = AttrUtils::GetStr(var_op_desc, VAR_ATTR_VAR_IS_BROADCAST, var_is_broadcast);
-  if (!is_broadcast) {
-    return SUCCESS;
-  }
-
-  return SyncVarData2BroadCast(graph_id, var_name, var_op_desc, base_ptr);
+                                    const GeTensorDesc &var_tensor_desc, uint8_t *base_ptr) {
+  return SyncVarData2BroadCast(graph_id, var_name, var_tensor_desc, base_ptr);
 }
 
 bool VarResource::IsVarAddr(const int64_t &offset) { return var_offset_set_.count(offset) > 0; }
@@ -280,9 +261,9 @@ Status MemResource::AssignVarMem(const std::string &var_name, uint64_t size, uin
     return PARAM_INVALID;
   }
   uint64_t free_size = total_size_ - var_mem_size_;
-  if (free_size < (size + kSessionMemAlignSize * 2)) {
+  if (free_size < (size + kSessionMemAlignSize * kSessionMemAlignUnit)) {
     GELOGE(PARAM_INVALID, "Out of memory : current var size[%lu] exceeds total var size[%lu]",
-           size + kSessionMemAlignSize * 2 + var_mem_size_, total_size_);
+           size + kSessionMemAlignSize * kSessionMemAlignUnit + var_mem_size_, total_size_);
     return PARAM_INVALID;
   }
 
@@ -570,14 +551,14 @@ bool VarManager::IsVarExist(const std::string &var_name) {
   return var_resource_->IsVarExist(var_name);
 }
 
-ge::Status VarManager::SyncVarData(uint32_t graph_id, const std::string &var_name, ge::ConstOpDescPtr var_op_desc,
+ge::Status VarManager::SyncVarData(uint32_t graph_id, const std::string &var_name, const GeTensorDesc &var_tensor_desc,
                                    uint8_t *base_ptr) {
   std::lock_guard<std::recursive_mutex> lock(mutex_);
   if (var_resource_ == nullptr) {
     GELOGW("VarManager has not been init.");
     return ge::INTERNAL_ERROR;
   }
-  return var_resource_->SyncVarData(graph_id, var_name, std::move(var_op_desc), base_ptr);
+  return var_resource_->SyncVarData(graph_id, var_name, var_tensor_desc, base_ptr);
 }
 
 ge::Status VarManager::GetCurVarDesc(const std::string &var_name, ge::GeTensorDesc &tensor_desc) {
@@ -630,13 +611,13 @@ ge::Status VarManager::RenewCurVarDesc(const std::string &var_name, ge::OpDescPt
 }
 
 ge::Status VarManager::SyncBroadCastData2Var(uint32_t graph_id, const std::string &var_name,
-                                             ge::ConstOpDescPtr var_op_desc, uint8_t *base_ptr) {
+                                             const GeTensorDesc &var_tensor_desc, uint8_t *base_ptr) {
   std::lock_guard<std::recursive_mutex> lock(mutex_);
   if (var_resource_ == nullptr) {
     GELOGW("VarManager has not been init.");
     return ge::INTERNAL_ERROR;
   }
-  return var_resource_->SyncBroadCastData2Var(graph_id, var_name, std::move(var_op_desc), base_ptr);
+  return var_resource_->SyncBroadCastData2Var(graph_id, var_name, var_tensor_desc, base_ptr);
 }
 
 bool VarManager::IsVarAddr(const int64_t &offset) {
diff --git a/ge/graph/manager/graph_var_manager.h b/ge/graph/manager/graph_var_manager.h
index b4f6aca3..9cf0068c 100755
--- a/ge/graph/manager/graph_var_manager.h
+++ b/ge/graph/manager/graph_var_manager.h
@@ -42,6 +42,7 @@ const size_t kGraphMemoryBuffer = 4UL * 1024UL * 1024UL * 1024UL;
 const size_t kMaxMemorySize = 256UL * 1024UL * 1024UL * 1024UL;
 const char kEnvGeuseStaticMemory[] = "GE_USE_STATIC_MEMORY";
 const uint64_t kSessionMemAlignSize = 512;
+const size_t kSessionMemAlignUnit = 2;
 
 enum MemStatus {
   NORMAL = 0,
@@ -118,12 +119,12 @@ class VarResource {
   ge::Status GetBroadCastInfo(uint32_t graph_id, const string &var_name, VarBroadCastInfo &broad_cast_info);
 
   ge::Status SyncVarData2BroadCast(uint32_t graph_id, const std::string &var_name,
-                                   const ge::ConstOpDescPtr &var_op_desc, uint8_t *base_ptr);
+                                   const GeTensorDesc &var_tensor_desc, uint8_t *base_ptr);
 
   ge::Status SyncBroadCastData2Var(uint32_t graph_id, const std::string &var_name,
-                                   const ge::ConstOpDescPtr &var_op_desc, uint8_t *base_ptr);
+                                   const GeTensorDesc &var_tensor_desc, uint8_t *base_ptr);
 
-  ge::Status SyncVarData(uint32_t graph_id, const std::string &var_name, const ge::ConstOpDescPtr &var_op_desc,
+  ge::Status SyncVarData(uint32_t graph_id, const std::string &var_name, const GeTensorDesc &var_tensor_desc,
                          uint8_t *base_ptr);
 
   Status SetTransRoad(const std::string &var_name, const VarTransRoad &trans_road) {
@@ -214,14 +215,14 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY VarManager {
 
   ge::Status GetVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t **dev_ptr);
 
-  ge::Status SyncVarData(uint32_t graph_id, const std::string &var_name, ge::ConstOpDescPtr var_op_desc,
+  ge::Status SyncVarData(uint32_t graph_id, const std::string &var_name, const GeTensorDesc &var_tensor_desc,
                          uint8_t *base_ptr);
 
   ge::Status SaveBroadCastInfo(uint32_t graph_id, const VarBroadCastInfo &broad_cast_info);
 
   ge::Status GetBroadCastInfo(uint32_t graph_id,  const string &var_name, VarBroadCastInfo &broad_cast_info);
 
-  ge::Status SyncBroadCastData2Var(uint32_t graph_id, const std::string &var_name, ge::ConstOpDescPtr var_op_desc,
+  ge::Status SyncBroadCastData2Var(uint32_t graph_id, const std::string &var_name, const GeTensorDesc &var_tensor_desc,
                                    uint8_t *base_ptr);
 
   ge::Status GetCurVarDesc(const std::string &var_name, ge::GeTensorDesc &tensor_desc);
diff --git a/ge/graph/manager/host_mem_allocator.cc b/ge/graph/manager/host_mem_allocator.cc
new file mode 100644
index 00000000..ca2b5124
--- /dev/null
+++ b/ge/graph/manager/host_mem_allocator.cc
@@ -0,0 +1,69 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "graph/manager/host_mem_allocator.h"
+#include "framework/common/debug/ge_log.h"
+#include "common/ge/ge_util.h"
+
+namespace ge {
+const void *HostMemAllocator::Malloc(const std::shared_ptr<AlignedPtr> &aligned_ptr, size_t size) {
+  if (aligned_ptr == nullptr) {
+    GELOGW("Insert a null aligned_ptr");
+    return nullptr;
+  }
+  GELOGD("allocate existed host memory succ, size=%zu", size);
+  allocated_blocks_[aligned_ptr->Get()] = { size, aligned_ptr };
+  return aligned_ptr->Get();
+}
+
+uint8_t *HostMemAllocator::Malloc(size_t size) {
+  GELOGD("start to malloc host memory, size=%zu", size);
+  std::lock_guard<std::mutex> lock(mutex_);
+  std::shared_ptr<AlignedPtr> aligned_ptr = MakeShared<AlignedPtr>(size);
+  if (aligned_ptr == nullptr) {
+    GELOGE(INTERNAL_ERROR, "make shared_ptr for AlignedPtr failed");
+    return nullptr;
+  }
+  allocated_blocks_[aligned_ptr->Get()] = { size, aligned_ptr };
+  GELOGD("allocate host memory succ, size=%zu", size);
+  return aligned_ptr->MutableGet();
+}
+
+Status HostMemAllocator::Free(const void *memory_addr) {
+  if (memory_addr == nullptr) {
+    GELOGE(GE_GRAPH_FREE_FAILED, "Invalid memory pointer");
+    return GE_GRAPH_FREE_FAILED;
+  }
+
+  std::lock_guard<std::mutex> lock(mutex_);
+  auto it = allocated_blocks_.find(memory_addr);
+  if (it == allocated_blocks_.end()) {
+    GELOGE(PARAM_INVALID, "Invalid memory pointer");
+    return PARAM_INVALID;
+  }
+  it->second.second.reset();
+  allocated_blocks_.erase(it);
+
+  return SUCCESS;
+}
+
+void HostMemAllocator::Clear() {
+  for (auto &block : allocated_blocks_) {
+    block.second.second.reset();
+  }
+  allocated_blocks_.clear();
+}
+}  // namespace ge
diff --git a/ge/graph/manager/host_mem_allocator.h b/ge/graph/manager/host_mem_allocator.h
new file mode 100644
index 00000000..d10b2475
--- /dev/null
+++ b/ge/graph/manager/host_mem_allocator.h
@@ -0,0 +1,57 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GE_GRAPH_MANAGER_HOST_MEM_ALLOCATOR_H_
+#define GE_GRAPH_MANAGER_HOST_MEM_ALLOCATOR_H_
+
+#include <mutex>
+#include <map>
+
+#include "framework/common/ge_inner_error_codes.h"
+#include "graph/aligned_ptr.h"
+#include "runtime/mem.h"
+
+namespace ge {
+class HostMemAllocator {
+ public:
+  explicit HostMemAllocator(rtMemType_t) {}
+  ~HostMemAllocator() = default;
+
+  HostMemAllocator(const HostMemAllocator &) = delete;
+  HostMemAllocator &operator=(const HostMemAllocator &) = delete;
+
+  Status Initialize() {
+    Clear();
+    return SUCCESS;
+  }
+  void Finalize() { Clear(); }
+
+  const void *Malloc(const std::shared_ptr<AlignedPtr>& aligned_ptr, size_t size);
+  uint8_t *Malloc(size_t size);
+  Status Free(const void *memory_addr);
+
+  std::pair<size_t, std::shared_ptr<AlignedPtr>> GetAlignedPtr(const void *addr) { return allocated_blocks_[addr]; }
+
+ private:
+  void Clear();
+
+  std::map<const void *, std::pair<size_t, std::shared_ptr<AlignedPtr>>> allocated_blocks_;
+  // lock around all operations
+  mutable std::mutex mutex_;
+};
+}  // namespace ge
+
+#endif  // GE_GRAPH_MANAGER_HOST_MEM_ALLOCATOR_H_
diff --git a/ge/graph/manager/host_mem_manager.cc b/ge/graph/manager/host_mem_manager.cc
index d4aceddd..60a7586d 100644
--- a/ge/graph/manager/host_mem_manager.cc
+++ b/ge/graph/manager/host_mem_manager.cc
@@ -43,7 +43,12 @@ Status SharedMemAllocator::Allocate(SharedMemInfo &mem_info) {
     return GE_GRAPH_MEMORY_ALLOC_FAILED;
   }
   mem_info.fd = output_para.fd;
-  mem_info.host_address = reinterpret_cast<uint8_t *>(output_para.ptr);
+  mem_info.host_aligned_ptr = AlignedPtr::BuildFromAllocFunc([&output_para](std::unique_ptr<uint8_t[], deleter> &ptr) {
+                                                               ptr.reset(reinterpret_cast<uint8_t *>(output_para.ptr));
+                                                             },
+                                                             [](uint8_t *ptr) {
+                                                               ptr = nullptr;
+                                                             });
   mem_info.device_address = reinterpret_cast<uint8_t *>(output_para.devPtr);
   return SUCCESS;
 }
@@ -51,8 +56,7 @@ Status SharedMemAllocator::Allocate(SharedMemInfo &mem_info) {
 Status SharedMemAllocator::DeAllocate(SharedMemInfo &mem_info) {
   GELOGD("SharedMemAllocator::DeAllocate");
   rtFreeHostSharedMemoryIn free_para = {mem_info.shm_name.c_str(), mem_info.mem_size, mem_info.fd,
-                                        mem_info.host_address, mem_info.device_address};
-
+                                        mem_info.host_aligned_ptr->MutableGet(), mem_info.device_address};
   rtError_t rt_ret = rtFreeHostSharedMemory(&free_para);
   if (rt_ret != RT_ERROR_NONE) {
     GELOGE(RT_FAILED, "Call rt api(rtFreeHostSharedMemory) failed, ret: 0x%X.", rt_ret);
@@ -106,7 +110,7 @@ Status HostMemManager::QueryVarMemInfo(const string &op_name, uint64_t &base_add
     GELOGE(INTERNAL_ERROR, "Find host base base_addr failed,node name:%s!", op_name.c_str());
     return INTERNAL_ERROR;
   }
-  base_addr = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(var_memory_base_map_[op_name].device_address));
+  base_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(var_memory_base_map_[op_name].device_address));
   data_size = var_memory_base_map_[op_name].mem_size;
   return SUCCESS;
 }
diff --git a/ge/graph/manager/host_mem_manager.h b/ge/graph/manager/host_mem_manager.h
index 66bd5826..be3237c3 100644
--- a/ge/graph/manager/host_mem_manager.h
+++ b/ge/graph/manager/host_mem_manager.h
@@ -42,7 +42,7 @@ struct SharedMemInfo {
   uint64_t mem_size = 0;
   int fd = 0;
   uint8_t *device_address = nullptr;
-  uint8_t *host_address = nullptr;
+  std::shared_ptr<AlignedPtr> host_aligned_ptr = nullptr;
   SharedMemInfo() = default;
   SharedMemInfo(string name, uint64_t size) : op_name(std::move(name)), mem_size(size) {}
 };
diff --git a/ge/graph/manager/memory_api.cc b/ge/graph/manager/memory_api.cc
index 45e4bb65..0798eb51 100644
--- a/ge/graph/manager/memory_api.cc
+++ b/ge/graph/manager/memory_api.cc
@@ -63,7 +63,7 @@ Status RdmaRemoteRegister(const std::vector<HostVarInfo> &var_info, rtMemType_t
   });
 
   auto hcom_remote_mem_register =
-      (HcclResult(*)(const MemRegisterAddr *, uint32_t))dlsym(handle, "hcom_remote_access_mem_register");
+      (HcclResult(*)(const MemRegisterAddr *, uint32_t))dlsym(handle, "HcomRegRemoteAccessMem");
   if (hcom_remote_mem_register == nullptr) {
     GELOGE(FAILED, "Failed to invoke hcom_remote_mem_register function.");
     return FAILED;
diff --git a/ge/graph/manager/util/debug.cc b/ge/graph/manager/util/debug.cc
index 45c070c6..2c930d1f 100644
--- a/ge/graph/manager/util/debug.cc
+++ b/ge/graph/manager/util/debug.cc
@@ -32,7 +32,8 @@ Debug::~Debug() = default;
 
 void Debug::DumpProto(const Message &proto, const char *file) {
   std::string file_path = RealPath(file);
-  int fd = mmOpen2(file_path.c_str(), M_WRONLY | M_CREAT | O_TRUNC, M_IRUSR | M_IWUSR | M_UMASK_GRPREAD | M_UMASK_OTHREAD);
+  int fd = mmOpen2(file_path.c_str(), M_WRONLY | M_CREAT | O_TRUNC, M_IRUSR | M_IWUSR | M_UMASK_GRPREAD |
+                   M_UMASK_OTHREAD);
   if (fd == -1) {
     GELOGW("Write %s failed", file_path.c_str());
     return;
diff --git a/ge/graph/manager/util/hcom_util.cc b/ge/graph/manager/util/hcom_util.cc
index 487b24af..50fa9936 100644
--- a/ge/graph/manager/util/hcom_util.cc
+++ b/ge/graph/manager/util/hcom_util.cc
@@ -263,7 +263,8 @@ Status HcomOmeUtil::GetHcclRootId(const ge::ConstOpDescPtr &op_desc, int64_t &ro
 Status HcomOmeUtil::GetAllRootId(const ge::ConstOpDescPtr &op_desc,
                                  std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos) {
   GE_CHECK_NOTNULL(op_desc);
-  if (op_desc->GetType() == HCOMBROADCAST || op_desc->GetType() == HVDCALLBACKBROADCAST || op_desc->GetType() == HCOMREDUCE) {
+  if (op_desc->GetType() == HCOMBROADCAST ||
+      op_desc->GetType() == HVDCALLBACKBROADCAST || op_desc->GetType() == HCOMREDUCE) {
     GELOGI("GetAllRootId Node[%s] opType[%s] get hccl rootId.", op_desc->GetName().c_str(), op_desc->GetType().c_str());
     int64_t root_id = 0;
     Status dmrt = GetHcclRootId(op_desc, root_id);
diff --git a/ge/graph/optimize/graph_optimize.cc b/ge/graph/optimize/graph_optimize.cc
index c5ebfda6..8cca5b5d 100644
--- a/ge/graph/optimize/graph_optimize.cc
+++ b/ge/graph/optimize/graph_optimize.cc
@@ -76,8 +76,7 @@ void AddNodeInputProperty(ComputeGraphPtr &compute_graph) {
   }
 }
 
-Status GraphOptimize::OptimizeSubGraph(ComputeGraphPtr &compute_graph, const ComputeGraphPtr &parent_graph,
-                                       const std::string &engine_name) {
+Status GraphOptimize::OptimizeSubGraph(ComputeGraphPtr &compute_graph, const std::string &engine_name) {
   if (compute_graph == nullptr) {
     GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "[OptimizeSubGraph]: compute_graph is nullptr.");
     return GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL;
@@ -106,10 +105,6 @@ Status GraphOptimize::OptimizeSubGraph(ComputeGraphPtr &compute_graph, const Com
       for (auto iter = graph_optimizer.begin(); iter != graph_optimizer.end(); ++iter) {
         Status ret = (*iter)->OptimizeFusedGraphAfterGraphSlice(*(compute_graph));
         if (ret != SUCCESS) {
-          auto root_graph = ge::GraphUtils::FindRootGraph(parent_graph);
-          if (root_graph != nullptr) {
-            ErrorManager::GetInstance().SaveMstuneCompileFailedMsg(root_graph->GetName());
-          }
           GELOGE(ret, "[OptimizeSubGraph][OptimizeFusedGraphAfterGraphSlice]: graph optimize failed, ret:%d", ret);
           return ret;
         }
@@ -132,6 +127,10 @@ Status GraphOptimize::OptimizeSubGraph(ComputeGraphPtr &compute_graph, const Com
 }
 
 Status GraphOptimize::OptimizeOriginalGraph(ComputeGraphPtr &compute_graph) {
+  if (GetContext().GetHostExecFlag()) {
+    // graph exec on host, no need OptimizeOriginalGraph
+    return SUCCESS;
+  }
   if (compute_graph == nullptr) {
     GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "[OptimizeOriginalGraph]: compute_graph is nullptr.");
     return GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL;
@@ -167,7 +166,7 @@ Status GraphOptimize::OptimizeOriginalGraph(ComputeGraphPtr &compute_graph) {
 Status GraphOptimize::OptimizeOriginalGraphJudgeInsert(ComputeGraphPtr &compute_graph) {
   GELOGD("OptimizeOriginalGraphJudgeInsert in");
   if (GetContext().GetHostExecFlag()) {
-    // graph exec on host, no need OptimizeOriginalGraph
+    // graph exec on host, no need OptimizeOriginalGraphJudgeInsert
     return SUCCESS;
   }
 
@@ -341,4 +340,37 @@ Status GraphOptimize::IdentifyReference(ComputeGraphPtr &compute_graph) {
   }
   return SUCCESS;
 }
+Status GraphOptimize::OptimizeWholeGraph(ComputeGraphPtr &compute_graph) {
+  if (compute_graph == nullptr) {
+    GELOGE(GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL, "[OptimizeWholeGraph]: compute_graph is nullptr.");
+    return GE_GRAPH_OPTIMIZE_COMPUTE_GRAPH_NULL;
+  }
+
+  std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
+  if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
+    GELOGE(GE_CLI_GE_NOT_INITIALIZED, "OptimizeWholeGraph failed.");
+    return GE_CLI_GE_NOT_INITIALIZED;
+  }
+
+  auto graph_optimizer = instance_ptr->OpsKernelManagerObj().GetAllGraphOptimizerObjsByPriority();
+  GELOGI("optimize by opskernel in OptimizeWholeGraph. num of graph_optimizer is %zu.", graph_optimizer.size());
+  Status ret = SUCCESS;
+  string exclude_core_type = (core_type_ == kVectorCore) ? kAicoreEngine : kVectorEngine;
+  GELOGD("[OptimizeWholeGraph]: engine type will exclude: %s", exclude_core_type.c_str());
+  if (!graph_optimizer.empty()) {
+    for (auto &iter : graph_optimizer) {
+      if (iter.first == exclude_core_type || iter.second == nullptr) {
+        continue;
+      }
+      GELOGI("Begin to optimize whole graph by engine %s", iter.first.c_str());
+      ret = iter.second->OptimizeWholeGraph(*compute_graph);
+      GE_DUMP(compute_graph, "OptimizeWholeGraph" + iter.first);
+      if (ret != SUCCESS) {
+        GELOGE(ret, "[OptimizeWholeGraph]: graph optimize failed, ret:%u", ret);
+        return ret;
+      }
+    }
+  }
+  return ret;
+}
 }  // namespace ge
diff --git a/ge/graph/optimize/graph_optimize.h b/ge/graph/optimize/graph_optimize.h
index 969b4720..3a1960f7 100755
--- a/ge/graph/optimize/graph_optimize.h
+++ b/ge/graph/optimize/graph_optimize.h
@@ -42,8 +42,7 @@ class GraphOptimize {
   ~GraphOptimize() = default;
 
   // subgraph optimize
-  Status OptimizeSubGraph(ComputeGraphPtr &compute_graph, const ComputeGraphPtr &parent_graph,
-                          const std::string &engine_name);
+  Status OptimizeSubGraph(ComputeGraphPtr &compute_graph, const std::string &engine_name);
 
   // original graph optimize
   Status OptimizeOriginalGraph(ComputeGraphPtr &compute_graph);
@@ -53,6 +52,9 @@ class GraphOptimize {
   // for fe prepare optimize in quantize scene
   Status OptimizeOriginalGraphForQuantize(ComputeGraphPtr &compute_graph);
 
+  // for engine to optimize merged whole graph before ge Optimize2
+  Status OptimizeWholeGraph(ComputeGraphPtr &compute_graph);
+
   // for rts optimize before build to add attr and insert memcpy op
   Status OptimizeGraphBeforeBuildForRts(ComputeGraphPtr &compute_graph);
 
diff --git a/ge/graph/optimize/mem_rw_conflict_optimize.cc b/ge/graph/optimize/mem_rw_conflict_optimize.cc
index 2fabc035..dfc6c9df 100644
--- a/ge/graph/optimize/mem_rw_conflict_optimize.cc
+++ b/ge/graph/optimize/mem_rw_conflict_optimize.cc
@@ -26,6 +26,13 @@
 namespace {
 using namespace ge;
 const int kIdentityAnchorIndex = 0;
+const size_t kSerialStringVecSize = 4;
+
+const int kCaseReadOnly = 0;
+const int kCaseScopeWriteable = 2;
+const int kCaseWriteable = 3;
+const int kCaseInvalidRWType = 5;
+
 // rw type of input.
 enum class InputRWType {
   kReadOnly,        // Normal op input only read
@@ -55,7 +62,7 @@ thread_local map<string, NodeInputOutputRWType> node_rwtype_map_;
 /// @return rw_type_name
 ///
 static std::string InputRWTypeToSerialString(InputRWType rw_type) {
-  const static char *names[4] = {"ReadOnly", "Writeable", "ScopeWriteable", "InvalidRWType"};
+  const static char *names[kSerialStringVecSize] = {"ReadOnly", "Writeable", "ScopeWriteable", "InvalidRWType"};
   return names[static_cast<int>(rw_type)];
 }
 
@@ -65,7 +72,7 @@ static std::string InputRWTypeToSerialString(InputRWType rw_type) {
 /// @return rw_type_name
 ///
 static std::string OutputRWTypeToSerialString(OutputRWType rw_type) {
-  const static char *names[4] = {"ReadOnly", "SoftRead", "Writeable", "InvalidRWType"};
+  const static char *names[kSerialStringVecSize] = {"ReadOnly", "SoftRead", "Writeable", "InvalidRWType"};
   return names[static_cast<int>(rw_type)];
 }
 
@@ -118,13 +125,13 @@ InputRWType GetInputRwTypeInConflict(const std::set<int> &rw_type_set) {
   }
 
   switch (total_rw_type) {
-    case 0:
+    case kCaseReadOnly:
       return InputRWType::kReadOnly;  // all input rw type is readonly
-    case 2:
+    case kCaseScopeWriteable:
       return InputRWType::kScopeWriteable;  // readonly 2 scope_writeable
-    case 3:
+    case kCaseWriteable:
       return InputRWType::kWriteable;  // all input rw type is writeable or readonly 2 writeable
-    case 5:
+    case kCaseInvalidRWType:
       return InputRWType::kInvalidRWType;  // writeable 2 scope_writeable
     default:
       return InputRWType::kInvalidRWType;
@@ -643,7 +650,7 @@ Status HandleAllreduceDuplicateInput(ComputeGraphPtr &compute_graph) {
          auto ret = GraphUtils::InsertNodeBetweenDataAnchors(pre_out_anchor, in_data_anchor, identity_node);
          GE_CHK_STATUS_RET(ret, "Fail to insert identity.");
          GELOGI("InsertNode %s between %s and %s successfully.", identity_node->GetName().c_str(),
-               pre_node->GetName().c_str(), node->GetName().c_str());
+                pre_node->GetName().c_str(), node->GetName().c_str());
        }
      }
    }
diff --git a/ge/graph/partition/dynamic_shape_partition.cc b/ge/graph/partition/dynamic_shape_partition.cc
index 95f13b6f..6c81b21f 100755
--- a/ge/graph/partition/dynamic_shape_partition.cc
+++ b/ge/graph/partition/dynamic_shape_partition.cc
@@ -44,18 +44,46 @@
 #define REQUIRE_SUCCESS(cond, ...) REQUIRE(((cond) == SUCCESS), __VA_ARGS__)
 #define REQUIRE_GRAPH_SUCCESS(cond, ...) REQUIRE(((cond) == GRAPH_SUCCESS), __VA_ARGS__)
 
-bool IsExperimental() {
-  const static bool kIsExperimental = (std::getenv("EXPERIMENTAL_DYNAMIC_PARTITION") != nullptr);
-  return kIsExperimental;
-}
-
 namespace ge {
 using Cluster = DynamicShapePartitioner::Cluster;
 using ClusterPtr = std::shared_ptr<Cluster>;
 
+static bool IsInExperimentalMode(const ComputeGraphPtr &root_graph) {
+  for (const auto &node : root_graph->GetAllNodes()) {
+    GE_CHECK_NOTNULL(node->GetOpDesc());
+    for (const auto &input_desc : node->GetOpDesc()->GetAllInputsDesc()) {
+      auto type = input_desc.GetDataType();
+      if (type == DT_STRING || type == DT_RESOURCE || type == DT_STRING_REF) {
+        if (std::getenv("EXPERIMENTAL_DYNAMIC_PARTITION") == nullptr) {
+          return false;
+        } else {
+          GEEVENT("In dynamic shape scene, model contains data type:"
+                 "DT_STRING/DT_RESOURCE/DT_STRING_REF may not be supported well "
+                 "temporarily, please retry with \"unset EXPERIMENTAL_DYNAMIC_PARTITION\".");
+          break;
+        }
+      }
+    }
+    for (const auto &output_desc : node->GetOpDesc()->GetAllOutputsDesc()) {
+      auto type = output_desc.GetDataType();
+      if (type == DT_STRING || type == DT_RESOURCE || type == DT_STRING_REF) {
+        if (std::getenv("EXPERIMENTAL_DYNAMIC_PARTITION") == nullptr) {
+          return false;
+        } else {
+          GEEVENT("In dynamic shape scene, model contains data type:"
+                 "DT_STRING/DT_RESOURCE/DT_STRING_REF may not be supported well "
+                 "temporarily, please retry with \"unset EXPERIMENTAL_DYNAMIC_PARTITION\".");
+          break;
+        }
+      }
+    }
+  }
+  return true;
+}
+
 Status DynamicShapePartitioner::Partition() {
   REQUIRE_NOT_NULL(root_graph_, "Graph is nullptr.");
-  if (!IsExperimental()) {
+  if (!IsInExperimentalMode(root_graph_)) {
     GELOGD("Skip dynamic shape partition as not in experimental mode.");
     REQUIRE(AttrUtils::SetBool(*root_graph_, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, false),
             "Failed set dynamic shape partitioned flag on root graph.");
diff --git a/ge/graph/partition/graph_partition.cc b/ge/graph/partition/graph_partition.cc
index 6a1fbb34..fbc13920 100755
--- a/ge/graph/partition/graph_partition.cc
+++ b/ge/graph/partition/graph_partition.cc
@@ -614,32 +614,32 @@ Status ge::GraphPartitioner::AddPartitionsToGraphNode(vector<ge::SubGraphInfoPtr
     }
     // flush parent node of subgraph
     sub_graph->SetParentNode(compute_graph->GetParentNode());
-    (void) AttrUtils::SetStr(*sub_graph, ATTR_NAME_PARENT_GRAPH_NAME, compute_graph->GetName());
-      auto sgi = MakeShared<SubGraphInfo>();
-      if (sgi == nullptr) {
-        GELOGE(GE_GRAPH_PARAM_NULLPTR, "[GraphPartitioner]: MakeShared sub graph info failed.");
-        return FAILED;
-      }
-      // set engine name
-      sgi->SetEngineName(engine_name);
-      // set stream label
-      string sub_graph_stream;
-      if (AttrUtils::GetStr(sub_graph->GetDirectNode().at(0)->GetOpDesc(), ATTR_NAME_STREAM_LABEL, sub_graph_stream)) {
-        sgi->SetStreamLabel(sub_graph_stream);
-      }
-      /// for now inputFlag is the same before and after partition. It should
-      /// be changed according to the real partition
-      std::vector<bool> sub_graph_input(graph_info_.input_size_, true);
-      std::vector<bool> sub_graph_output(graph_info_.output_size_, true);
-      sgi->SetSubGraph(sub_graph);
-      sgi->SetOutputFlag(sub_graph_output);
-      sgi->SetInputFlag(sub_graph_input);
-      sgi->SetOutputContext(graph_info_.output_name_);
-      AddEndPldInformationToSubGraphInfo(sgi);
-      GELOGI("[GraphPartitioner]: subGraph engine name is %s, graph name is %s, stream label is %s",
-             engine_name.c_str(),
-             sub_graph->GetName().c_str(),
-             sgi->GetStreamLabel().empty() ? "null" : sgi->GetStreamLabel().c_str());
+    (void)AttrUtils::SetStr(*sub_graph, ATTR_NAME_PARENT_GRAPH_NAME, compute_graph->GetName());
+    GELOGD("set attr success. subgraph(%s) with parent graph(%s)", sub_graph->GetName().c_str(),
+           compute_graph->GetName().c_str());
+    auto sgi = MakeShared<SubGraphInfo>();
+    if (sgi == nullptr) {
+      GELOGE(GE_GRAPH_PARAM_NULLPTR, "[GraphPartitioner]: MakeShared sub graph info failed.");
+      return FAILED;
+    }
+    // set engine name
+    sgi->SetEngineName(engine_name);
+    // set stream label
+    string sub_graph_stream;
+    if (AttrUtils::GetStr(sub_graph->GetDirectNode().at(0)->GetOpDesc(), ATTR_NAME_STREAM_LABEL, sub_graph_stream)) {
+      sgi->SetStreamLabel(sub_graph_stream);
+    }
+    /// for now inputFlag is the same before and after partition. It should
+    /// be changed according to the real partition
+    std::vector<bool> sub_graph_input(graph_info_.input_size_, true);
+    std::vector<bool> sub_graph_output(graph_info_.output_size_, true);
+    sgi->SetSubGraph(sub_graph);
+    sgi->SetOutputFlag(sub_graph_output);
+    sgi->SetInputFlag(sub_graph_input);
+    sgi->SetOutputContext(graph_info_.output_name_);
+    AddEndPldInformationToSubGraphInfo(sgi);
+    GELOGI("[GraphPartitioner]: subGraph engine name is %s, graph name is %s, stream label is %s", engine_name.c_str(),
+           sub_graph->GetName().c_str(), sgi->GetStreamLabel().empty() ? "null" : sgi->GetStreamLabel().c_str());
     if (engine_name != input_subgraph_name) {  // do not add Data subGraph into SubGraphInfo
       output_subgraphs.push_back(sgi);
     } else {
diff --git a/ge/graph/passes/assign_pass.cc b/ge/graph/passes/assign_remove_pass.cc
similarity index 50%
rename from ge/graph/passes/assign_pass.cc
rename to ge/graph/passes/assign_remove_pass.cc
index bb7a0f04..e198c2db 100644
--- a/ge/graph/passes/assign_pass.cc
+++ b/ge/graph/passes/assign_remove_pass.cc
@@ -14,41 +14,60 @@
  * limitations under the License.
  */
 
-#include "graph/passes/assign_pass.h"
-
-#include "framework/common/debug/ge_log.h"
+#include "graph/passes/assign_remove_pass.h"
 #include "framework/common/debug/log.h"
 #include "graph/utils/graph_utils.h"
 #include "graph/debug/ge_attr_define.h"
 
+namespace ge {
 namespace {
-const uint32_t kValidInputNodeOutputNum = 1;
-const int32_t kAssignRefInputIndex = 0;
-const int32_t kAssignValueInputIndex = 1;
+constexpr uint32_t kValidInputNodeOutputNum = 1;
+constexpr int32_t kAssignRefInputIndex = 0;
+constexpr int32_t kAssignValueInputIndex = 1;
+static const std::set<std::string> kNoTaskNodeTypes = { ge::DATA, ge::ANN_DATA, ge::AIPPDATA,
+                                                        ge::CONSTANT, ge::CONSTANTOP,
+                                                        ge::VARIABLE, ge::VARIABLEV2 };
 }
 
-namespace ge {
-Status AssignPass::Run(NodePtr &node) {
-  GELOGD("AssignPass running");
-  if (node->GetType() != ASSIGN) {
-    GELOGD("No need run AssignPass on [%s, %s].", node->GetName().c_str(), node->GetType().c_str());
-    return SUCCESS;
+Status AssignRemovePass::Run(NodePtr &node) {
+  GELOGD("AssignRemovePass running");
+
+  if (TransformAttr(node) != SUCCESS) {
+    GELOGE(FAILED, "Transform assign_var_name attr failed, node=%s", node->GetName().c_str());
+    return FAILED;
+  }
+
+  if (node->GetType() == ASSIGN) {
+    if (OptimizedAssignNode(node) != SUCCESS) {
+      GELOGE(FAILED, "Optimize for assign_node %s failed", node->GetName().c_str());
+      return FAILED;
+    }
   }
 
-  const auto &ref_in_anchor = node->GetInDataAnchor(kAssignRefInputIndex);
-  const auto &value_in_anchor = node->GetInDataAnchor(kAssignValueInputIndex);
+  GELOGD("AssignRemovePass success");
+  return SUCCESS;
+}
+
+///
+/// @brief Optimize for assign_node
+/// @param [in] assign_node
+/// @return Status
+///
+Status AssignRemovePass::OptimizedAssignNode(NodePtr &assign_node) {
+  const auto &ref_in_anchor = assign_node->GetInDataAnchor(kAssignRefInputIndex);
+  const auto &value_in_anchor = assign_node->GetInDataAnchor(kAssignValueInputIndex);
   if ((ref_in_anchor == nullptr) || (value_in_anchor == nullptr)) {
-    GELOGE(FAILED, "In data anchor is null, node:%s", node->GetName().c_str());
+    GELOGE(FAILED, "In data anchor is null, node:%s", assign_node->GetName().c_str());
     return FAILED;
   }
   const auto &ref_peer_anchor = ref_in_anchor->GetPeerOutAnchor();
   const auto &value_peer_anchor = value_in_anchor->GetPeerOutAnchor();
   if ((ref_peer_anchor == nullptr) || (value_peer_anchor == nullptr)) {
-    GELOGE(FAILED, "Peer data anchor is null, node:%s", node->GetName().c_str());
+    GELOGE(FAILED, "Peer data anchor is null, node:%s", assign_node->GetName().c_str());
     return FAILED;
   }
 
-  if (IsCondMatch(node, ref_peer_anchor, value_peer_anchor)) {
+  if (IsCondMatch(assign_node, ref_peer_anchor, value_peer_anchor)) {
     ///
     ///    variable  not-const               not-const
     ///         \     /                          |
@@ -58,12 +77,11 @@ Status AssignPass::Run(NodePtr &node) {
     ///           |                              |
     ///         node                           node
     ///
-    GELOGI("Optimization for assign_node %s start", node->GetName().c_str());
-    if (IsolateAndDeleteNode(node, {kAssignRefInputIndex}) != SUCCESS) {
-      GELOGE(FAILED, "Isolate and delete assign_node %s failed.", node->GetName().c_str());
+    GELOGD("Optimization for assign_node %s start", assign_node->GetName().c_str());
+    if (IsolateAndDeleteNode(assign_node, {kAssignRefInputIndex}) != SUCCESS) {
+      GELOGE(FAILED, "Isolate and delete assign_node %s failed.", assign_node->GetName().c_str());
       return FAILED;
     }
-    AddNodeDeleted(node);
 
     const auto &ref_input = ref_peer_anchor->GetOwnerNode()->GetOpDesc();
     const auto &value_input = value_peer_anchor->GetOwnerNode()->GetOpDesc();
@@ -71,11 +89,6 @@ Status AssignPass::Run(NodePtr &node) {
       GELOGE(FAILED, "value input is null");
       return FAILED;
     }
-    if (!AttrUtils::SetStr(value_input->MutableOutputDesc(value_peer_anchor->GetIdx()), ASSIGN_VAR_NAME,
-                           ref_input->GetName())) {
-      GELOGE(FAILED, "Set attr ASSIGN_VAR_NAME failed.");
-      return FAILED;
-    }
 
     // variable has and only has one input
     if (ref_input->UpdateInputDesc(0, value_input->GetOutputDesc(value_peer_anchor->GetIdx())) != GRAPH_SUCCESS) {
@@ -86,9 +99,49 @@ Status AssignPass::Run(NodePtr &node) {
       GELOGE(FAILED, "Add data edge %s->%s failed", value_input->GetName().c_str(), ref_input->GetName().c_str());
       return FAILED;
     }
+
+    GELOGD("add attr ASSIGN_VAR_NAME on node %s, var_name=%s",
+           value_input->GetName().c_str(), ref_input->GetName().c_str());
+    if (!AttrUtils::SetStr(value_input->MutableOutputDesc(value_peer_anchor->GetIdx()), ASSIGN_VAR_NAME,
+                           ref_input->GetName())) {
+      GELOGE(FAILED, "Set attr ASSIGN_VAR_NAME failed.");
+      return FAILED;
+    }
+    auto value_node = value_peer_anchor->GetOwnerNode();
+    AddRePassNode(value_node);
   }
+  return SUCCESS;
+}
 
-  GELOGD("AssignPass success");
+///
+/// @brief Transform assign_var_name attr
+/// @param [in] node
+/// @return Status
+///
+Status AssignRemovePass::TransformAttr(NodePtr &node) {
+  GE_CHECK_NOTNULL(node->GetOpDesc());
+  for (const auto &output_desc : node->GetOpDesc()->GetAllOutputsDesc()) {
+    int32_t inplace_input_idx = -1;
+    std::string assign_var_name;
+    if (AttrUtils::GetInt(output_desc, INPLACE_SUPPORT_INPUT_INDEX, inplace_input_idx) &&
+        AttrUtils::GetStr(output_desc, ASSIGN_VAR_NAME, assign_var_name)) {
+      GELOGD("Transform attr ASSIGN_VAR_NAME on node %s, assign_var_name=%s, inplace_input_idx=%d, ",
+             node->GetName().c_str(), assign_var_name.c_str(), inplace_input_idx);
+      const auto &in_data_anchor = node->GetInDataAnchor(inplace_input_idx);
+      GE_CHECK_NOTNULL(in_data_anchor);
+      const auto &peer_data_anchor = in_data_anchor->GetPeerOutAnchor();
+      GE_CHECK_NOTNULL(peer_data_anchor);
+      auto in_node = peer_data_anchor->GetOwnerNode();
+      GE_CHECK_NOTNULL(in_node->GetOpDesc());
+      GELOGD("add attr ASSIGN_VAR_NAME on node %s, var_name=%s", in_node->GetName().c_str(), assign_var_name.c_str());
+      if (!AttrUtils::SetStr(in_node->GetOpDesc()->MutableOutputDesc(peer_data_anchor->GetIdx()),
+                             ASSIGN_VAR_NAME, assign_var_name)) {
+        GELOGE(FAILED, "Set attr ASSIGN_VAR_NAME failed.");
+        return FAILED;
+      }
+      AddRePassNode(in_node);
+    }
+  }
   return SUCCESS;
 }
 
@@ -99,15 +152,14 @@ Status AssignPass::Run(NodePtr &node) {
 /// @param [in] peer_data_anchor for value_input of assign_node
 /// @return Status
 ///
-bool AssignPass::IsCondMatch(const NodePtr &node, const OutDataAnchorPtr &ref_peer_anchor,
-                             const OutDataAnchorPtr &value_peer_anchor) {
+bool AssignRemovePass::IsCondMatch(const NodePtr &node, const OutDataAnchorPtr &ref_peer_anchor,
+                                   const OutDataAnchorPtr &value_peer_anchor) {
   GELOGD("Check if assign_node %s match optimization condition, ref_input: %s, value_input: %s",
          node->GetName().c_str(), ref_peer_anchor->GetOwnerNode()->GetName().c_str(),
          value_peer_anchor->GetOwnerNode()->GetName().c_str());
 
-  const std::string &value_type = value_peer_anchor->GetOwnerNode()->GetType();
-  if ((value_type == CONSTANTOP) || (value_type == CONSTANT)) {
-    GELOGD("value input is const");
+  if (kNoTaskNodeTypes.count(value_peer_anchor->GetOwnerNode()->GetType()) > 0) {
+    GELOGD("value input is not calculate node");
     return false;
   }
 
diff --git a/ge/graph/passes/assign_pass.h b/ge/graph/passes/assign_remove_pass.h
similarity index 70%
rename from ge/graph/passes/assign_pass.h
rename to ge/graph/passes/assign_remove_pass.h
index 11cf1073..6588df7b 100644
--- a/ge/graph/passes/assign_pass.h
+++ b/ge/graph/passes/assign_remove_pass.h
@@ -14,18 +14,32 @@
  * limitations under the License.
  */
 
-#ifndef GE_GRAPH_PASSES_ASSIGN_PASS_H_
-#define GE_GRAPH_PASSES_ASSIGN_PASS_H_
+#ifndef GE_GRAPH_PASSES_ASSIGN_REMOVE_PASS_H_
+#define GE_GRAPH_PASSES_ASSIGN_REMOVE_PASS_H_
 
 #include "graph/passes/base_pass.h"
 
 namespace ge {
-class AssignPass : public BaseNodePass {
+class AssignRemovePass : public BaseNodePass {
  public:
   Status Run(NodePtr &node) override;
 
  private:
   ///
+  /// @brief Optimize for assign_node
+  /// @param [in] assign_node
+  /// @return Status
+  ///
+  Status OptimizedAssignNode(NodePtr &assign_node);
+
+  ///
+  /// @brief Transform assign_var_name attr
+  /// @param [in] node
+  /// @return Status
+  ///
+  Status TransformAttr(NodePtr &node);
+
+  ///
   /// @brief Check if need optimize for assign_node
   /// @param [in] assign_node
   /// @param [in] peer_data_anchor for ref_input of assign_node
@@ -36,4 +50,4 @@ class AssignPass : public BaseNodePass {
                           const OutDataAnchorPtr &value_peer_anchor);
 };
 }  // namespace ge
-#endif  // GE_GRAPH_PASSES_ASSIGN_PASS_H_
+#endif  // GE_GRAPH_PASSES_ASSIGN_REMOVE_PASS_H_
diff --git a/ge/graph/passes/atomic_addr_clean_pass.cc b/ge/graph/passes/atomic_addr_clean_pass.cc
index 60742eb1..7c6ed8ce 100755
--- a/ge/graph/passes/atomic_addr_clean_pass.cc
+++ b/ge/graph/passes/atomic_addr_clean_pass.cc
@@ -74,10 +74,88 @@ Status AtomicAddrCleanPass::Run(ComputeGraphPtr graph) {
   return SUCCESS;
 }
 
+// just hccl may mark atomic from ops kernel now, and hccl's atomic if for all input
+bool AtomicAddrCleanPass::CheckAtomicFromOpsKernel(const NodePtr &node) {
+  // 1.Check if isAtomic attrs exist for HCOM
+  std::shared_ptr<GELib> instance_ptr = GELib::GetInstance();
+  if ((instance_ptr == nullptr) || (!instance_ptr->InitFlag())) {
+    GELOGW("GELib not initialized, atomic from ops kernel judge false, node_name: %s", node->GetName().c_str());
+    return false;
+  }
+
+  OpsKernelManager &ops_kernel_manager = instance_ptr->OpsKernelManagerObj();
+  vector<OpInfo> op_info_vec = ops_kernel_manager.GetOpsKernelInfo(node->GetType());
+  for (const auto &op_info : op_info_vec) {
+    if (op_info.isAtomic) {
+      // check peer input is DATA
+      for (const auto &in_data_anchor : node->GetAllInDataAnchors()) {
+        if (in_data_anchor->GetPeerOutAnchor() != nullptr &&
+            in_data_anchor->GetPeerOutAnchor()->GetOwnerNode() != nullptr) {
+          auto peer_in_node = in_data_anchor->GetPeerOutAnchor()->GetOwnerNode();
+          if (peer_in_node->GetType() == DATA) {
+            GELOGI("Recognized atomic op %s from %s engine and input is DATA.", node->GetName().c_str(), 
+                   op_info.engine.c_str());
+            return false;
+          }
+        }
+      }
+      GELOGI("Recognized atomic op %s from %s engine.", node->GetName().c_str(), op_info.engine.c_str());
+      hcom_node_vec_.push_back(node);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool AtomicAddrCleanPass::IsOutputIndexPeerInputAtomic(const NodePtr &node, int64_t output_index) {
+  auto out_data_anchor = node->GetAllOutDataAnchors().at(output_index);
+  if (out_data_anchor == nullptr) {
+    return false;
+  }
+
+  for (auto input_anchor : out_data_anchor->GetPeerInDataAnchors()) {
+    auto output_node = input_anchor->GetOwnerNode();
+    // just hccl may mark atomic from ops kernel now, and hccl's atomic if for all input
+    // hccl's attr ATOMIC_ATTR_INPUT_INDEX mark on CalcOpRunningParam, can't be get here
+    if (CheckAtomicFromOpsKernel(output_node)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool AtomicAddrCleanPass::CheckSkipInsertInLoopGraph(const NodePtr &node) {
+  OpDescPtr op_desc = node->GetOpDesc();
+  std::map<string, std::map<int, int>> node_workspace_offset;
+  bool has_atomic_input = op_desc->HasAttr(ATOMIC_ATTR_INPUT_INDEX);
+  bool has_atomic_output = op_desc->HasAttr(ATOMIC_ATTR_OUTPUT_INDEX);
+  node_workspace_offset = op_desc->TryGetExtAttr(EXT_ATTR_ATOMIC_WORKSPACE_OFFSET, node_workspace_offset);
+  if (!has_atomic_input && has_atomic_output && node_workspace_offset.empty()) {
+    std::vector<int64_t> atomic_output_index;
+    (void) ge::AttrUtils::GetListInt(op_desc, ATOMIC_ATTR_OUTPUT_INDEX, atomic_output_index);
+    bool is_all_output_peer_also_atomic = true;
+    for (const auto &output_index : atomic_output_index) {
+      if (!IsOutputIndexPeerInputAtomic(node, output_index)) {
+        is_all_output_peer_also_atomic = false;
+        break;
+      }
+    }
+    if (is_all_output_peer_also_atomic) {
+      GELOGI("all out peer node input atomic, skip this out atomic process, node name: %s", node->GetName().c_str());
+      return true;
+    }
+  }
+  return false;
+}
+
 Status AtomicAddrCleanPass::HandleLoopGraph(ComputeGraphPtr &graph, const vector<NodePtr> &atomic_node_vec) {
   // Loop graph , insert clean node follow atomic node
   int index = 0;
   for (const auto &node : atomic_node_vec) {
+    if (CheckSkipInsertInLoopGraph(node)) {
+      continue;
+    }
+
     // Insert atomic clean op
     NodePtr clean_addr_node = InsertAtomicAddrCleanNode(graph);
     if (clean_addr_node == nullptr) {
@@ -249,32 +327,10 @@ bool AtomicAddrCleanPass::IsAtomicOp(const NodePtr &node) {
     return false;
   }
   // 1.Check if isAtomic attrs exist for HCOM
-  std::shared_ptr<GELib> instance_ptr = GELib::GetInstance();
-  if ((instance_ptr == nullptr) || (!instance_ptr->InitFlag())) {
-    GELOGW("GELib not initialized");
-    return false;
+  if (CheckAtomicFromOpsKernel(node)) {
+    return true;
   }
 
-  OpsKernelManager &ops_kernel_manager = instance_ptr->OpsKernelManagerObj();
-  vector<OpInfo> op_info_vec = ops_kernel_manager.GetOpsKernelInfo(op_desc->GetType());
-  for (const auto &op_info : op_info_vec) {
-    if (op_info.isAtomic) {
-      GELOGI("Recognized atomic op %s from DNN_HCCL engine.", op_desc->GetName().c_str());
-      // check peer input is DATA
-      for (auto &in_data_anchor : node->GetAllInDataAnchors()) {
-        if (in_data_anchor->GetPeerOutAnchor() != nullptr &&
-            in_data_anchor->GetPeerOutAnchor()->GetOwnerNode() != nullptr) {
-          auto peer_in_node = in_data_anchor->GetPeerOutAnchor()->GetOwnerNode();
-          if (peer_in_node->GetType() == DATA) {
-            GELOGI("Recognized atomic op %s from DNN_HCCL engine and input is DATA.", op_desc->GetName().c_str());
-            return false;
-          }
-        }
-      }
-      hcom_node_vec_.push_back(node);
-      return true;
-    }
-  }
   // 2.Check atomic attr in node
   std::map<string, std::map<int, int>> node_workspace_offset;
   bool has_atomic_input = op_desc->HasAttr(ATOMIC_ATTR_INPUT_INDEX);
diff --git a/ge/graph/passes/atomic_addr_clean_pass.h b/ge/graph/passes/atomic_addr_clean_pass.h
index ad60b7b5..8138d511 100755
--- a/ge/graph/passes/atomic_addr_clean_pass.h
+++ b/ge/graph/passes/atomic_addr_clean_pass.h
@@ -84,6 +84,11 @@ class AtomicAddrCleanPass : public GraphPass {
   Status HandleDispersedAtomicNodes(ComputeGraphPtr &graph, const std::vector<NodePtr> &atomic_node_vec,
                                     std::vector<NodePtr> &common_atomic_nodes);
 
+  bool CheckAtomicFromOpsKernel(const NodePtr &node);
+
+  bool IsOutputIndexPeerInputAtomic(const NodePtr &node, int64_t output_index);
+
+  bool CheckSkipInsertInLoopGraph(const NodePtr &node);
 
   vector<NodePtr> hcom_node_vec_;
   bool is_loop_graph_ = false;
diff --git a/ge/graph/passes/attach_stream_label_pass.cc b/ge/graph/passes/attach_stream_label_pass.cc
index b04643a4..cd3509c7 100644
--- a/ge/graph/passes/attach_stream_label_pass.cc
+++ b/ge/graph/passes/attach_stream_label_pass.cc
@@ -18,17 +18,15 @@
 #include "ge/ge_api_types.h"
 #include "graph/common/omg_util.h"
 
+using std::string;
+
 namespace ge {
 Status AttachStreamLabelPass::Run(ComputeGraphPtr graph) {
   GELOGD("AttachStreamLabelPass Enter.");
 
   FindNodes(graph);
   for (const auto &node : need_label_nodes_) {
-    OpDescPtr op_desc = node->GetOpDesc();
-    GE_CHECK_NOTNULL(op_desc);
-    if (!op_desc->HasAttr(ATTR_NAME_STREAM_LABEL)) {
-      GE_CHK_STATUS_RET(UpdateCondBranch(node), "Update cond branch failed, start node:%s.", node->GetName().c_str());
-    }
+    GE_CHK_STATUS_RET(UpdateCondBranch(node), "Update cond branch failed, start node:%s.", node->GetName().c_str());
   }
   GE_CHK_STATUS_RET(UpdateEnterNode(), "UpdateEnterNode failed.");
 
@@ -55,13 +53,15 @@ Status AttachStreamLabelPass::ClearStatus() {
 ///
 void AttachStreamLabelPass::FindNodes(const ComputeGraphPtr &graph) {
   for (const NodePtr &node : graph->GetDirectNode()) {
-    const std::string &type = node->GetType();
-    if (type == STREAMSWITCH) {
+    const auto &op_desc = node->GetOpDesc();
+    if (op_desc == nullptr) {
+      continue;
+    }
+    const std::string &type = op_desc->GetType();
+    if ((type == STREAMSWITCH) && op_desc->HasAttr(ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG)) {
       stream_switch_nodes_.emplace_back(node);
-    } else if (type == STREAMMERGE) {
-      if ((node->GetOpDesc() != nullptr) && !node->GetOpDesc()->HasAttr(ATTR_NAME_NEXT_ITERATION)) {
-        need_label_nodes_.emplace_back(node);
-      }
+    } else if ((type == STREAMMERGE) && !op_desc->HasAttr(ATTR_NAME_NEXT_ITERATION)) {
+      need_label_nodes_.emplace_back(node);
     } else if ((type == ENTER) || (type == REFENTER)) {
       enter_nodes_.emplace_back(node);
     }
@@ -83,11 +83,15 @@ void AttachStreamLabelPass::FindNodes(const ComputeGraphPtr &graph) {
 ///
 Status AttachStreamLabelPass::UpdateCondBranch(const NodePtr &node) {
   std::string stream_label;
+  if (AttachFlag(node, stream_label) != SUCCESS) {
+    GELOGE(FAILED, "Attach flag for node %s failed.", node->GetName().c_str());
+    return FAILED;
+  }
+
   std::unordered_set<NodePtr> branch_nodes;
   std::unordered_set<NodePtr> visited;
   std::stack<NodePtr> nodes;
   nodes.push(node);
-
   static const std::set<std::string> end_type_set = {STREAMSWITCH, STREAMMERGE, MERGE};
   while (!nodes.empty()) {
     NodePtr cur_node = nodes.top();
@@ -95,10 +99,6 @@ Status AttachStreamLabelPass::UpdateCondBranch(const NodePtr &node) {
     if (visited.count(cur_node) > 0) {
       continue;
     }
-    if (AttachFlag(cur_node, stream_label) != SUCCESS) {
-      GELOGE(FAILED, "Attach flag for node %s failed.", cur_node->GetName().c_str());
-      return FAILED;
-    }
 
     const std::string &type = cur_node->GetType();
     for (const auto &out_node : cur_node->GetOutAllNodes()) {
@@ -115,10 +115,6 @@ Status AttachStreamLabelPass::UpdateCondBranch(const NodePtr &node) {
     visited.insert(cur_node);
   }
 
-  if (node->GetType() == STREAMSWITCH) {
-    GE_CHK_STATUS_RET(SetActiveLabelList(node, {stream_label}), "set active_label_list failed.");
-  }
-
   for (const NodePtr &tmp_node : branch_nodes) {
     GELOGD("Attach label %s to node: %s.", stream_label.c_str(), tmp_node->GetName().c_str());
     GE_CHK_STATUS_RET(SetStreamLabel(tmp_node, stream_label), "Set stream label failed.");
@@ -148,11 +144,10 @@ Status AttachStreamLabelPass::AttachFlag(const NodePtr &node, std::string &strea
     GE_CHK_BOOL_EXEC(AttrUtils::GetBool(op_desc, ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG, value), return FAILED,
                      "StreamSwitch get attr TRUE_BRANCH_STREAM failed.");
     stream_label += (value ? "_t" : "_f");
+    GE_CHK_STATUS_RET(SetActiveLabelList(node, {stream_label}), "set active_label_list failed.");
   } else if (type == STREAMMERGE) {
     stream_label = node->GetName();
     GE_CHK_STATUS_RET(SetStreamLabel(node, stream_label), "Set stream label failed.");
-  } else if ((type == EXIT) || (type == REFEXIT)) {
-    GE_CHK_STATUS_RET(SetStreamLabel(node, stream_label), "Set stream label failed.");
   }
 
   return SUCCESS;
@@ -166,12 +161,13 @@ Status AttachStreamLabelPass::UpdateEnterNode() {
   std::unordered_map<NodePtr, std::vector<NodePtr>> enter_active_map;
   for (const auto &enter_node : enter_nodes_) {
     for (const auto &out_ctrl_node : enter_node->GetOutControlNodes()) {
-      if (out_ctrl_node->GetType() == STREAMACTIVE) {
-        if (enter_active_map.find(out_ctrl_node) == enter_active_map.end()) {
-          enter_active_map[out_ctrl_node] = {enter_node};
-        } else {
-          enter_active_map[out_ctrl_node].emplace_back(enter_node);
-        }
+      if (out_ctrl_node->GetType() != STREAMACTIVE) {
+        continue;
+      }
+      if (enter_active_map.find(out_ctrl_node) == enter_active_map.end()) {
+        enter_active_map[out_ctrl_node] = {enter_node};
+      } else {
+        enter_active_map[out_ctrl_node].emplace_back(enter_node);
       }
     }
   }
@@ -193,21 +189,10 @@ Status AttachStreamLabelPass::UpdateEnterNode() {
     }
 
     std::stack<NodePtr> enter_nodes;
-    std::string batch_label;
     for (const auto &enter_node : pair.second) {
       enter_nodes.emplace(enter_node);
-      std::string tmp_label;
-      (void)AttrUtils::GetStr(enter_node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, tmp_label);
-      if (!tmp_label.empty()) {
-        if (batch_label.empty()) {
-          batch_label = tmp_label;
-        } else if (batch_label != tmp_label) {
-          GELOGE(FAILED, "multi batch_label exist, label1=%s, label2=%s.", batch_label.c_str(), tmp_label.c_str());
-          return FAILED;
-        }
-      }
     }
-    if (UpdateLoopBranch(enter_nodes, active_label_list[0], batch_label) != SUCCESS) {
+    if (UpdateLoopBranch(enter_nodes, active_label_list[0]) != SUCCESS) {
       GELOGE(FAILED, "Update stream_label for loop_branch failed.");
       return FAILED;
     }
@@ -226,19 +211,14 @@ Status AttachStreamLabelPass::SetEnterLabel(const std::vector<NodePtr> &enter_no
   std::string stream_label;
   GE_CHECK_NOTNULL(active_node);
   (void)AttrUtils::GetStr(active_node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label);
-
   if (stream_label.empty()) {
-    GELOGW("stream_label of enter_active & enter_nodes is empty.");
+    GELOGD("stream_label of enter_active %s is empty.", active_node->GetName().c_str());
     return SUCCESS;
   }
 
   for (const auto &enter_node : enter_nodes) {
-    GE_CHECK_NOTNULL(enter_node->GetOpDesc());
-    if (enter_node->GetOpDesc()->HasAttr(ATTR_NAME_STREAM_LABEL)) {
-      GE_CHK_STATUS_RET(SetStreamLabel(enter_node, stream_label), "Set stream label failed.");
-    }
+    GE_CHK_STATUS_RET(SetStreamLabel(enter_node, stream_label), "Set stream label failed.");
   }
-  GE_CHK_STATUS_RET(SetStreamLabel(active_node, stream_label), "Set stream label failed.");
   return SUCCESS;
 }
 
@@ -249,8 +229,7 @@ Status AttachStreamLabelPass::SetEnterLabel(const std::vector<NodePtr> &enter_no
 /// @param [in] batch_label
 /// @return Status
 ///
-Status AttachStreamLabelPass::UpdateLoopBranch(const std::stack<NodePtr> &enter_nodes, const std::string &stream_label,
-                                               const std::string &batch_label) {
+Status AttachStreamLabelPass::UpdateLoopBranch(const std::stack<NodePtr> &enter_nodes, const string &stream_label) {
   std::stack<NodePtr> nodes(enter_nodes);
   NodePtr cur_node = nullptr;
   while (!nodes.empty()) {
@@ -259,11 +238,6 @@ Status AttachStreamLabelPass::UpdateLoopBranch(const std::stack<NodePtr> &enter_
     for (const NodePtr &out_node : cur_node->GetOutAllNodes()) {
       OpDescPtr out_desc = out_node->GetOpDesc();
       GE_CHECK_NOTNULL(out_desc);
-      std::string tmp_label;
-      (void)AttrUtils::GetStr(out_desc, ATTR_NAME_BATCH_LABEL, tmp_label);
-      if (!tmp_label.empty() && (tmp_label != batch_label)) {
-        continue;
-      }
       std::string out_type = out_desc->GetType();
       bool need_skip =
           out_desc->HasAttr(ATTR_NAME_STREAM_LABEL) || (out_type == ENTER) || (out_type == REFENTER) ||
diff --git a/ge/graph/passes/attach_stream_label_pass.h b/ge/graph/passes/attach_stream_label_pass.h
index 19f11480..ad71d58f 100755
--- a/ge/graph/passes/attach_stream_label_pass.h
+++ b/ge/graph/passes/attach_stream_label_pass.h
@@ -58,11 +58,9 @@ class AttachStreamLabelPass : public GraphPass {
   /// @brief Update stream_label for loop_branch
   /// @param [in] enter_nodes
   /// @param [in] stream_label
-  /// @param [in] batch_label
   /// @return Status
   ///
-  static Status UpdateLoopBranch(const std::stack<NodePtr> &enter_nodes, const std::string &stream_label,
-                                 const std::string &batch_label);
+  static Status UpdateLoopBranch(const std::stack<NodePtr> &enter_nodes, const std::string &stream_label);
 
   ///
   /// @brief Update stream_label start with enter nodes
diff --git a/ge/graph/passes/base_pass.cc b/ge/graph/passes/base_pass.cc
index 68efbeb9..3b854c18 100755
--- a/ge/graph/passes/base_pass.cc
+++ b/ge/graph/passes/base_pass.cc
@@ -96,7 +96,7 @@ Status RunPasses(NodePtr &node, const NamesToPass &names_to_passes, std::unorder
                node->GetName().c_str(), node->GetType().c_str());
         continue;
       }
-      if (node_to_re_pass->IsAllInNodesSeen(nodes_seen)) {
+      if (nodes_seen.count(node_to_re_pass.get()) > 0 || node_to_re_pass->IsAllInNodesSeen(nodes_seen)) {
         GELOGD("The node %s will be re-pass later", node_to_re_pass->GetName().c_str());
         nodes_re_pass.insert(node_to_re_pass);
       } else {
diff --git a/ge/graph/passes/common_subexpression_elimination_pass.cc b/ge/graph/passes/common_subexpression_elimination_pass.cc
index a4662d5d..7d9724fc 100644
--- a/ge/graph/passes/common_subexpression_elimination_pass.cc
+++ b/ge/graph/passes/common_subexpression_elimination_pass.cc
@@ -26,6 +26,10 @@
 
 namespace ge {
 namespace {
+std::set<std::string> un_compute_attrs = {
+    {ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES},
+};
+
 std::string GetCseKey(const NodePtr &node) {
   std::stringstream ss;
   ss << node->GetType() << "-data-inputs-";
@@ -49,7 +53,7 @@ std::string GetCseKey(const NodePtr &node) {
     ss << name << "-";
   }
 
-  ss << "attrs-" << AttrUtils::GetAllAttrsStr(node->GetOpDesc());
+  ss << "attrs-" << AttrUtils::GetAttrsStrAfterRid(node->GetOpDesc(), un_compute_attrs);
 
   return ss.str();
 }
diff --git a/ge/graph/passes/cond_pass.cc b/ge/graph/passes/cond_pass.cc
index a2d77a1b..372af921 100644
--- a/ge/graph/passes/cond_pass.cc
+++ b/ge/graph/passes/cond_pass.cc
@@ -21,7 +21,6 @@
 
 namespace {
   const std::string kStringLength = "StringLength";
-  const size_t kScalarDimNum = 1;
 }
 
 namespace ge {
diff --git a/ge/graph/passes/cond_remove_pass.cc b/ge/graph/passes/cond_remove_pass.cc
index e8d1493f..bf2e1170 100644
--- a/ge/graph/passes/cond_remove_pass.cc
+++ b/ge/graph/passes/cond_remove_pass.cc
@@ -37,6 +37,12 @@ Status CondRemovePass::Run(NodePtr &node) {
   OutDataAnchorPtr cond_out_anchor = nullptr;
   InDataAnchorPtr cond_in_anchor = nullptr;
   Status ret = GetCondInfo(node, graph, cond_out_anchor, cond_in_anchor);
+  if (ret == NOT_CHANGED) {
+    return SUCCESS;
+  } else if (ret != SUCCESS) {
+    GELOGE(FAILED, "Get cond_info for node %s failed.", node->GetName().c_str());
+    return FAILED;
+  }
   int32_t cond_index = 0;
   GELOGD("Handle cond remove for node %s.", node->GetOpDesc()->GetName().c_str());
   bool if_cond_const = CheckIfCondConstInput(cond_out_anchor, cond_in_anchor, cond_index);
@@ -322,11 +328,11 @@ Status CondRemovePass::GetCondInfo(const NodePtr &node, ComputeGraphPtr &graph,
   std::string type = node->GetType();
   if ((kIfOpTypes.count(type) != 0) || (kCaseOpTypes.count(type) != 0)) {
     if (GetCondInfoForIfCase(node, graph, cond_out_anchor, cond_in_anchor) != SUCCESS) {
-      GELOGE(FAILED, "Get cond_info for if node failed.");
+      GELOGE(FAILED, "Get cond_info for if/case node failed.");
       return FAILED;
     }
   } else {
-    GELOGD("no need cond_pass for node %s.", node->GetName().c_str());
+    GELOGD("no need cond_remove_pass for node %s.", node->GetName().c_str());
     return NOT_CHANGED;
   }
 
diff --git a/ge/graph/passes/constant_fuse_same_pass.cc b/ge/graph/passes/constant_fuse_same_pass.cc
index d0970c59..eb8b3470 100644
--- a/ge/graph/passes/constant_fuse_same_pass.cc
+++ b/ge/graph/passes/constant_fuse_same_pass.cc
@@ -19,13 +19,7 @@
 #include <map>
 #include <memory>
 #include <string>
-#include <utility>
 #include <vector>
-
-#include "common/ge/ge_util.h"
-#include "framework/common/debug/ge_log.h"
-#include "framework/common/ge_inner_error_codes.h"
-#include "graph/debug/ge_attr_define.h"
 #include "graph/utils/op_desc_utils.h"
 #include "graph/utils/type_utils.h"
 
@@ -121,11 +115,15 @@ void ConstantFuseSamePass::GetFuseConstNodes(ComputeGraphPtr &graph,
              TypeUtils::DataTypeToSerialString(data_type).c_str());
       continue;
     }
+    if ((type_size != 0) && (weight->MutableData().GetAlignedPtr() == nullptr)) {
+      GELOGW("aligned_ptr is null while size is not 0");
+      continue;
+    }
     ++insert_const_nums;
 
     SameConstKey map_key;
     map_key.data_size = type_size;
-    map_key.data = weight->GetData().GetData();
+    map_key.aligned_ptr = weight->MutableData().GetAlignedPtr();
     map_key.data_type = data_type;
     map_key.format = output_tensor->GetFormat();
     map_key.shape = output_tensor->GetShape().GetDims();
diff --git a/ge/graph/passes/constant_fuse_same_pass.h b/ge/graph/passes/constant_fuse_same_pass.h
index 4935da84..3ff2d6b7 100755
--- a/ge/graph/passes/constant_fuse_same_pass.h
+++ b/ge/graph/passes/constant_fuse_same_pass.h
@@ -21,14 +21,14 @@
 #include <set>
 #include <utility>
 #include <vector>
-
+#include "graph/aligned_ptr.h"
 #include "graph/types.h"
 #include "inc/graph_pass.h"
 
 namespace ge {
 struct SameConstKey {
   int data_size;
-  const uint8_t *data;
+  std::shared_ptr<AlignedPtr> aligned_ptr;
   DataType data_type;
   Format format;
   std::vector<int64_t> shape;
@@ -38,9 +38,11 @@ struct SameConstKey {
     if (data_size != key.data_size) {
       return data_size < key.data_size;
     }
-    int ret = memcmp(data, key.data, data_size);
-    if (ret != 0) {
-      return ret < 0;
+    if (data_size != 0) {
+      int ret = memcmp(aligned_ptr->Get(), key.aligned_ptr->Get(), data_size);
+      if (ret != 0) {
+        return ret < 0;
+      }
     }
     if (data_type != key.data_type) {
       return data_type < key.data_type;
diff --git a/ge/graph/passes/ctrl_edge_transfer_pass.cc b/ge/graph/passes/ctrl_edge_transfer_pass.cc
index f53dc7be..a538a10c 100755
--- a/ge/graph/passes/ctrl_edge_transfer_pass.cc
+++ b/ge/graph/passes/ctrl_edge_transfer_pass.cc
@@ -38,7 +38,6 @@ namespace ge {
  *   \  /
  *    B
  */
-
 Status CtrlEdgeTransferPass::Run(ge::ComputeGraphPtr graph) {
   GELOGD("CtrlEdgeTransferPass start running");
   GE_CHECK_NOTNULL(graph);
diff --git a/ge/graph/passes/data_pass.cc b/ge/graph/passes/data_pass.cc
index 4ec8743e..5bbd2fb1 100644
--- a/ge/graph/passes/data_pass.cc
+++ b/ge/graph/passes/data_pass.cc
@@ -21,6 +21,7 @@
 
 namespace ge {
 namespace {
+const int kDataIndexOffset = 2;
 Status MappingSubgraphInput(const ComputeGraphPtr &graph, const std::function<int(int data_index)> &input) {
   for (const auto &node : graph->GetDirectNode()) {
     if (node->GetType() != DATA) {
@@ -111,7 +112,7 @@ Status ParseSubgraphPostFnWhile(const string &subgraph_name, const ComputeGraphP
 
 Status ParseSubgraphPostFnFor(const string &subgraph_name, const ComputeGraphPtr &graph) {
   return MappingSubgraphIndex(graph,
-      [](int data_index) { return (data_index == 0) ? 0 : data_index + 2; },
+      [](int data_index) { return (data_index == 0) ? 0 : data_index + kDataIndexOffset; },
       [](int retval_index) { return retval_index; });
 }
 
diff --git a/ge/graph/passes/dimension_adjust_pass.cc b/ge/graph/passes/dimension_adjust_pass.cc
index fc5fe69f..5701faf5 100755
--- a/ge/graph/passes/dimension_adjust_pass.cc
+++ b/ge/graph/passes/dimension_adjust_pass.cc
@@ -80,7 +80,71 @@ Status DimensionAdjustPass::Run(ge::NodePtr &node) {
     }
   }
 
+  ret = DealWithInNodes(node);
+  if (ret != SUCCESS) {
+    GELOGE(ret, "DealWithInNodes of %s failed.", node->GetName().c_str());
+    return ret;
+  }
+
   std::vector<int> data_relink_io_map = {kDataInputIndex};
   return IsolateAndDeleteNode(node, data_relink_io_map);
 }
+
+Status DimensionAdjustPass::DealWithInNodes(NodePtr &node) {
+  GE_CHECK_NOTNULL(node);
+  GE_CHECK_NOTNULL(node->GetOpDesc());
+  auto graph = node->GetOwnerComputeGraph();
+  auto in_data_anchors = node->GetAllInDataAnchors();
+  for (auto &in_data_anchor : in_data_anchors) {
+    if (in_data_anchor == nullptr) {
+      continue;
+    }
+    auto in_node_anchor = in_data_anchor->GetPeerOutAnchor();
+    if (in_node_anchor == nullptr) {
+      continue;
+    }
+    auto in_node = in_node_anchor->GetOwnerNode();
+    if (in_node->GetType() == SWITCHN) {
+      auto identity_name = node->GetName() + "_ctrl_identity_" + std::to_string(in_data_anchor->GetIdx());
+      auto identity =
+          AddIdentityNodeToGraph(identity_name, node->GetOpDesc()->GetInputDesc(in_data_anchor->GetIdx()), graph);
+      GE_CHECK_NOTNULL(identity);
+      GELOGI("Create new identity node[%s] after node %s[type: %s] success.", identity->GetName().c_str(),
+             in_node->GetName().c_str(), in_node->GetType().c_str());
+      GE_CHK_STATUS_RET(GraphUtils::AddEdge(in_node_anchor, identity->GetInDataAnchor(0)))
+      GE_CHECK_NOTNULL(identity->GetOutControlAnchor());
+      if (identity->GetOutControlAnchor()->IsLinkedWith(node->GetInControlAnchor())) {
+        continue;
+      }
+      GE_CHK_STATUS_RET(GraphUtils::AddEdge(identity->GetOutControlAnchor(), node->GetInControlAnchor()))
+    }
+  }
+
+  return SUCCESS;
+}
+
+NodePtr DimensionAdjustPass::AddIdentityNodeToGraph(const string &name, const GeTensorDesc &tensor,
+                                                    ComputeGraphPtr &graph) {
+  if (graph == nullptr) {
+    GELOGE(INTERNAL_ERROR, "Comput graph ptr is null in creating identity node.");
+    return nullptr;
+  }
+
+  OpDescPtr desc = MakeShared<OpDesc>("", "");
+  if (desc == nullptr) {
+    GELOGE(MEMALLOC_FAILED, "Failed to create op desc.");
+    return nullptr;
+  }
+
+  desc->SetName(name);
+  desc->SetType(IDENTITY);
+  auto ret = desc->AddInputDesc(tensor);
+  auto ret2 = desc->AddOutputDesc(tensor);
+  if ((ret != GRAPH_SUCCESS) || (ret2 != GRAPH_SUCCESS)) {
+    GELOGE(INTERNAL_ERROR, "Failed to add input/output desc in creating identity.");
+    return nullptr;
+  }
+
+  return graph->AddNodeFront(desc);
+}
 }  // namespace ge
diff --git a/ge/graph/passes/dimension_adjust_pass.h b/ge/graph/passes/dimension_adjust_pass.h
index 685d9694..7766f140 100755
--- a/ge/graph/passes/dimension_adjust_pass.h
+++ b/ge/graph/passes/dimension_adjust_pass.h
@@ -34,6 +34,10 @@ namespace ge {
 class DimensionAdjustPass : public BaseNodePass {
  public:
   Status Run(ge::NodePtr &node) override;
+
+ private:
+  Status DealWithInNodes(ge::NodePtr &node);
+  NodePtr AddIdentityNodeToGraph(const std::string &name, const GeTensorDesc &tensor, ComputeGraphPtr &graph);
 };
 }  // namespace ge
 
diff --git a/ge/graph/passes/dynamic_single_op_reset_shape_pass.cc b/ge/graph/passes/dynamic_single_op_reset_shape_pass.cc
new file mode 100644
index 00000000..6fa63642
--- /dev/null
+++ b/ge/graph/passes/dynamic_single_op_reset_shape_pass.cc
@@ -0,0 +1,155 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "graph/passes/dynamic_single_op_reset_shape_pass.h"
+#include "common/ge_inner_error_codes.h"
+#include "graph/utils/node_utils.h"
+#include "graph/utils/graph_utils.h"
+#include "graph/utils/tensor_utils.h"
+#include "graph/utils/op_desc_utils.h"
+#include "graph/utils/type_utils.h"
+#include "graph/debug/ge_attr_define.h"
+
+namespace ge {
+namespace {
+const int64_t kDynamicShapeDim = -2;
+const char *const kEngineNameAiCpu = "DNN_VM_AICPU_ASCEND";
+const char *const kEngineNameAiCpuTf = "DNN_VM_AICPU";
+}  // namespace
+Status DynamicSingleOpResetShapePass::Run(ComputeGraphPtr graph) {
+  GE_CHECK_NOTNULL(graph);
+
+  std::shared_ptr<GELib> instance = ge::GELib::GetInstance();
+  if (instance == nullptr || !instance->InitFlag()) {
+    GELOGE(ge::GE_CLI_GE_NOT_INITIALIZED, "Run CompileNodesPass failed.");
+    return ge::GE_CLI_GE_NOT_INITIALIZED;
+  }
+
+  // pass if graph has not aicpu node.
+  bool is_not_aicpu = false;
+  if (CheckAllAicpuNodes(graph, is_not_aicpu) != SUCCESS) {
+    GELOGE(ge::GE_CLI_GE_NOT_INITIALIZED, "Check if graph has not aicpu node failed.");
+    return ge::GE_CLI_GE_NOT_INITIALIZED;
+  }
+  if (is_not_aicpu) {
+    GELOGI("The graph [%s] has not aicpu node, whose aicpu nodes would not be reset dynamic shape",
+           graph->GetName().c_str());
+    return SUCCESS;
+  }
+
+  for (const auto &node : graph->GetDirectNode()) {
+    GE_CHECK_NOTNULL(node->GetOpDesc());
+    // pass input and output node
+    if (node->GetType() == DATA || node->GetType() == CONSTANT || node->GetType() == CONSTANTOP ||
+        node->GetType() == NETOUTPUT) {
+      continue;
+    }
+
+    // pass node without attr: ATTR_DYNAMIC_SHAPE_SINGLE_AICPU
+    bool single_aicpu_unknown = false;
+    if (!AttrUtils::GetBool(node->GetOpDesc(), ATTR_DYNAMIC_SHAPE_SINGLE_AICPU, single_aicpu_unknown) ||
+        !single_aicpu_unknown) {
+      continue;
+    }
+
+    // reset aicpu shape to unknown shape
+    auto op_desc = node->GetOpDesc();
+    if (ResetOpShape(op_desc) != SUCCESS) {
+      GELOGE(ge::GE_CLI_GE_NOT_INITIALIZED, "Reset node[%s] dynamic shapr failed.", node->GetName().c_str());
+      return ge::GE_CLI_GE_NOT_INITIALIZED;
+    }
+    GELOGD("Reset dynamic aicpu node [%s] shape success!", node->GetName().c_str());
+  }
+
+  GELOGD("Reset dynamic aicpu nodes shape of graph [%s] success!", graph->GetName().c_str());
+  return SUCCESS;
+}
+
+Status DynamicSingleOpResetShapePass::CheckAllAicpuNodes(const ComputeGraphPtr &graph, bool &is_not_aicpu) {
+  is_not_aicpu = false;
+  for (const auto &node : graph->GetDirectNode()) {
+    GE_CHECK_NOTNULL(node->GetOpDesc());
+    // pass input and output node
+    if (node->GetType() == DATA || node->GetType() == CONSTANT || node->GetType() == CONSTANTOP ||
+        node->GetType() == NETOUTPUT) {
+      continue;
+    }
+
+    // find if there are aicpu nodes.
+    auto op_desc = node->GetOpDesc();
+    string engine_name = op_desc->GetOpEngineName();
+    if (engine_name.empty()) {
+      GELOGE(GRAPH_FAILED, "Get engine failed of node[%s].", node->GetName().c_str());
+      return GRAPH_FAILED;
+    }
+    if (engine_name != kEngineNameAiCpu && engine_name != kEngineNameAiCpuTf) {
+      is_not_aicpu = true;
+      return SUCCESS;
+    }
+  }
+  return SUCCESS;
+}
+
+bool DynamicSingleOpResetShapePass::CheckIfConstInput(const GeTensorDescPtr &input_tensor_desc) {
+  bool is_const = false;
+  (void)AttrUtils::GetBool(input_tensor_desc, CONST_ATTR_NAME_INPUT, is_const);
+  return is_const;
+}
+
+Status DynamicSingleOpResetShapePass::ResetOpShape(OpDescPtr &op_desc) {
+  GE_CHECK_NOTNULL(op_desc);
+  std::vector<int64_t> dynamic_shape_dims = {kDynamicShapeDim};
+  GeShape dynamic_shape(dynamic_shape_dims);
+  (void)ResetInputTensorShape(op_desc, dynamic_shape);
+  (void)ResetOutputTensorShape(op_desc, dynamic_shape);
+  return SUCCESS;
+}
+
+Status DynamicSingleOpResetShapePass::ResetInputTensorShape(OpDescPtr &op_desc,
+                                                            const GeShape &dynamic_shape) {
+  GE_CHECK_NOTNULL(op_desc);
+  for (size_t i = 0; i < op_desc->GetAllInputsDesc().size(); i++) {
+    auto input_desc = op_desc->MutableInputDesc(static_cast<uint32_t>(i));
+    GE_CHECK_NOTNULL(input_desc);
+    // pass scalar input desc
+    auto dims_ori = input_desc->GetShape().GetDims();
+    if (dims_ori.size() == 0) {
+      continue;
+    }
+    // pass const input
+    if (CheckIfConstInput(input_desc)) {
+      continue;
+    }
+    input_desc->SetShape(dynamic_shape);
+  }
+  return SUCCESS;
+}
+
+Status DynamicSingleOpResetShapePass::ResetOutputTensorShape(OpDescPtr &op_desc, const GeShape &dynamic_shape) {
+  GE_CHECK_NOTNULL(op_desc);
+  for (size_t i = 0; i < op_desc->GetAllOutputsDesc().size(); i++) {
+    auto output_desc = op_desc->MutableOutputDesc(static_cast<uint32_t>(i));
+    GE_CHECK_NOTNULL(output_desc);
+    // pass scalar input desc
+    auto output_dims_ori = output_desc->GetShape().GetDims();
+    if (output_dims_ori.size() == 0) {
+      continue;
+    }
+    output_desc->SetShape(dynamic_shape);
+  }
+  return SUCCESS;
+}
+}  // namespace ge
\ No newline at end of file
diff --git a/ge/graph/passes/dynamic_single_op_reset_shape_pass.h b/ge/graph/passes/dynamic_single_op_reset_shape_pass.h
new file mode 100644
index 00000000..765448ff
--- /dev/null
+++ b/ge/graph/passes/dynamic_single_op_reset_shape_pass.h
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GE_GRAPH_PASSES_DYNAMIC_SINGLE_OP_RESET_SHAPE_PASS_H_
+#define GE_GRAPH_PASSES_DYNAMIC_SINGLE_OP_RESET_SHAPE_PASS_H_
+#include "graph/graph.h"
+#include "inc/graph_pass.h"
+#include "init/gelib.h"
+
+namespace ge {
+class DynamicSingleOpResetShapePass : public GraphPass {
+ public:
+  Status Run(ComputeGraphPtr graph) override;
+
+ private:
+  Status ResetOpShape(OpDescPtr &op_desc);
+  Status ResetInputTensorShape(OpDescPtr &op_desc, const GeShape &dynamic_shape);
+  Status ResetOutputTensorShape(OpDescPtr &op_desc, const GeShape &dynamic_shape);
+  Status CheckAllAicpuNodes(const ComputeGraphPtr &graph, bool &is_not_aicpu);
+  bool CheckIfConstInput(const GeTensorDescPtr &input_tensor_desc);
+};
+}  // namespace ge
+#endif  // GE_GRAPH_PASSES_DYNAMIC_SINGLE_OP_RESET_SHAPE_PASS_H_
diff --git a/ge/graph/passes/enter_pass.cc b/ge/graph/passes/enter_pass.cc
index 206d271c..066c97cf 100644
--- a/ge/graph/passes/enter_pass.cc
+++ b/ge/graph/passes/enter_pass.cc
@@ -16,12 +16,14 @@
 
 #include "graph/passes/enter_pass.h"
 
+#include "graph/debug/ge_attr_define.h"
 #include "framework/common/debug/ge_log.h"
 #include "framework/common/debug/log.h"
 #include "graph/utils/graph_utils.h"
 
 namespace {
 const size_t kOutNodesNum = 1;
+const size_t kInCtrlNodesNum = 1;
 }
 
 namespace ge {
@@ -54,6 +56,7 @@ Status EnterPass::Run(NodePtr &node) {
       if (out_ctrl_node == nullptr) {
         continue;
       }
+      GELOGI("Remove control edge from %s to %s.", node->GetName().c_str(), out_ctrl_node->GetName().c_str());
       if (GraphUtils::RemoveEdge(node->GetOutControlAnchor(), out_ctrl_node->GetInControlAnchor()) != GRAPH_SUCCESS) {
         GELOGE(FAILED, "Remove Enter ctrl output fail, %s->%s", node->GetName().c_str(),
                out_ctrl_node->GetName().c_str());
@@ -61,8 +64,12 @@ Status EnterPass::Run(NodePtr &node) {
       }
     }
   } else {
-    if (OptimizeEnter(node, in_node) != SUCCESS) {
-      GELOGE(FAILED, "Optimize enter node[%s] failed.", node->GetName().c_str());
+    if (OptimizeEnterWithOnlyDataOut(node, in_node) != SUCCESS) {
+      GELOGE(FAILED, "Optimize enter node[%s] with only out data node failed.", node->GetName().c_str());
+      return FAILED;
+    }
+    if (UnlinkCtrlEdgeBeforeConst(node) != SUCCESS) {
+      GELOGE(FAILED, "Unlink control edge before const of node[%s]'s out nodes failed.", node->GetName().c_str());
       return FAILED;
     }
   }
@@ -71,36 +78,72 @@ Status EnterPass::Run(NodePtr &node) {
   return SUCCESS;
 }
 
-Status EnterPass::OptimizeEnter(NodePtr &node, NodePtr &in_node) {
-  auto out_nodes_of_in_node = in_node->GetOutAllNodes();
-  if (out_nodes_of_in_node.size() != kOutNodesNum) {
+Status EnterPass::OptimizeEnterWithOnlyDataOut(NodePtr &node, NodePtr &in_node) {
+  if ((in_node->GetOutAllNodes().size() != kOutNodesNum) || !node->GetOutControlNodes().empty()) {
     return SUCCESS;
   }
-
-  if (!node->GetOutControlNodes().empty()) {
+  bool is_constant_flag = true;
+  (void)AttrUtils::GetBool(node->GetOpDesc(), ENTER_ATTR_CONSTANT_FLAG, is_constant_flag);
+  if (!is_constant_flag) {
     return SUCCESS;
   }
 
-  for (const auto &out_node : node->GetOutDataNodes()) {
-    GE_CHECK_NOTNULL(out_node);
-    if (out_node->GetType() == MERGE) {
-      return SUCCESS;
-    }
-  }
-
   GE_CHECK_NOTNULL(in_node->GetOutDataAnchor(0));
-  GE_CHK_STATUS_RET(in_node->GetOutDataAnchor(0)->Unlink(node->GetInDataAnchor(0)));
-  auto out_data_anchor = node->GetOutDataAnchor(0);
+  GE_CHK_STATUS_RET(in_node->GetOutDataAnchor(0)->Unlink(node->GetInDataAnchor(0)))
+  const auto &out_data_anchor = node->GetOutDataAnchor(0);
   GE_CHECK_NOTNULL(out_data_anchor);
-  for (auto peer_in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) {
-    GE_CHK_STATUS_RET(out_data_anchor->Unlink(peer_in_data_anchor));
-    GE_CHK_STATUS_RET(in_node->GetOutDataAnchor(0)->LinkTo(peer_in_data_anchor));
+  for (const auto &peer_in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) {
+    GE_CHK_STATUS_RET(out_data_anchor->Unlink(peer_in_data_anchor))
+    GE_CHK_STATUS_RET(in_node->GetOutDataAnchor(0)->LinkTo(peer_in_data_anchor))
   }
-
-  auto graph = node->GetOwnerComputeGraph();
-  GE_CHK_STATUS_RET(GraphUtils::RemoveNodeWithoutRelink(graph, node))
+  GE_CHK_STATUS_RET(GraphUtils::RemoveNodeWithoutRelink(node->GetOwnerComputeGraph(), node))
+  AddNodeDeleted(node);
   AddRePassNodesWithInOut(in_node);
 
   return SUCCESS;
 }
+
+Status EnterPass::UnlinkCtrlEdgeBeforeConst(NodePtr &node) {
+  auto out_ctrl_nodes = node->GetOutControlNodes();
+  if (out_ctrl_nodes.empty()) {
+    return SUCCESS;
+  }
+  auto out_ctrl_anchor = node->GetOutControlAnchor();
+  GE_CHECK_NOTNULL(out_ctrl_anchor);
+
+  for (auto &out_ctrl_node : out_ctrl_nodes) {
+    GE_CHECK_NOTNULL(out_ctrl_node);
+    if ((out_ctrl_node->GetType() != CONSTANT) && (out_ctrl_node->GetType() != CONSTANTOP)) {
+      continue;
+    }
+    auto in_ctrl_nodes = out_ctrl_node->GetInControlNodes();
+    if (in_ctrl_nodes.size() != kInCtrlNodesNum) {
+      continue;
+    }
+
+    // Skip when has merge out
+    bool has_merge_out = false;
+    auto out_nodes_of_const = out_ctrl_node->GetOutAllNodes();
+    for (const auto &out_node_of_const : out_nodes_of_const) {
+      GE_CHECK_NOTNULL(out_node_of_const);
+      if (out_node_of_const->GetType() == MERGE || out_node_of_const->GetType() == REFMERGE) {
+        has_merge_out = true;
+        break;
+      }
+    }
+    if (has_merge_out) {
+      continue;
+    }
+
+    GELOGI("Unlink control edge from %s to %s.", node->GetName().c_str(), out_ctrl_node->GetName().c_str());
+    GE_CHK_STATUS_RET(out_ctrl_anchor->Unlink(out_ctrl_node->GetInControlAnchor()))
+    for (auto &out_node_of_const : out_nodes_of_const) {
+      if (!out_ctrl_anchor->IsLinkedWith(out_node_of_const->GetInControlAnchor())) {
+        GELOGI("Link control edge from %s to %s.", node->GetName().c_str(), out_node_of_const->GetName().c_str());
+        GE_CHK_STATUS_RET(out_ctrl_anchor->LinkTo(out_node_of_const->GetInControlAnchor()))
+      }
+    }
+  }
+  return SUCCESS;
+}
 }  // namespace ge
diff --git a/ge/graph/passes/enter_pass.h b/ge/graph/passes/enter_pass.h
index 677516ff..1417b1f0 100644
--- a/ge/graph/passes/enter_pass.h
+++ b/ge/graph/passes/enter_pass.h
@@ -25,7 +25,8 @@ class EnterPass : public BaseNodePass {
   Status Run(NodePtr &node) override;
 
  private:
-  Status OptimizeEnter(NodePtr &node, NodePtr &in_node);
+  Status OptimizeEnterWithOnlyDataOut(NodePtr &node, NodePtr &in_node);
+  Status UnlinkCtrlEdgeBeforeConst(NodePtr &node);
 };
 }  // namespace ge
 #endif  // GE_GRAPH_PASSES_ENTER_PASS_H_
diff --git a/ge/graph/passes/folding_pass.cc b/ge/graph/passes/folding_pass.cc
index 93dc2c40..227a0f61 100755
--- a/ge/graph/passes/folding_pass.cc
+++ b/ge/graph/passes/folding_pass.cc
@@ -173,10 +173,7 @@ Status FoldingPass::DealWithInNodes(NodePtr &node) {
       continue;
     }
     auto in_node = in_node_anchor->GetOwnerNode();
-    if (in_node == nullptr) {
-      continue;
-    }
-    if ((in_node->GetType() == SWITCH) || (in_node->GetType() == REFSWITCH)) {
+    if ((in_node->GetType() == SWITCH) || (in_node->GetType() == REFSWITCH) || (in_node->GetType() == SWITCHN)) {
       GELOGI("The in_node name is %s, and node type is %s.", in_node->GetName().c_str(), in_node->GetType().c_str());
       auto ret = in_node_anchor->Unlink(in_data_anchor);
       if (ret != SUCCESS) {
diff --git a/ge/graph/passes/for_pass.cc b/ge/graph/passes/for_pass.cc
index f3caea35..31dee390 100644
--- a/ge/graph/passes/for_pass.cc
+++ b/ge/graph/passes/for_pass.cc
@@ -37,6 +37,7 @@ namespace {
   const uint32_t kSubgraphLoopVarInputIndex = 0;
   const uint32_t kSubgraphInputIndex = 1;
   const uint32_t kWhileOutputIndex = 5;
+  const size_t kIDiffValue = 2;
   const std::string kAbs = "Abs";
 }
 
@@ -137,7 +138,7 @@ Status ForPass::BuildForInfo(const ComputeGraphPtr &root_graph, const NodePtr &n
   for_info.ctrl_inputs = std::move(ctrl_inputs);
   for_info.ctrl_outputs = std::move(ctrl_outputs);
 
-  GELOGI("Build for_info for node %s succ.", node->GetName().c_str());
+  GELOGI("Build for_info for node %s success.", node->GetName().c_str());
   return SUCCESS;
 }
 
@@ -159,13 +160,7 @@ OutDataAnchorPtr ForPass::FindInputWithIndex(const NodePtr &node, uint32_t index
     return nullptr;
   }
 
-  OutDataAnchorPtr peer_out_anchor = in_data_anchor->GetPeerOutAnchor();
-  if (peer_out_anchor == nullptr) {
-    GELOGE(FAILED, "FindInputWithIndex %s:%u failed: peer_out_anchor is NULL.", node->GetName().c_str(), index);
-    return nullptr;
-  }
-
-  return peer_out_anchor;
+  return in_data_anchor->GetPeerOutAnchor();
 }
 
 ///
@@ -186,20 +181,13 @@ Status ForPass::FindInputsAndOutputs(const NodePtr &node, std::vector<OutDataAnc
   uint32_t input_data_num = node->GetAllInDataAnchorsSize();
   for (uint32_t index = FOR_DATA_INPUT; index < input_data_num; index++) {
     InDataAnchorPtr in_data_anchor = node->GetInDataAnchor(index);
-    if (in_data_anchor == nullptr) {
-      GELOGE(FAILED, "FindInputWithIndex %s:%u failed: in_data_anchor is NULL.", node->GetName().c_str(), index);
-      return FAILED;
-    }
-    GE_IF_BOOL_EXEC(in_data_anchor->GetPeerOutAnchor() == nullptr,
-                    GELOGW("Get null input by index %d from node %s ",
-                           in_data_anchor->GetIdx(), node->GetName().c_str());
-                    continue);
+    GE_CHECK_NOTNULL(in_data_anchor);
     data_inputs.emplace_back(in_data_anchor->GetPeerOutAnchor());
   }
 
-  for (auto &out_data_anchor : node->GetAllOutDataAnchors()) {
+  for (const auto &out_data_anchor : node->GetAllOutDataAnchors()) {
     std::vector<ge::InDataAnchorPtr> peer_in_data_anchors;
-    for (auto &peer_in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) {
+    for (const auto &peer_in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) {
       peer_in_data_anchors.emplace_back(peer_in_data_anchor);
     }
     data_outputs.emplace_back(peer_in_data_anchors);
@@ -207,13 +195,13 @@ Status ForPass::FindInputsAndOutputs(const NodePtr &node, std::vector<OutDataAnc
 
   InControlAnchorPtr in_ctrl_anchor = node->GetInControlAnchor();
   GE_CHECK_NOTNULL(in_ctrl_anchor);
-  for (auto &peer_out_ctrl_anchor : in_ctrl_anchor->GetPeerOutControlAnchors()) {
+  for (const auto &peer_out_ctrl_anchor : in_ctrl_anchor->GetPeerOutControlAnchors()) {
     ctrl_inputs.emplace_back(peer_out_ctrl_anchor);
   }
 
   OutControlAnchorPtr out_ctrl_anchor = node->GetOutControlAnchor();
   GE_CHECK_NOTNULL(out_ctrl_anchor);
-  for (auto &peer_in_ctrl_anchor : out_ctrl_anchor->GetPeerInControlAnchors()) {
+  for (const auto &peer_in_ctrl_anchor : out_ctrl_anchor->GetPeerInControlAnchors()) {
     ctrl_outputs.emplace_back(peer_in_ctrl_anchor);
   }
 
@@ -707,7 +695,7 @@ Status ForPass::UpdateForBodyInputMapping(const WhileInfo &while_info) {
     } else if ((i == FOR_LIMIT_INPUT) || (i == FOR_DELTA_INPUT)) {
       continue;
     } else {
-      input_mapping[i] = i - 2;
+      input_mapping[i] = i - kIDiffValue;
     }
   }
   for_body->UpdateInputMapping(input_mapping);
diff --git a/ge/graph/passes/fuse_data_nodes_with_common_input_pass.cc b/ge/graph/passes/fuse_data_nodes_with_common_input_pass.cc
new file mode 100644
index 00000000..ab8fc39b
--- /dev/null
+++ b/ge/graph/passes/fuse_data_nodes_with_common_input_pass.cc
@@ -0,0 +1,119 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "graph/passes/fuse_data_nodes_with_common_input_pass.h"
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include <set>
+#include "common/ge_inner_error_codes.h"
+#include "graph/utils/op_desc_utils.h"
+#include "graph/utils/type_utils.h"
+#include "graph/utils/node_utils.h"
+
+using std::map;
+using std::vector;
+using std::set;
+using std::string;
+
+namespace ge {
+Status FuseDataNodesWithCommonInputPass::Run(ge::ComputeGraphPtr graph) {
+  if (graph == nullptr) {
+    GELOGE(GE_GRAPH_PARAM_NULLPTR, "Compute graph is null.");
+    return GE_GRAPH_PARAM_NULLPTR;
+  }
+  GELOGD("FuseDataNodesWithCommonInputPass in.");
+  // key: subgraph, value:--key: peer out anchor to parent node, --value: parent indexes to parent node
+  map<ComputeGraphPtr, map<OutDataAnchorPtr, set<uint32_t>>> subgraphs_to_need_fuse_nodes_info;
+  if (InitNeedFuseNodesInfo(graph, subgraphs_to_need_fuse_nodes_info) != SUCCESS) {
+    GELOGE(FAILED, "InitNeedFuseNodesInfo failed.");
+    return FAILED;
+  }
+  return FuseDataNodes(subgraphs_to_need_fuse_nodes_info);
+}
+
+Status FuseDataNodesWithCommonInputPass::InitNeedFuseNodesInfo(ComputeGraphPtr &graph,
+    map<ComputeGraphPtr, map<OutDataAnchorPtr, set<uint32_t>>> &subgraphs_to_need_fuse_nodes_info) {
+  for (const auto &subgraph : graph->GetAllSubgraphs()) {
+    GE_CHECK_NOTNULL(subgraph);
+    auto parent_node = subgraph->GetParentNode();
+    GE_CHECK_NOTNULL(parent_node);
+    if (parent_node->GetType() == CASE || parent_node->GetType() == IF) {
+      auto &peer_out_anchors_to_parent_indexes = subgraphs_to_need_fuse_nodes_info[subgraph];
+      for (const auto &in_data_anchor : parent_node->GetAllInDataAnchors()) {
+        GE_CHECK_NOTNULL(in_data_anchor);
+        OutDataAnchorPtr peer_out_anchor = in_data_anchor->GetPeerOutAnchor();
+        uint32_t parent_index = static_cast<uint32_t>(in_data_anchor->GetIdx());
+        GE_CHECK_NOTNULL(peer_out_anchor);
+        peer_out_anchors_to_parent_indexes[peer_out_anchor].insert(parent_index);
+        GELOGD("Peer node %s is the %d input of parent node %s in %s.",
+               peer_out_anchor->GetOwnerNode()->GetName().c_str(), parent_index, parent_node->GetName().c_str(),
+               subgraph->GetName().c_str());
+      }
+    }
+  }
+  return SUCCESS;
+}
+
+Status FuseDataNodesWithCommonInputPass::FuseDataNodes(
+    const map<ComputeGraphPtr, map<OutDataAnchorPtr, set<uint32_t>>> &subgraphs_to_need_fuse_nodes_info) {
+  for (const auto &subgraph_to_need_fuse_nodes_info : subgraphs_to_need_fuse_nodes_info) {
+    auto subgraph = subgraph_to_need_fuse_nodes_info.first;
+    for (const auto &peer_out_anchors_to_parent_indexes : subgraph_to_need_fuse_nodes_info.second) {
+      if (peer_out_anchors_to_parent_indexes.second.size() <= 1) {
+        continue;
+      }
+      // key: out anchor, value: data nodes with common input will be fused
+      map<OutDataAnchorPtr, vector<NodePtr>> peer_out_anchors_to_need_fuse_nodes;
+      for (const auto &node : subgraph->GetDirectNode()) {
+        if (node->GetType() != DATA) {
+          continue;
+        }
+        GE_CHECK_NOTNULL(node->GetOpDesc());
+        uint32_t parent_index = 0;
+        if (AttrUtils::GetInt(node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) {
+          if (peer_out_anchors_to_parent_indexes.second.count(parent_index) > 0) {
+            peer_out_anchors_to_need_fuse_nodes[peer_out_anchors_to_parent_indexes.first].emplace_back(node);
+          }
+        }
+      }
+      for (const auto &peer_out_anchor_to_need_fuse_nodes : peer_out_anchors_to_need_fuse_nodes) {
+        auto need_fuse_data_nodes = peer_out_anchor_to_need_fuse_nodes.second;
+        auto first_node = need_fuse_data_nodes.at(0);
+        for (size_t i = 1; i < need_fuse_data_nodes.size(); ++i) {
+          auto node = need_fuse_data_nodes.at(i);
+          GELOGI("Replace redundant data node %s by %s exist in graph: %s.", node->GetName().c_str(),
+                 first_node->GetName().c_str(), subgraph->GetName().c_str());
+          // the data node which can be fused has none input(both data and control in)
+          if (GraphUtils::MoveOutCtrlEdges(node, first_node) != SUCCESS) {
+            return FAILED;
+          }
+          if (GraphUtils::ReplaceNodeDataAnchors(first_node, node, {}, {0}) != SUCCESS) {
+            return FAILED;
+          }
+          if (GraphUtils::RemoveNodeWithoutRelink(subgraph, node) != SUCCESS) {
+            GELOGE(FAILED, "[%s] RemoveNodeWithoutRelink failed.", node->GetName().c_str());
+            return FAILED;
+          }
+        }
+      }
+    }
+  }
+  return SUCCESS;
+}
+}  // namespace ge
diff --git a/ge/graph/passes/fuse_data_nodes_with_common_input_pass.h b/ge/graph/passes/fuse_data_nodes_with_common_input_pass.h
new file mode 100755
index 00000000..9ff6ab89
--- /dev/null
+++ b/ge/graph/passes/fuse_data_nodes_with_common_input_pass.h
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GE_GRAPH_PASSES_FUSE_DATA_NODES_WITH_COMMON_INPUT_PASS_H_
+#define GE_GRAPH_PASSES_FUSE_DATA_NODES_WITH_COMMON_INPUT_PASS_H_
+
+#include <set>
+#include <map>
+#include <vector>
+#include "graph/types.h"
+#include "inc/graph_pass.h"
+
+namespace ge {
+class FuseDataNodesWithCommonInputPass : public GraphPass {
+ public:
+  Status Run(ge::ComputeGraphPtr graph) override;
+
+ private:
+  Status InitNeedFuseNodesInfo(ComputeGraphPtr &graph,
+      map<ComputeGraphPtr, map<OutDataAnchorPtr, std::set<uint32_t>>> &subgraphs_to_need_fuse_nodes_info);
+  Status FuseDataNodes(
+      const map<ComputeGraphPtr, map<OutDataAnchorPtr, std::set<uint32_t>>> &subgraphs_to_need_fuse_nodes_info);
+};
+} // namespace ge
+#endif // GE_GRAPH_PASSES_FUSE_DATA_NODES_WITH_COMMON_INPUT_PASS_H_
diff --git a/ge/graph/passes/inplace_support_check_pass.cc b/ge/graph/passes/inplace_support_check_pass.cc
new file mode 100644
index 00000000..44ad8361
--- /dev/null
+++ b/ge/graph/passes/inplace_support_check_pass.cc
@@ -0,0 +1,82 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "graph/passes/inplace_support_check_pass.h"
+#include "framework/common/debug/log.h"
+#include "graph/utils/graph_utils.h"
+#include "graph/debug/ge_attr_define.h"
+
+namespace ge {
+namespace {
+constexpr uint32_t kInplaceSupportOutputIndex = 0;
+constexpr uint32_t kInplaceSupportOutputNum = 1;
+static const std::set<std::string> kSrcNodeTypes = { ge::DATA, ge::ANN_DATA, ge::AIPPDATA,
+                                                     ge::CONSTANT, ge::CONSTANTOP,
+                                                     ge::VARIABLE, ge::VARIABLEV2 };
+}
+Status InplaceSupportCheckPass::Run(NodePtr &node) {
+  GELOGD("InplaceSupportCheckPass running");
+  if (node->GetAllOutDataAnchorsSize() != kInplaceSupportOutputNum) {
+    GELOGD("output num of node %s is not %u, skip InplaceSupportCheckPass",
+           node->GetName().c_str(), kInplaceSupportOutputNum);
+    return SUCCESS;
+  }
+  GE_CHECK_NOTNULL(node->GetOpDesc());
+  const DataType &output_type = node->GetOpDesc()->GetOutputDesc(kInplaceSupportOutputIndex).GetDataType();
+  const GeShape &output_shape = node->GetOpDesc()->GetOutputDesc(kInplaceSupportOutputIndex).GetShape();
+  GELOGD("process InplaceSupportCheckPass on node %s", node->GetName().c_str());
+  for (const auto &in_data_anchor : node->GetAllInDataAnchors()) {
+    const auto &peer_data_anchor = in_data_anchor->GetPeerOutAnchor();
+    if (peer_data_anchor == nullptr) {
+      continue;
+    }
+    auto in_node = peer_data_anchor->GetOwnerNode();
+    if (kSrcNodeTypes.count(in_node->GetType()) > 0) {
+      GELOGD("meet src_node %s", in_node->GetName().c_str());
+      continue;
+    }
+    if (peer_data_anchor->GetPeerInDataNodesSize() != kInplaceSupportOutputNum) {
+      GELOGD("peer_data_anchor links with multi in_data_anchors");
+      continue;
+    }
+
+    int32_t inplace_input_idx = in_data_anchor->GetIdx();
+    const DataType &input_type = node->GetOpDesc()->GetInputDesc(inplace_input_idx).GetDataType();
+    const GeShape &input_shape = node->GetOpDesc()->GetInputDesc(inplace_input_idx).GetShape();
+    if (input_type !=  output_type) {
+      GELOGW("DataType mismatch, in_idx=%d, input_type=%u, output_type=%u", inplace_input_idx, input_type, output_type);
+      continue;
+    }
+    if (input_shape.GetDims() != output_shape.GetDims()) {
+      GELOGW("Shape mismatch, in_idx=%d, input_shape=[%s], output_shape=[%s]",
+             inplace_input_idx, input_shape.ToString().c_str(), output_shape.ToString().c_str());
+      continue;
+    }
+
+    GELOGD("add attr INPLACE_SUPPORT_INPUT_INDEX on node %s, input_idx=%d", node->GetName().c_str(), inplace_input_idx);
+    if (!AttrUtils::SetInt(node->GetOpDesc()->MutableOutputDesc(kInplaceSupportOutputIndex),
+                           INPLACE_SUPPORT_INPUT_INDEX, inplace_input_idx)) {
+      GELOGE(FAILED, "Set attr INPLACE_SUPPORT_INPUT_INDEX on node %s failed.", node->GetName().c_str());
+      return FAILED;
+    }
+    AddRePassNode(node);
+    break;
+  }
+
+  GELOGD("InplaceSupportCheckPass success");
+  return SUCCESS;
+}
+}  // namespace ge
diff --git a/ge/graph/passes/inplace_support_check_pass.h b/ge/graph/passes/inplace_support_check_pass.h
new file mode 100644
index 00000000..be2d6c75
--- /dev/null
+++ b/ge/graph/passes/inplace_support_check_pass.h
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GE_GRAPH_PASSES_INPLACE_SUPPORT_CHECK_PASS_H_
+#define GE_GRAPH_PASSES_INPLACE_SUPPORT_CHECK_PASS_H_
+
+#include "graph/passes/base_pass.h"
+
+namespace ge {
+class InplaceSupportCheckPass : public BaseNodePass {
+ public:
+  Status Run(NodePtr &node) override;
+};
+}  // namespace ge
+#endif  // GE_GRAPH_PASSES_INPLACE_SUPPORT_CHECK_PASS_H_
diff --git a/ge/graph/passes/mark_agnostic_pass.cc b/ge/graph/passes/mark_agnostic_pass.cc
index 8c9a0451..30fa1742 100644
--- a/ge/graph/passes/mark_agnostic_pass.cc
+++ b/ge/graph/passes/mark_agnostic_pass.cc
@@ -19,6 +19,8 @@
 #include "graph/utils/tensor_utils.h"
 
 namespace ge {
+const size_t kTwoInputNodesSize = 2;
+
 Status MarkAgnosticPass::Run(ComputeGraphPtr graph) {
   for (const auto &node : graph->GetDirectNode()) {
     auto node_type = NodeUtils::GetNodeType(*node);
@@ -52,7 +54,7 @@ Status MarkAgnosticPass::Run(ComputeGraphPtr graph) {
       /// Enter-----------+
       ///                 +-> Merge
       /// NextIteration---+
-      if (input_nodes.size() == 2) {
+      if (input_nodes.size() == kTwoInputNodesSize) {
         if (input_nodes.at(0)->GetType() == ENTER && input_nodes.at(1)->GetType() == NEXTITERATION) {
           continue;
         }
diff --git a/ge/graph/passes/merge_pass.cc b/ge/graph/passes/merge_pass.cc
index d2340037..26d82820 100644
--- a/ge/graph/passes/merge_pass.cc
+++ b/ge/graph/passes/merge_pass.cc
@@ -21,18 +21,16 @@
 #include <vector>
 
 #include "framework/common/debug/ge_log.h"
-#include "common/ge_inner_error_codes.h"
 #include "common/ge/ge_util.h"
 #include "graph/common/omg_util.h"
 #include "graph/debug/ge_attr_define.h"
 #include "graph/utils/graph_utils.h"
 #include "graph/passes/pass_utils.h"
 
-using domi::PARAM_INVALID;
-using domi::SUCCESS;
-
 namespace ge {
 const int kValueIndexOutputIndex = 1;
+const size_t kCaseNoInput = 0;
+const size_t kCaseOneInput = 1;
 
 Status MergePass::Run(NodePtr &node) {
   GELOGD("MergePass running");
@@ -47,15 +45,14 @@ Status MergePass::Run(NodePtr &node) {
     return SUCCESS;
   }
 
-  auto out_data_anchors = node->GetAllOutDataAnchors();
-  if (out_data_anchors.empty()) {
+  if (node->GetAllOutDataAnchors().empty()) {
     GELOGE(PARAM_INVALID, "[%s] Merge node output anchor is empty", node->GetName().c_str());
     return PARAM_INVALID;
   }
 
-  auto in_data_nodes = node->GetInDataNodes();
+  const auto &in_data_nodes = node->GetInDataNodes();
   switch (in_data_nodes.size()) {
-    case 0: {
+    case kCaseNoInput: {
       /// Case A: input_count = 0, the output of merge node is inactive as well
       /// In which case the output branch can be removed
       /// until another merge node is met
@@ -70,7 +67,7 @@ Status MergePass::Run(NodePtr &node) {
       }
       return ret;
     }
-    case 1: {  // Case B: input_count = 1, the merge node can be optimized out
+    case kCaseOneInput: {  // Case B: input_count = 1, the merge node can be optimized out
       std::vector<int> merge_io_map = {PassUtils::GetUniqueInDataAnchorIndex(node), -1};
       if (merge_io_map[0] != -1 && IsNeedChangeIndexToConstant(node)) {
         int index = merge_io_map[0];
diff --git a/ge/graph/passes/merge_to_stream_merge_pass.cc b/ge/graph/passes/merge_to_stream_merge_pass.cc
index 103fbb1b..c1a57a61 100644
--- a/ge/graph/passes/merge_to_stream_merge_pass.cc
+++ b/ge/graph/passes/merge_to_stream_merge_pass.cc
@@ -89,16 +89,6 @@ Status MergeToStreamMergePass::ReplaceMergeNode(const ComputeGraphPtr &graph, co
     GE_CHK_STATUS_RET(SetNextIteration(stream_merge, next_iteration_name), "Set next iteration failed");
   }
 
-  if (merge_op_desc->HasAttr(ATTR_NAME_BATCH_LABEL)) {
-    string batch_label;
-    (void)AttrUtils::GetStr(merge_op_desc, ATTR_NAME_BATCH_LABEL, batch_label);
-    if (!batch_label.empty()) {
-      auto stream_merge_desc = stream_merge->GetOpDesc();
-      GE_CHECK_NOTNULL(stream_merge_desc);
-      (void)AttrUtils::SetStr(stream_merge_desc, ATTR_NAME_BATCH_LABEL, batch_label);
-    }
-  }
-
   return AddActiveNodes(graph, stream_merge);
 }
 
diff --git a/ge/graph/passes/multi_batch_clone_pass.cc b/ge/graph/passes/multi_batch_clone_pass.cc
index 87d9749a..b7efa070 100755
--- a/ge/graph/passes/multi_batch_clone_pass.cc
+++ b/ge/graph/passes/multi_batch_clone_pass.cc
@@ -22,32 +22,68 @@
 #include "graph/preprocess/multi_batch_options.h"
 #include "graph/utils/node_utils.h"
 #include "graph/utils/op_desc_utils.h"
+#include "graph/utils/tensor_utils.h"
+#include "graph/utils/type_utils.h"
 #include "register/op_registry.h"
+#include "graph/common/omg_util.h"
 
 namespace ge {
 namespace {
 constexpr uint8_t kDataInIndex = 0;
 constexpr uint8_t kDataOutIndex = 0;
 constexpr uint8_t kCaseArgIndex = 1;
+const int kDivisionConst = 2;
+const size_t kNumOfGetnextNode = 1;
 
 const std::string kMultiBatchCaseNode = "ascend_mbatch_shape_case";
 const std::string kMultiBatchDataNode = "ascend_mbatch_shape_data";
+const std::string kMultiBatchGetDynamicDimsNode = "ascend_mbatch_get_dynamic_dims_node";
 const std::string kMultiBatchConstNode = "ascend_mbatch_shape_const";
 const std::string kMultiBatchMapIndexNode = "ascend_mbatch_shape_mapindex";
 const std::string kMultiBatchNodePostfix = "_ascend_mbatch_batch_";
+const char *const kGetNextName = "IteratorV2";
 }  // namespace
 
+inline bool IsGetNextType(const NodePtr &node) {
+  std::string original_type;
+  GE_IF_BOOL_EXEC(GetOriginalType(node, original_type) != SUCCESS,
+                  GELOGW("Get original type failed."); return false);
+  return (original_type == kGetNextName);
+}
+
 Status MultiBatchClonePass::Run(ComputeGraphPtr graph) {
+  GE_IF_BOOL_EXEC(graph == nullptr, GELOGE(FAILED, "Original graph is nullptr"); return FAILED);
   if (graph->GetParentGraph() != nullptr) {
     GELOGD("Subgraph %s skip the MultiBatchClonePass", graph->GetName().c_str());
     return SUCCESS;
   }
-
+  if (!GetLocalOmgContext().need_multi_batch) {
+    GELOGI("No need to process_multi for no_train graph.");
+    return SUCCESS;
+  }
+  std::vector<NodePtr> data_nodes;
+  std::vector<NodePtr> getnext_nosink_nodes;
+  std::vector<NodePtr> getnext_sink_nodes;
+  if (multibatch::CheckSequenceOfOptions(graph, data_nodes, getnext_nosink_nodes, getnext_sink_nodes) != SUCCESS) {
+    GELOGE(PARAM_INVALID, "[Train_Dynamic] CheckSequenceOfOptions failed.");
+    return PARAM_INVALID;
+  }
+  if (multibatch::UpdateNameOfInputShape(graph, data_nodes, getnext_nosink_nodes, getnext_sink_nodes) != SUCCESS) {
+    GELOGE(PARAM_INVALID, "[Train_Dynamic] UpdateNameForInputShapeOfOption failed.");
+    return PARAM_INVALID;
+  }
+  if (multibatch::DeleteIdentityInsertByAdapter(graph) != SUCCESS) {
+    GELOGE(PARAM_INVALID, "[Train_Dynamic] DeleteIdentityInsertByAdapter failed.");
+    return PARAM_INVALID;
+  }
   if (!multibatch::InitDynamicParams(batch_shapes_)) {
     GELOGD("There is no multi-batch options, no need clone multi-batch graph");
     return SUCCESS;
   }
-
+  if (multibatch::CheckNegativeCountOfOptions(batch_shapes_) != SUCCESS) {
+    GELOGE(PARAM_INVALID, "[Train_Dynamic] Input_shape and dynamic_dims should set correct params.");
+    return PARAM_INVALID;
+  }
   GELOGD("Begin to run Multi-batch clone on graph: %s", graph->GetName().c_str());
   GE_CHK_STATUS_RET(multibatch::CheckDynamicParams(batch_shapes_), "Invalid multi-batch param");
   if (CollectIoNodes(graph) != SUCCESS) {
@@ -64,21 +100,14 @@ Status MultiBatchClonePass::Run(ComputeGraphPtr graph) {
 
   (void)AttrUtils::GetStr(graph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id_);
   ComputeGraphPtr branch = MakeShared<ComputeGraph>(graph->GetName());
-  if (branch == nullptr) {
-    GELOGE(OUT_OF_MEMORY, "Create multi-batch graph failed");
-    return OUT_OF_MEMORY;
-  }
+  GE_IF_BOOL_EXEC(branch == nullptr, GELOGE(OUT_OF_MEMORY, "Create multi batch graph failed"); return OUT_OF_MEMORY);
   (void)AttrUtils::SetStr(branch, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id_);
 
   graph->InValid();  // Will modify, need topological again.
   graph->Swap(*branch);
-  if (CreateRootGraph(graph) != SUCCESS) {
-    return FAILED;
-  }
-
-  if (CreateSubgraphs(graph, branch) != SUCCESS) {
-    return FAILED;
-  }
+  GE_CHK_STATUS_RET(CreateRootGraph(graph), "Construct root graph failed.");
+  GE_CHK_STATUS_RET(CreateOriGraph(branch), "Construct original graph failed.")
+  GE_CHK_STATUS_RET(CreateSubgraphs(graph, branch), "Construct subgraph failed.");
 
   GE_CHK_STATUS_RET(PruneDirectOutput(graph), "Prune direct output failed");
   GELOGD("MultiBatchClonePass Leave");
@@ -93,9 +122,13 @@ Status MultiBatchClonePass::Run(ComputeGraphPtr graph) {
 ///
 Status MultiBatchClonePass::CollectIoNodes(const ComputeGraphPtr &graph) {
   for (const auto &node : graph->GetDirectNode()) {
+    if (!GetLocalOmgContext().dynamic_node_type.empty() && IsGetNextType(node)) {
+      all_data_nodes_.emplace_back(node);
+      GE_CHK_STATUS_RET(InitParamsOfGetNext(node), "Init params of %s failed.", node->GetName().c_str());
+    }
     if (node->GetType() == DATA) {
       all_data_nodes_.emplace_back(node);
-    } else if (node->GetType() == CONSTANT) {
+    } else if (node->GetType() == CONSTANT || node->GetType() == CONSTANTOP) {
       all_const_nodes_.emplace_back(node);
     } else if (node->GetType() == NETOUTPUT) {
       all_output_nodes_.emplace_back(node);
@@ -112,10 +145,16 @@ Status MultiBatchClonePass::CollectIoNodes(const ComputeGraphPtr &graph) {
   }
 
   int64_t data_index = 0;
+  size_t getnext_node_count = 0;
   for (size_t i = 0; i < all_data_nodes_.size(); ++i) {
+    if (IsGetNextType(all_data_nodes_[i])) {
+      // just one getnext node in graph
+      getnext_node_count++;
+      continue;
+    }
     const auto &op_desc = all_data_nodes_[i]->GetOpDesc();
     if (!AttrUtils::GetInt(op_desc, ATTR_NAME_INDEX, data_index)) {
-      (void)AttrUtils::SetInt(op_desc, ATTR_NAME_INDEX, i);
+      (void)AttrUtils::SetInt(op_desc, ATTR_NAME_INDEX, i - getnext_node_count);
     }
   }
 
@@ -131,7 +170,43 @@ Status MultiBatchClonePass::CollectIoNodes(const ComputeGraphPtr &graph) {
           "Remove edge failed");
     }
   }
+  GELOGD("Data count is %zu, const count is %zu, getnext count is %zu, output count is %zu, direct out count is %zu.",
+         all_data_nodes_.size(), all_const_nodes_.size(), getnext_node_count, all_output_nodes_.size(),
+         direct_output_.size());
+
+  return SUCCESS;
+}
 
+Status MultiBatchClonePass::InitParamsOfGetNext(const NodePtr &node) {
+  data_count_from_getnext_ = 0;
+  getnext_sink_dynamic_dims_ = false;
+  GE_CHECK_NOTNULL(node->GetOpDesc());
+  data_count_from_getnext_ = node->GetOpDesc()->GetOutputsSize();
+  if (GetLocalOmgContext().dynamic_node_type == GETNEXT) {
+    data_count_from_getnext_ = data_count_from_getnext_ / kDivisionConst;
+    for (size_t i = 0; i < data_count_from_getnext_; ++i) {
+      GeTensorDesc output_desc = node->GetOpDesc()->GetOutputDesc(i);
+      GELOGD("The %zu data shape from getnext sink is %s.", i,
+             formats::JoinToString(output_desc.GetShape().GetDims()).c_str());
+      const auto &dims = output_desc.GetShape().GetDims();
+      if (std::all_of(dims.begin(), dims.end(), [](int64_t val) {return val >= 0; })) {
+        GELOGD("The %zu data from %s is static.", i, node->GetName().c_str());
+      } else {
+        getnext_sink_dynamic_dims_ = true;
+        GELOGD("Dynamic dims in the pattern of getnext sink.");
+      }
+    }
+  }
+  if (node->GetOutControlAnchor() != nullptr) {
+    for (const auto &peer_in_control_anchor : node->GetOutControlAnchor()->GetPeerInControlAnchors()) {
+      NodePtr next_node = peer_in_control_anchor->GetOwnerNode();
+      GE_CHECK_NOTNULL(next_node);
+      if (next_node->GetType() == CONSTANTOP) {
+        out_control_nodes_.insert(next_node);
+        GELOGD("Control edge: %s connect with %s.", node->GetName().c_str(), next_node->GetName().c_str());
+      }
+    }
+  }
   return SUCCESS;
 }
 
@@ -142,7 +217,11 @@ Status MultiBatchClonePass::CollectIoNodes(const ComputeGraphPtr &graph) {
 /// @return 0: SUCCESS / others: FAILED
 ///
 Status MultiBatchClonePass::CreateRootGraph(const ComputeGraphPtr &graph) {
+  GELOGD("Start create root graph of %s.", graph->GetName().c_str());
   uint32_t input_num = all_data_nodes_.size() + all_const_nodes_.size();
+  if (data_count_from_getnext_ != 0) {
+    input_num = input_num + data_count_from_getnext_ - kNumOfGetnextNode;
+  }
   uint32_t output_num = all_output_nodes_[0]->GetAllInDataAnchorsSize();
 
   OpDescBuilder op_builder(kMultiBatchCaseNode, CASE);
@@ -183,6 +262,10 @@ Status MultiBatchClonePass::CreateRootGraph(const ComputeGraphPtr &graph) {
            op_desc->GetName().c_str());
     return FAILED;
   }
+  if (!AttrUtils::SetBool(op_desc, ATTR_INSERT_BY_MBATCH, true)) {
+    GELOGE(INTERNAL_ERROR, "Failed to add insert attr on case node %s", op_desc->GetName().c_str());
+    return INTERNAL_ERROR;
+  }
   GE_CHK_STATUS_RET(multibatch::StampDynamicType(op_desc), "Set dynamic type failed");
 
   GE_CHK_STATUS_RET(CreateIndexNode(graph), "Create index node failed");
@@ -200,7 +283,7 @@ Status MultiBatchClonePass::CreateRootGraph(const ComputeGraphPtr &graph) {
 /// @param [in] NodePtr node: index data node.
 /// @return 0: SUCCESS / others: FAILED
 ///
-Status MultiBatchClonePass::CreateIndexDataNode(const ComputeGraphPtr &graph, NodePtr &node) {
+Status MultiBatchClonePass::CreateIndexDataNode(const ComputeGraphPtr &graph, NodePtr &shape_node) {
   const OpDescPtr data_desc = MakeShared<OpDesc>(kMultiBatchDataNode, DATA);
   if (data_desc == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Create multi-batch data node failed");
@@ -218,11 +301,12 @@ Status MultiBatchClonePass::CreateIndexDataNode(const ComputeGraphPtr &graph, No
   }
 
   size_t data_index = all_data_nodes_.size();
+  data_index = data_count_from_getnext_ != 0 ? data_index - kNumOfGetnextNode : data_index;
   (void)AttrUtils::SetInt(data_desc, ATTR_NAME_INDEX, data_index);
   (void)AttrUtils::SetBool(data_desc, ATTR_INSERT_BY_MBATCH, true);
 
-  node = graph->AddNode(data_desc);
-  if (node == nullptr) {
+  shape_node = graph->AddNode(data_desc);
+  if (shape_node == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Create multi-batch data node failed");
     return OUT_OF_MEMORY;
   }
@@ -284,15 +368,19 @@ Status MultiBatchClonePass::CreateIndexConstNode(const ComputeGraphPtr &graph, N
 /// @return 0: SUCCESS / others: FAILED
 ///
 Status MultiBatchClonePass::CreateIndexNode(const ComputeGraphPtr &graph) {
-  // Data --> MapIndex --> Case
-  NodePtr data_node;
-  GE_CHK_STATUS_RET(CreateIndexDataNode(graph, data_node), "Create data node failed");
+  // Data/GetDynamicDims --> MapIndex --> Case
+  if (!getnext_sink_dynamic_dims_) {
+    GE_CHK_STATUS_RET(CreateIndexDataNode(graph, shape_node_), "Create data node failed");
+  } else {
+    GE_CHK_STATUS_RET(CreateGetDynamicDimsNode(graph, shape_node_), "Create get dynamic dims node failed");
+  }
 
   NodePtr const_node;
   GE_CHK_STATUS_RET(CreateIndexConstNode(graph, const_node), "Create const node failed");
-
+  GELOGD("Shape node name is %s, type is %s, const node name is %s.", shape_node_->GetName().c_str(),
+         shape_node_->GetType().c_str(), const_node->GetName().c_str());
   OpDescBuilder op_builder(kMultiBatchMapIndexNode, "MapIndex");
-  op_builder.AddInput("x", data_node->GetOpDesc()->GetOutputDesc(0))
+  op_builder.AddInput("x", shape_node_->GetOpDesc()->GetOutputDesc(0))
       .AddInput("data_seq", const_node->GetOpDesc()->GetOutputDesc(0))
       .AddOutput("y", GeTensorDesc(GeShape(), FORMAT_ND, DT_INT32));
 
@@ -307,8 +395,10 @@ Status MultiBatchClonePass::CreateIndexNode(const ComputeGraphPtr &graph) {
     return OUT_OF_MEMORY;
   }
 
-  if (GraphUtils::AddEdge(data_node->GetOutDataAnchor(0), index_node->GetInDataAnchor(0)) != GRAPH_SUCCESS) {
-    GELOGE(FAILED, "Failed to add edge between node:%s to MapIndex:%s", data_node->GetName().c_str(),
+  GE_CHK_STATUS_RET(AddAttrForGetDynamicDims(shape_node_), "Failed to add attr for %s.",
+                    shape_node_->GetName().c_str());
+  if (GraphUtils::AddEdge(shape_node_->GetOutDataAnchor(0), index_node->GetInDataAnchor(0)) != GRAPH_SUCCESS) {
+    GELOGE(FAILED, "Failed to add edge between node:%s to MapIndex:%s", shape_node_->GetName().c_str(),
            index_node->GetName().c_str());
     return FAILED;
   }
@@ -326,6 +416,120 @@ Status MultiBatchClonePass::CreateIndexNode(const ComputeGraphPtr &graph) {
   return SUCCESS;
 }
 
+Status MultiBatchClonePass::CreateGetDynamicDimsNode(const ComputeGraphPtr &graph, NodePtr &shape_node) {
+  const OpDescPtr data_desc = MakeShared<OpDesc>(kMultiBatchGetDynamicDimsNode, GETDYNAMICDIMS);
+  if (data_desc == nullptr) {
+    GELOGE(OUT_OF_MEMORY, "Create multi-batch get dynamic dims node failed");
+    return OUT_OF_MEMORY;
+  }
+
+  // input of GetDynamicDims is shape_of_each_data, output is gear_info
+  for (size_t i = 0; i < GetLocalOmgContext().user_input_dims.size(); ++i) {
+    size_t input_shape_dims = GetLocalOmgContext().user_input_dims.at(i).second.size();
+    // add input desc without GeShape for const input, value of input_shape is 1 transferred by adapter
+    if (input_shape_dims == 1 && GetLocalOmgContext().user_input_dims.at(i).second.at(0) == 0) {
+      GeTensorDesc tensor_desc;
+      tensor_desc.SetFormat(FORMAT_ND);
+      tensor_desc.SetDataType(DT_INT32);
+      auto ret = data_desc->AddInputDesc(tensor_desc);
+      GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to add input desc for created data");
+          return FAILED);
+      continue;
+    }
+    GeTensorDesc tensor_desc(GeShape({static_cast<int32_t>(input_shape_dims)}), FORMAT_ND, DT_INT32);
+    auto ret = data_desc->AddInputDesc(tensor_desc);
+    GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to add input desc for created data");
+        return FAILED);
+  }
+  GeTensorDesc tensor_desc(GeShape({static_cast<int32_t>(batch_shapes_.at(0).size())}), FORMAT_ND, DT_INT32);
+  auto ret = data_desc->AddOutputDesc(tensor_desc);
+  GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to add output desc for created data");
+      return FAILED);
+
+  (void)AttrUtils::SetBool(data_desc, ATTR_INSERT_BY_MBATCH, true);
+
+  shape_node = graph->AddNode(data_desc);
+  if (shape_node == nullptr) {
+    GELOGE(OUT_OF_MEMORY, "Create multi-batch dynamic dims node failed");
+    return OUT_OF_MEMORY;
+  }
+  return SUCCESS;
+}
+
+Status MultiBatchClonePass::AddAttrForGetDynamicDims(const NodePtr &shape_node) {
+  if (!getnext_sink_dynamic_dims_) {
+    GELOGD("No need to add attr when not insert get dynamic dims node.");
+    return SUCCESS;
+  }
+  GELOGD("Add attr for :%s, type is %s:", shape_node->GetName().c_str(), shape_node->GetType().c_str());
+  if (!AttrUtils::SetInt(shape_node->GetOpDesc(), ATTR_GETNEXT_SINK_DATA_COUNT, data_count_from_getnext_)) {
+    GELOGE(INTERNAL_ERROR, "set ATTR_GETNEXT_SINK_DATA_COUNT failed");
+    return INTERNAL_ERROR;
+  }
+  vector<int64_t> shape_info;
+  for (size_t i = 0; i < GetLocalOmgContext().user_input_dims.size(); ++i) {
+    if (GetLocalOmgContext().user_input_dims.at(i).second.size() == 1 &&
+        GetLocalOmgContext().user_input_dims.at(i).second.at(0) == 0) {
+      shape_info.emplace_back(0);
+      continue;
+    }
+    shape_info.emplace_back(GetLocalOmgContext().user_input_dims.at(i).second.size());
+    for (size_t j = 0; j < GetLocalOmgContext().user_input_dims.at(i).second.size(); ++j) {
+      shape_info.emplace_back(GetLocalOmgContext().user_input_dims.at(i).second.at(j));
+    }
+  }
+  if (!AttrUtils::SetListInt(shape_node->GetOpDesc(), ATTR_GETNEXT_SINK_SHAPE_INFO, shape_info)) {
+    GELOGE(INTERNAL_ERROR, "set ATTR_GETNEXT_SINK_SHAPE_INFO failed");
+    return INTERNAL_ERROR;
+  }
+  return SUCCESS;
+}
+
+Status MultiBatchClonePass::LinkGetNextToGetDynamicDims(const NodePtr &getnext_node, const NodePtr &shape_node) {
+  GELOGD("Start relink shape anchor of %s to %s.", getnext_node->GetName().c_str(), shape_node->GetName().c_str());
+  size_t input_index = 0;
+  size_t data_count = getnext_node->GetAllOutDataAnchors().size() / kDivisionConst;
+  for (size_t out_index = data_count; out_index < getnext_node->GetAllOutDataAnchors().size(); ++out_index,
+      ++input_index) {
+    GELOGD("Start add %s of %zu out_anchor to %s of %zu in_anchor.", getnext_node->GetName().c_str(), out_index,
+           shape_node->GetName().c_str(), input_index);
+    auto out_data_anchor =  getnext_node->GetOutDataAnchor(out_index);
+    auto ret = GraphUtils::AddEdge(out_data_anchor, shape_node->GetInDataAnchor(input_index));
+    GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to link getnext %s to getdynamicdims %s",
+                                                 getnext_node->GetName().c_str(), shape_node->GetName().c_str());
+        return INTERNAL_ERROR);
+  }
+  return SUCCESS;
+}
+
+Status MultiBatchClonePass::LinkGetDynamicDimsToNetOutput(const NodePtr &output_node) {
+  if (!GetLocalOmgContext().dynamic_node_type.empty()) {
+    if (!AttrUtils::SetStr(output_node->GetOpDesc(), ATTR_ALL_GEARS_INFO, GetLocalOmgContext().dynamic_dims)) {
+      GELOGE(INTERNAL_ERROR, "Failed to set all gears info attr on netoutput %s.", output_node->GetName().c_str());
+      return INTERNAL_ERROR;
+    }
+  }
+  if (getnext_sink_dynamic_dims_) {
+    GELOGD("Start link %s to %s.", shape_node_->GetName().c_str(), output_node->GetName().c_str());
+    size_t input_index = output_node->GetAllInDataAnchors().size();
+    if (NodeUtils::AppendInputAnchor(output_node, input_index + 1) != GRAPH_SUCCESS) {
+      GELOGE(INTERNAL_ERROR, "Append input anchor of %s of %zu failed.", output_node->GetName().c_str(), input_index);
+      return INTERNAL_ERROR;
+    }
+    auto ret = GraphUtils::AddEdge(shape_node_->GetOutDataAnchor(kDataOutIndex),
+                                   output_node->GetInDataAnchor(input_index));
+    GE_IF_BOOL_EXEC(ret != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "Failed to link netoutput %s to getdynamicdims %s",
+                                                 output_node->GetName().c_str(), shape_node_->GetName().c_str());
+        return INTERNAL_ERROR);
+    if (!AttrUtils::SetBool(output_node->GetOpDesc(), ATTR_GETNEXT_SINK_DYNMAIC, true)) {
+      GELOGE(INTERNAL_ERROR, "Failed to set getnext sink dynamic attr on netoutput %s.",
+             output_node->GetName().c_str());
+      return INTERNAL_ERROR;
+    }
+  }
+  return SUCCESS;
+}
+
 ///
 /// @ingroup ge
 /// @brief Create input node for root graph.
@@ -335,8 +539,10 @@ Status MultiBatchClonePass::CreateIndexNode(const ComputeGraphPtr &graph) {
 Status MultiBatchClonePass::CreateInputNode(const ComputeGraphPtr &graph) {
   // Data --> Case
   std::vector<NodePtr> all_data_nodes;
-  const size_t arg_index = kCaseArgIndex;
-  for (size_t i = 0; i < all_data_nodes_.size(); ++i) {
+  size_t case_input_index = kCaseArgIndex;
+  NodePtr getnext_node = nullptr;
+  size_t input_index_of_getnext = 0;
+  for (size_t i = 0; i < all_data_nodes_.size(); ++i, ++case_input_index) {
     const auto &node = all_data_nodes_[i];
     const OpDescPtr op_desc = AttrUtils::CopyOpDesc(node->GetOpDesc());
     if (op_desc == nullptr) {
@@ -351,22 +557,60 @@ Status MultiBatchClonePass::CreateInputNode(const ComputeGraphPtr &graph) {
     op_desc->SetName(node->GetName());
     const NodePtr &data = graph->AddNode(op_desc);
     GE_CHK_BOOL_EXEC(data != nullptr, return FAILED, "Add node[%s] to graph failed", op_desc->GetName().c_str());
-    if (GraphUtils::AddEdge(data->GetOutDataAnchor(0), case_node_->GetInDataAnchor(arg_index + i)) != GRAPH_SUCCESS) {
-      GELOGE(FAILED, "Failed to add edge between Data:%s to Case:%s",
-             data->GetName().c_str(), case_node_->GetName().c_str());
-      return FAILED;
+    if (IsGetNextType(node)) {
+      getnext_node = data;
+      input_index_of_getnext = case_input_index;
+      case_input_index = case_input_index + data_count_from_getnext_;
+      continue;
+    } else {
+      if (GraphUtils::AddEdge(data->GetOutDataAnchor(0), case_node_->GetInDataAnchor(case_input_index)) !=
+          GRAPH_SUCCESS) {
+        GELOGE(FAILED, "Failed to add edge between Data:%s to Case:%s", data->GetName().c_str(),
+               case_node_->GetName().c_str());
+        return FAILED;
+      }
     }
 
-    if (SetMaxShapeToData(data) != SUCCESS) {
+    if (SetMaxShape(data) != SUCCESS) {
+      GELOGE(FAILED, "Set max shape of %s failed.", data->GetName().c_str());
       return FAILED;
     }
     all_data_nodes.emplace_back(data);
   }
+  if (getnext_node != nullptr) {
+    if (LinkEdgeForGetNext(getnext_node, input_index_of_getnext) != SUCCESS) {
+      GELOGE(FAILED, "Failed to link edge for %s.", getnext_node->GetName().c_str());
+      return FAILED;
+    }
+    if (SetMaxShape(getnext_node) != SUCCESS) {
+      GELOGE(FAILED, "Set max shape of %s failed.", getnext_node->GetName().c_str());
+      return FAILED;
+    }
+    all_data_nodes.emplace_back(getnext_node);
+  }
 
   all_data_nodes_.swap(all_data_nodes);
   return SUCCESS;
 }
 
+Status MultiBatchClonePass::LinkEdgeForGetNext(const NodePtr &getnext_node, size_t &case_input_index) {
+  GELOGD("Start link edge for %s, which is the %zu input of %s.", getnext_node->GetName().c_str(),
+         case_input_index, case_node_->GetName().c_str());
+  for (size_t out_index = 0; out_index < data_count_from_getnext_; ++out_index, ++case_input_index) {
+    if (GraphUtils::AddEdge(getnext_node->GetOutDataAnchor(out_index),
+                            case_node_->GetInDataAnchor(case_input_index)) != GRAPH_SUCCESS) {
+      GELOGE(FAILED, "Failed to add data edge between %zu Data:%s to %zu Case:%s", out_index,
+             getnext_node->GetName().c_str(), case_input_index, case_node_->GetName().c_str());
+      return FAILED;
+    }
+  }
+  if (getnext_sink_dynamic_dims_) {
+    GE_CHK_STATUS_RET(LinkGetNextToGetDynamicDims(getnext_node, shape_node_), "Failed to add link for %s.",
+                      shape_node_->GetName().c_str());
+  }
+  return SUCCESS;
+}
+
 ///
 /// @ingroup ge
 /// @brief Create Const node for root graph.
@@ -376,7 +620,11 @@ Status MultiBatchClonePass::CreateInputNode(const ComputeGraphPtr &graph) {
 Status MultiBatchClonePass::CreateConstNode(const ComputeGraphPtr &graph) {
   // Const --> Case
   std::vector<NodePtr> all_const_nodes;
-  const size_t arg_index = kCaseArgIndex + all_data_nodes_.size();
+  size_t arg_index = kCaseArgIndex + all_data_nodes_.size();
+  if (data_count_from_getnext_ != 0) {
+    arg_index = arg_index + data_count_from_getnext_ - kNumOfGetnextNode;
+  }
+
   for (size_t i = 0; i < all_const_nodes_.size(); ++i) {
     const auto &node = all_const_nodes_[i];
     const OpDescPtr op_desc = AttrUtils::CopyOpDesc(node->GetOpDesc());
@@ -393,15 +641,33 @@ Status MultiBatchClonePass::CreateConstNode(const ComputeGraphPtr &graph) {
     const NodePtr &data = graph->AddNode(op_desc);
     GE_CHK_BOOL_EXEC(data != nullptr, return FAILED, "Add node[%s] to graph failed", op_desc->GetName().c_str());
     if (GraphUtils::AddEdge(data->GetOutDataAnchor(0), case_node_->GetInDataAnchor(arg_index + i)) != GRAPH_SUCCESS) {
-      GELOGE(FAILED, "Failed to add edge between Const:%s to Case:%s",
-             data->GetName().c_str(), case_node_->GetName().c_str());
+      GELOGE(FAILED, "Failed to add edge between Const:%s to Case:%s", data->GetName().c_str(),
+             case_node_->GetName().c_str());
       return FAILED;
     }
     all_const_nodes.emplace_back(data);
   }
+  ChangeConstToData();
+  all_const_nodes_.swap(all_const_nodes);
+  return SUCCESS;
+}
 
+void MultiBatchClonePass::ChangeConstToData() {
   size_t data_index = all_data_nodes_.size();
+  if (data_count_from_getnext_ != 0) {
+    data_index = data_index + data_count_from_getnext_ - kNumOfGetnextNode;
+  }
   for (size_t i = 0; i < all_const_nodes_.size(); ++i, ++data_index) {  // Trans subgraph Const to Data.
+    auto &const_node = all_const_nodes_[i];
+    bool need_change_type = true;
+    if (out_control_nodes_.find(const_node) != out_control_nodes_.end()) {
+      GELOGD("No need to change %s to data type.", const_node->GetName().c_str());
+      need_change_type = false;
+      break;
+    }
+    if (!need_change_type) {
+      continue;
+    }
     const OpDescPtr &op_desc = all_const_nodes_[i]->GetOpDesc();
     op_desc->SetType(DATA);
     (void)op_desc->DelAttr(ATTR_NAME_WEIGHTS);  // Delete weight.
@@ -411,9 +677,6 @@ Status MultiBatchClonePass::CreateConstNode(const ComputeGraphPtr &graph) {
     (void)AttrUtils::SetInt(op_desc, ATTR_NAME_INDEX, data_index);
     (void)NodeUtils::AppendInputAnchor(all_const_nodes_[i], 1);
   }
-
-  all_const_nodes_.swap(all_const_nodes);
-  return SUCCESS;
 }
 
 ///
@@ -459,7 +722,8 @@ Status MultiBatchClonePass::CreateOutputNode(const ComputeGraphPtr &graph) {
       }
     }
   }
-
+  GE_CHK_STATUS_RET(LinkGetDynamicDimsToNetOutput(node), "Failed to add edge between %s to netoutput: %s.",
+                    shape_node_->GetName().c_str(), output->GetName().c_str());
   all_output_nodes_.clear();
   all_output_nodes_.emplace_back(node);
   return SUCCESS;
@@ -471,15 +735,70 @@ Status MultiBatchClonePass::CreateOutputNode(const ComputeGraphPtr &graph) {
 /// @param [in] const NodePtr &data: data in Root/Case graph.
 /// @return 0: SUCCESS / others: FAILED
 ///
-Status MultiBatchClonePass::SetMaxShapeToData(const NodePtr &data) {
-  auto data_shape = NodeUtils::GetOutputDesc(*data, kDataOutIndex).GetShape();
-  auto data_name = data->GetName();
+Status MultiBatchClonePass::SetMaxShape(const NodePtr &data) {
+  GELOGD("Start set max shape for %s.", data->GetName().c_str());
+  if (!IsGetNextType(data)) {
+    if (SetMaxShapeToData(data, kDataOutIndex) != SUCCESS) {
+      GELOGE(PARAM_INVALID, "Failed to update max shape of %s.", data->GetName().c_str());
+      return PARAM_INVALID;
+    }
+  } else {
+    for (size_t out_anchor_index = 0; out_anchor_index < data_count_from_getnext_; ++out_anchor_index) {
+      if (SetMaxShapeToData(data, out_anchor_index) != SUCCESS) {
+        GELOGE(PARAM_INVALID, "Failed to update max shape of %s.", data->GetName().c_str());
+        return PARAM_INVALID;
+      }
+    }
+  }
+  return SUCCESS;
+}
+
+Status MultiBatchClonePass::SetMaxShapeToData(const NodePtr &node, size_t out_anchor_index) {
+  GELOGD("Start update max shape of %s, %zu output.", node->GetName().c_str(), out_anchor_index);
+  auto data_shape = NodeUtils::GetOutputDesc(*node, out_anchor_index).GetShape();
+  string data_name = node->GetName();
+  if (IsGetNextType(node)) {
+    data_name.append("_").append(std::to_string(out_anchor_index));
+  }
+  GELOGD("Update max shape of %s, shape dims is %s.", data_name.c_str(),
+         formats::JoinToString(data_shape.GetDims()).c_str());
   const auto &dims = data_shape.GetDims();
-  if (std::all_of(dims.begin(), dims.end(), [](int64_t val) { return val >= 0; })) {
-    return SUCCESS;
+  if (!IsGetNextType(node)) {
+    if (std::all_of(dims.begin(), dims.end(), [](int64_t val) { return val >= 0; })) {
+      GELOGD("No need to do anything for static data.");
+      return SUCCESS;
+    }
+  } else {
+    if (std::all_of(dims.begin(), dims.end(), [](int64_t val) { return val >= 0; })) {
+      if (getnext_sink_dynamic_dims_) {
+        // need to update shape of Shape_node when getnext node has dynamic data
+        GE_CHK_STATUS_RET(UpdateShapeOfShapeNode(node, out_anchor_index), "Failed to update shape of shape node");
+      }
+      return SUCCESS;
+    }
   }
+  (void)AttrUtils::SetListInt(node->GetOpDesc(), ATTR_MBATCH_ORIGIN_INPUT_DIMS, data_shape.GetDims());
+
+  GeTensorDesc tensor(NodeUtils::GetOutputDesc(*node, kDataOutIndex));
+  std::vector<std::string> input_dims_str;
+  for (size_t i = 0; i < batch_shapes_.size(); ++i) {
+    auto shape = data_shape;
+    auto ret = multibatch::CalcShape(data_to_dynamic_info_.at(data_name).at(i), shape);
+    if (ret != SUCCESS) {
+      GELOGE(ret, "Failed to calculate the shape for data node %s, the shape may not match", node->GetName().c_str());
+      return ret;
+    }
+    tensor.SetShape(shape);
+    int64_t tensor_size = 0;
+    (void)TensorUtils::GetTensorSizeInBytes(tensor, tensor_size);
+    string input_str = TypeUtils::FormatToSerialString(tensor.GetFormat()) + ":" +
+	               TypeUtils::DataTypeToSerialString(tensor.GetDataType()) + ":" + node->GetName() + ":" +
+	               std::to_string(tensor_size) + ":" + std::to_string(tensor.GetShape().GetDimNum()) + ":" +
+                       formats::JoinToString(tensor.GetShape().GetDims());
+    input_dims_str.emplace_back(input_str);
+  }
+  (void)AttrUtils::SetListStr(node->GetOpDesc(), "_all_origin_gears_inputs", input_dims_str);
 
-  (void)AttrUtils::SetListInt(data->GetOpDesc(), ATTR_MBATCH_ORIGIN_INPUT_DIMS, data_shape.GetDims());
   size_t max_shape_index = 0;
   int64_t max_size = 0;
   for (size_t i = 0; i < batch_shapes_.size(); ++i) {
@@ -497,20 +816,88 @@ Status MultiBatchClonePass::SetMaxShapeToData(const NodePtr &data) {
       max_shape_index = i;
     }
   }
+  return SetShapeToData(data_to_dynamic_info_.at(data_name).at(max_shape_index), node, data_shape, out_anchor_index);
+}
+
+///
+/// @ingroup ge
+/// @brief Set max shape to Data/GetNext node in root graph.
+/// @param [in] const std::vector<int64_t> &shapes: dims of shape.
+/// @param [in] const NodePtr &data: data in Root/Case graph.
+/// @param [in] GeShape &data_shape: dims of data node.
+/// @param [in] size_t out_anchor_index: out anchor index of data node.
+/// @return 0: SUCCESS / others: FAILED
+///
+Status MultiBatchClonePass::SetShapeToData(const std::vector<int64_t> &shapes, const NodePtr &data, GeShape &data_shape,
+                                           size_t out_anchor_index) {
+  GELOGD("Start set shape to %zu out of %s.", out_anchor_index, data->GetName().c_str());
+  if (multibatch::CalcShape(shapes, data_shape) != SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "Failed to calculate the batched shape for data node %s, the shapes may not match",
+           data->GetName().c_str());
+    return INTERNAL_ERROR;
+  }
+
+  if (NodeUtils::UpdateOutputShape(*data, out_anchor_index, data_shape) != GRAPH_SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "Failed to update output shape for data %s", data->GetName().c_str());
+    return INTERNAL_ERROR;
+  }
+  if (!IsGetNextType(data)) {
+    if (NodeUtils::UpdateInputShape(*data, kDataInIndex, data_shape) != GRAPH_SUCCESS) {
+      GELOGE(INTERNAL_ERROR, "Failed to update input shape for data %s", data->GetName().c_str());
+      return INTERNAL_ERROR;
+    }
+  } else {
+    if (getnext_sink_dynamic_dims_) {
+      // need to update shape of Shape_node when getnext_sink_dynamic
+      GE_CHK_STATUS_RET(UpdateShapeOfShapeNode(data, out_anchor_index), "Failed to update shape of shape node");
+    }
+  }
 
-  return SetShapeToData(data_to_dynamic_info_.at(data_name).at(max_shape_index), data, data_shape);
+  GELOGI("Update the data %s input/output shape to the max %s", data->GetName().c_str(),
+         formats::ShapeToString(data_shape).c_str());
+  return SUCCESS;
+}
+
+Status MultiBatchClonePass::UpdateShapeOfShapeNode(const NodePtr &node, size_t out_anchor_index) {
+  GELOGD("Start update output shape of shape node insert by adapter, which is the %zu out of %s.", out_anchor_index,
+         node->GetName().c_str());
+  auto data_shape = NodeUtils::GetOutputDesc(*node, out_anchor_index).GetShape();
+  size_t shape_index = out_anchor_index + (node->GetAllOutDataAnchors().size() / kDivisionConst);
+  GeTensorDesc output_desc = node->GetOpDesc()->GetOutputDesc(shape_index);
+  std::vector<int64_t> output_dims = {static_cast<int64_t>(data_shape.GetDims().size())};
+  GeShape output_shape(output_dims);
+  output_desc.SetShape(output_shape);
+  if (node->GetOpDesc()->UpdateOutputDesc(shape_index, output_desc) != SUCCESS) {
+    GELOGE(FAILED, "Update output desc fail.");
+    return FAILED;
+  }
+  return SUCCESS;
 }
 
 ///
 /// @ingroup ge
-/// @brief Set shape to Data node in branch.
-/// @param [in] const NodePtr &data: data in branch.
-/// @param [in] size_t index: The batch index.
+/// @brief Update Data node in Subgraph.
+/// @param [in] const NodePtr &data: data in Subgraph.
+/// @param [in] size_t batch_index: The batch index.
 /// @return 0: SUCCESS / others: FAILED
 ///
-Status MultiBatchClonePass::UpdateShapeToData(const NodePtr &data, size_t index) {
+Status MultiBatchClonePass::UpdateSubgraphData(const NodePtr &data, size_t batch_index) {
+  int node_index = -1;
+  if (!AttrUtils::GetInt(data->GetOpDesc(), ATTR_NAME_INDEX, node_index)) {
+    GELOGE(FAILED, "Failed to get index from data[%s]", data->GetName().c_str());
+    return FAILED;
+  }
+
+  int parent_index = node_index + 1;
+  if (!AttrUtils::SetInt(data->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) {
+    GELOGE(FAILED, "Failed to set parent index for node %s", data->GetName().c_str());
+    return FAILED;
+  }
+
   auto data_shape = NodeUtils::GetOutputDesc(*data, kDataOutIndex).GetShape();
   const auto &dims = data_shape.GetDims();
+  GELOGD("Start update shape of %s , batch index is %zu, dims is %s.", data->GetName().c_str(), batch_index,
+         formats::JoinToString(dims).c_str());
   if (std::all_of(dims.begin(), dims.end(), [](int64_t val) { return val >= 0; })) {
     return SUCCESS;
   }
@@ -525,35 +912,77 @@ Status MultiBatchClonePass::UpdateShapeToData(const NodePtr &data, size_t index)
   }
 
   auto parent_name = data_name.substr(0, pos);
-  return SetShapeToData(data_to_dynamic_info_.at(parent_name).at(index), data, data_shape);
+  return SetShapeToData(data_to_dynamic_info_.at(parent_name).at(batch_index), data, data_shape, kDataOutIndex);
 }
 
-///
-/// @ingroup ge
-/// @brief Set max shape to Data node in root graph.
-/// @param [in] const std::vector<int64_t> &shapes: dims of shape.
-/// @param [in] const NodePtr &data: data in Root/Case graph.
-/// @param [in] GeShape &data_shape: dims of data node.
-/// @return 0: SUCCESS / others: FAILED
-///
-Status MultiBatchClonePass::SetShapeToData(const vector<int64_t> &shapes, const NodePtr &data, GeShape &data_shape) {
-  // must not be error, the calc result has been checked in function InsertSwitchNForData
-  if (multibatch::CalcShape(shapes, data_shape) != SUCCESS) {
-    return INTERNAL_ERROR;
+Status MultiBatchClonePass::CreateOriGraph(const ComputeGraphPtr &graph) {
+  if (data_count_from_getnext_ == 0) {
+    GELOGD("No need to change original graph without getnext node.");
+    return SUCCESS;
   }
-
-  if (NodeUtils::UpdateInputShape(*data, kDataInIndex, data_shape) != GRAPH_SUCCESS) {
-    GELOGE(INTERNAL_ERROR, "Failed to update input shape for data %s", data->GetName().c_str());
-    return INTERNAL_ERROR;
+  GELOGD("Start change original graph: %s when exit getnext node.", graph->GetName().c_str());
+  size_t data_index = all_data_nodes_.size() - kNumOfGetnextNode;
+  for (const auto &node : graph->GetDirectNode()) {
+    if (IsGetNextType(node)) {
+      for (size_t out_index = 0; out_index < data_count_from_getnext_; ++out_index, ++data_index) {
+        auto out_data_anchor =  node->GetOutDataAnchor(out_index);
+        GE_IF_BOOL_EXEC(out_data_anchor == nullptr, continue);
+        NodePtr data_node = CreateDataNode(graph, out_data_anchor, data_index);
+        GE_IF_BOOL_EXEC(data_node == nullptr, GELOGE(INTERNAL_ERROR, "Create %zu data node failed.",
+                                                     out_data_anchor->GetIdx()); return INTERNAL_ERROR);
+        for (auto &in_anchor : out_data_anchor->GetPeerInDataAnchors()) {
+          GE_IF_BOOL_EXEC(in_anchor == nullptr, continue);
+          NodePtr dst_node = in_anchor->GetOwnerNode();
+          if (GraphUtils::RemoveEdge(out_data_anchor, in_anchor) != GRAPH_SUCCESS) {
+            GELOGE(INTERNAL_ERROR, "Failed to remove edge between %s to %s", node->GetName().c_str(),
+                   dst_node->GetName().c_str());
+            return INTERNAL_ERROR;
+          }
+          if (GraphUtils::AddEdge(data_node->GetOutDataAnchor(0), dst_node->GetInDataAnchor(in_anchor->GetIdx())) !=
+              GRAPH_SUCCESS) {
+            GELOGE(INTERNAL_ERROR, "Failed to add edge between %s to %s", data_node->GetName().c_str(),
+                   dst_node->GetName().c_str());
+            return INTERNAL_ERROR;
+          }
+        }
+      }
+      if (graph->RemoveNode(node) != GRAPH_SUCCESS) {
+        GELOGE(GRAPH_FAILED, "Remove node %s failed!", node->GetName().c_str());
+        return GRAPH_FAILED;
+      }
+      break;
+    }
   }
+  return SUCCESS;
+}
 
-  if (NodeUtils::UpdateOutputShape(*data, kDataOutIndex, data_shape) != GRAPH_SUCCESS) {
-    GELOGE(INTERNAL_ERROR, "Failed to update output shape for data %s", data->GetName().c_str());
-    return INTERNAL_ERROR;
+NodePtr MultiBatchClonePass::CreateDataNode(const ComputeGraphPtr &graph, const OutDataAnchorPtr &out_data_anchor,
+                                            size_t data_index) {
+  size_t out_anchor_index = out_data_anchor->GetIdx();
+  std::string node_name = out_data_anchor->GetOwnerNode()->GetName() + "_" +  std::to_string(out_anchor_index);
+  OpDescPtr op_desc = MakeShared<OpDesc>(node_name, DATA);
+  if (op_desc == nullptr) {
+    GELOGE(OUT_OF_MEMORY, "Create data node failed.");
+    return nullptr;
   }
+  (void)AttrUtils::SetInt(op_desc, ATTR_NAME_INDEX, data_index);
 
-  GELOGI("Update %s input/output shape to %s", data->GetName().c_str(), formats::ShapeToString(data_shape).c_str());
-  return SUCCESS;
+  OpDescPtr getnext_op_desc = out_data_anchor->GetOwnerNode()->GetOpDesc();
+  if (getnext_op_desc == nullptr) {
+    GELOGE(OUT_OF_MEMORY, "Op desc of %s is nullptr.", out_data_anchor->GetOwnerNode()->GetName().c_str());
+    return nullptr;
+  }
+  if (op_desc->AddInputDesc(getnext_op_desc->GetOutputDesc(out_anchor_index)) != GRAPH_SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "Add %s input desc failed.", op_desc->GetName().c_str());
+    return nullptr;
+  }
+  if (op_desc->AddOutputDesc(getnext_op_desc->GetOutputDesc(out_anchor_index)) != GRAPH_SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "Add %s output desc failed.", op_desc->GetName().c_str());
+    return nullptr;
+  }
+  NodePtr data_node = graph->AddNode(op_desc);
+  GELOGD("Success create %s node.", data_node->GetName().c_str());
+  return data_node;
 }
 
 ///
@@ -564,29 +993,29 @@ Status MultiBatchClonePass::SetShapeToData(const vector<int64_t> &shapes, const
 /// @return 0: SUCCESS / others: FAILED
 ///
 Status MultiBatchClonePass::CreateSubgraphs(const ComputeGraphPtr &graph, const ComputeGraphPtr &branch) {
+  GELOGD("Start create subgraphs for %s.", graph->GetName().c_str());
   const auto &op_desc = case_node_->GetOpDesc();
   for (size_t i = 0; i < batch_shapes_.size(); ++i) {
     std::vector<NodePtr> input_nodes;
     std::vector<NodePtr> output_nodes;
     const std::string postfix = kMultiBatchNodePostfix + std::to_string(i);
     ComputeGraphPtr subgraph = (i == 0) ? branch : GraphUtils::CloneGraph(branch, postfix, input_nodes, output_nodes);
-    if (subgraph == nullptr) {
-      GELOGE(FAILED, "Create multi-batch case node failed");
-      return FAILED;
-    }
-
+    GE_IF_BOOL_EXEC(subgraph == nullptr, GELOGE(FAILED, "Create multi-batch case node failed"); return FAILED);
     subgraph->SetName("Batch_" + std::to_string(i));
     subgraph->SetParentNode(case_node_);
     subgraph->SetParentGraph(graph);
     graph->AddSubgraph(subgraph->GetName(), subgraph);
     all_branch_output_[subgraph] = subgraph->FindFirstNodeMatchType(NETOUTPUT);
+    GE_CHK_STATUS_RET(UpdateSubgraphOutput(all_branch_output_[subgraph]),
+                      "Update %s failed", all_branch_output_[subgraph]->GetName().c_str());
 
     const string key_name = "branches" + std::to_string(i);
     op_desc->AddSubgraphName(key_name);
     op_desc->SetSubgraphInstanceName(i, subgraph->GetName());
 
+    GELOGD("The %s has %zu input, %zu output.", subgraph->GetName().c_str(), input_nodes.size(), output_nodes.size());
     for (const auto &data : input_nodes) {
-      GE_CHK_STATUS_RET(UpdateShapeToData(data, i), "Update %s failed", subgraph->GetName().c_str());
+      GE_CHK_STATUS_RET(UpdateSubgraphData(data, i), "Update %s failed", subgraph->GetName().c_str());
     }
   }
 
@@ -595,55 +1024,27 @@ Status MultiBatchClonePass::CreateSubgraphs(const ComputeGraphPtr &graph, const
     const auto &op_desc = n->GetOpDesc();
     op_desc->SetName(n->GetName() + kMultiBatchNodePostfix + "0");
     if (n->GetType() == DATA) {
-      GE_CHK_STATUS_RET(UpdateShapeToData(n, 0), "Update %s failed", branch->GetName().c_str());
+      GE_CHK_STATUS_RET(UpdateSubgraphData(n, 0), "Update %s failed", branch->GetName().c_str());
     }
   }
 
-  return PostProcSubgraph(graph);
+  return SUCCESS;
 }
 
 ///
 /// @ingroup ge
-/// @brief Assign parent index for branches.
-/// @param [in] const ComputeGraphPtr &graph: Root/Case graph.
+/// @brief Update output_node in Subgraph.
+/// @param [in] const NodePtr &output_node: output_node in Subgraph.
 /// @return 0: SUCCESS / others: FAILED
 ///
-Status MultiBatchClonePass::PostProcSubgraph(const ComputeGraphPtr &graph) {
-  auto func_desc = case_node_->GetOpDesc();
-  domi::ParseSubgraphFuncV2 parse_func_v2 = nullptr;
-  auto post_func = domi::OpRegistry::Instance()->GetParseSubgraphPostFunc(func_desc->GetType());
-  if (post_func == nullptr) {
-    GELOGW("The subgraph post func for node %s type %s is null.", case_node_->GetName().c_str(),
-           case_node_->GetType().c_str());
-    if (domi::OpRegistry::Instance()->GetParseSubgraphPostFunc(func_desc->GetType(), parse_func_v2) != SUCCESS ||
-      parse_func_v2 == nullptr) {
-      GELOGW("The subgraph new post func v2 for node %s type %s is null", case_node_->GetName().c_str(),
-             case_node_->GetType().c_str());
-      return FAILED;
-    }
-  }
-
-  for (const auto &name : func_desc->GetSubgraphInstanceNames()) {
-    const auto &subgraph = graph->GetSubgraph(name);
-    if (subgraph == nullptr) {
-      GELOGE(FAILED, "Subgraph not found, name: %s", name.c_str());
-      return FAILED;
-    }
-
-    std::string subgraph_name;
-    GE_CHK_STATUS_RET(func_desc->GetSubgraphNameByInstanceName(subgraph->GetName(), subgraph_name),
-                      "Subgraph: %s get subgraph name failed.", subgraph->GetName().c_str());
-
-    auto graph = GraphUtils::CreateGraphFromComputeGraph(subgraph);
-    Status ret = FAILED;
-    if (post_func != nullptr) {
-      ret = post_func(subgraph_name, graph);
-    } else if (parse_func_v2 != nullptr) {
-      ret = parse_func_v2(subgraph_name.c_str(), graph);
-    }
-    if (ret != SUCCESS) {
-      GELOGE(FAILED, "Failed to post-process subgraph %s on node %s type %s", graph.GetName().c_str(),
-             case_node_->GetName().c_str(), case_node_->GetType().c_str());
+Status MultiBatchClonePass::UpdateSubgraphOutput(const NodePtr &output_node) {
+  const auto &op_desc = output_node->GetOpDesc();
+  GE_CHECK_NOTNULL(op_desc);
+  for (size_t index = 0; index < op_desc->GetInputsSize(); ++index) {
+    GeTensorDescPtr tensor = op_desc->MutableInputDesc(index);
+    GE_CHECK_NOTNULL(tensor);
+    if (!AttrUtils::SetInt(tensor, ATTR_NAME_PARENT_NODE_INDEX, index)) {
+      GELOGE(FAILED, "Failed to set parent index for node %s", output_node->GetName().c_str());
       return FAILED;
     }
   }
@@ -658,6 +1059,7 @@ Status MultiBatchClonePass::PostProcSubgraph(const ComputeGraphPtr &graph) {
 /// @return 0: SUCCESS / others: FAILED
 ///
 Status MultiBatchClonePass::PruneDirectOutput(const ComputeGraphPtr &graph) {
+  GELOGD("Start prune direct output.");
   const auto &func_desc = case_node_->GetOpDesc();
   uint32_t unused_num = 0;
   uint32_t output_num = func_desc->GetOutputsSize();
@@ -702,6 +1104,7 @@ Status MultiBatchClonePass::PruneDirectOutput(const ComputeGraphPtr &graph) {
 ///
 Status MultiBatchClonePass::UpdateOutputTensor(uint32_t parent_index, uint32_t unused_num) {
   if (unused_num == 0) {
+    GELOGD("No need to update output tensor.");
     return SUCCESS;
   }
 
diff --git a/ge/graph/passes/multi_batch_clone_pass.h b/ge/graph/passes/multi_batch_clone_pass.h
index 1155dfc8..66e92892 100755
--- a/ge/graph/passes/multi_batch_clone_pass.h
+++ b/ge/graph/passes/multi_batch_clone_pass.h
@@ -36,6 +36,7 @@ class MultiBatchClonePass : public GraphPass {
   /// @return 0: SUCCESS / others: FAILED
   ///
   Status CollectIoNodes(const ComputeGraphPtr &graph);
+  Status InitParamsOfGetNext(const NodePtr &node);
 
   ///
   /// @ingroup ge
@@ -49,10 +50,12 @@ class MultiBatchClonePass : public GraphPass {
   /// @ingroup ge
   /// @brief Create index data node for root graph.
   /// @param [in] const ComputeGraphPtr &graph: Root/Case graph.
-  /// @param [in] NodePtr node: index data node.
+  /// @param [in] NodePtr shape_node: index data node, DATA or GETDYNAMICDIMS type.
   /// @return 0: SUCCESS / others: FAILED
   ///
-  Status CreateIndexDataNode(const ComputeGraphPtr &graph, NodePtr &node);
+  Status CreateIndexDataNode(const ComputeGraphPtr &graph, NodePtr &shape_node);
+
+  Status CreateGetDynamicDimsNode(const ComputeGraphPtr &graph, NodePtr &shape_node);
 
   ///
   /// @ingroup ge
@@ -70,6 +73,9 @@ class MultiBatchClonePass : public GraphPass {
   /// @return 0: SUCCESS / others: FAILED
   ///
   Status CreateIndexNode(const ComputeGraphPtr &graph);
+  Status AddAttrForGetDynamicDims(const NodePtr &shape_node);
+  Status LinkGetNextToGetDynamicDims(const NodePtr &getnext_node, const NodePtr &shape_node);
+  Status LinkGetDynamicDimsToNetOutput(const NodePtr &output_node);
 
   ///
   /// @ingroup ge
@@ -78,6 +84,28 @@ class MultiBatchClonePass : public GraphPass {
   /// @return 0: SUCCESS / others: FAILED
   ///
   Status CreateInputNode(const ComputeGraphPtr &graph);
+  Status LinkEdgeForGetNext(const NodePtr &getnext_node, size_t &case_input_index);
+
+  ///
+  /// @ingroup ge
+  /// @brief Set max shape to Data node in root graph.
+  /// @param [in] const NodePtr &data: data in Root/Case graph.
+  /// @return 0: SUCCESS / others: FAILED
+  ///
+  Status SetMaxShape(const NodePtr &data);
+  Status SetMaxShapeToData(const NodePtr &node, size_t out_anchor_index);
+  ///
+  /// @ingroup ge
+  /// @brief Set max shape to Data/GetNext node in root graph.
+  /// @param [in] const std::vector<int64_t> &shapes: dims of shape.
+  /// @param [in] const NodePtr &data: data in Root/Case graph.
+  /// @param [in] GeShape &data_shape: dims of data node.
+  /// @param [in] size_t out_anchor_index: out anchor index of data node.
+  /// @return 0: SUCCESS / others: FAILED
+  ///
+  Status SetShapeToData(const std::vector<int64_t> &shapes, const NodePtr &data, GeShape &data_shape,
+                        size_t out_anchor_index);
+  Status UpdateShapeOfShapeNode(const NodePtr &node, size_t out_anchor_index);
 
   ///
   /// @ingroup ge
@@ -86,6 +114,7 @@ class MultiBatchClonePass : public GraphPass {
   /// @return 0: SUCCESS / others: FAILED
   ///
   Status CreateConstNode(const ComputeGraphPtr &graph);
+  void ChangeConstToData();
 
   ///
   /// @ingroup ge
@@ -97,30 +126,29 @@ class MultiBatchClonePass : public GraphPass {
 
   ///
   /// @ingroup ge
-  /// @brief Set max shape to Data node in root graph.
-  /// @param [in] const NodePtr &data: data in Root/Case graph.
+  /// @brief Update Data node in Subgraph.
+  /// @param [in] const NodePtr &data: data in Subgraph.
+  /// @param [in] size_t batch_index: The batch index.
   /// @return 0: SUCCESS / others: FAILED
   ///
-  Status SetMaxShapeToData(const NodePtr &data);
+  Status UpdateSubgraphData(const NodePtr &data, size_t batch_index);
 
   ///
   /// @ingroup ge
-  /// @brief Set shape to Data node in branch.
-  /// @param [in] const NodePtr &data: data in branch.
-  /// @param [in] size_t index: The batch index.
+  /// @brief Update output_node in Subgraph.
+  /// @param [in] const NodePtr &output_node: output_node in Subgraph.
   /// @return 0: SUCCESS / others: FAILED
   ///
-  Status UpdateShapeToData(const NodePtr &data, size_t index);
+  Status UpdateSubgraphOutput(const NodePtr &output_node);
 
   ///
   /// @ingroup ge
-  /// @brief Set max shape to Data node in root graph.
-  /// @param [in] const std::vector<int64_t> &shapes: dims of shape.
-  /// @param [in] const NodePtr &data: data in Root/Case graph.
-  /// @param [in] GeShape &data_shape: dims of data node.
+  /// @brief Create nodes for root graph.
+  /// @param [in] const ComputeGraphPtr &graph: Original graph.
   /// @return 0: SUCCESS / others: FAILED
   ///
-  Status SetShapeToData(const std::vector<int64_t> &shapes, const NodePtr &data, GeShape &data_shape);
+  Status CreateOriGraph(const ComputeGraphPtr &graph);
+  NodePtr CreateDataNode(const ComputeGraphPtr &graph, const OutDataAnchorPtr &out_data_anchor, size_t data_index);
 
   ///
   /// @ingroup ge
@@ -133,14 +161,6 @@ class MultiBatchClonePass : public GraphPass {
 
   ///
   /// @ingroup ge
-  /// @brief Assign parent index for branches.
-  /// @param [in] const ComputeGraphPtr &graph: Root/Case graph.
-  /// @return 0: SUCCESS / others: FAILED
-  ///
-  Status PostProcSubgraph(const ComputeGraphPtr &graph);
-
-  ///
-  /// @ingroup ge
   /// @brief Remove subgraph supend output anchor.
   /// @param [in] ComputeGraphPtr &graph: Parent compute graph.
   /// @return 0: SUCCESS / others: FAILED
@@ -168,6 +188,10 @@ class MultiBatchClonePass : public GraphPass {
   std::map<string, vector<vector<int64_t>>> data_to_dynamic_info_;
 
   NodePtr case_node_;
+  size_t data_count_from_getnext_ = 0;
+  bool getnext_sink_dynamic_dims_ = false;
+  NodePtr shape_node_;
+  std::set<NodePtr> out_control_nodes_;
 };
 }  // namespace ge
 #endif  // GE_GRAPH_PASSES_MULTI_BATCH_CLONE_PASS_H_
diff --git a/ge/graph/passes/multi_batch_pass.cc b/ge/graph/passes/multi_batch_pass.cc
index c7034612..74f7e30e 100644
--- a/ge/graph/passes/multi_batch_pass.cc
+++ b/ge/graph/passes/multi_batch_pass.cc
@@ -22,9 +22,6 @@
 #include "graph/common/omg_util.h"
 #include "graph/utils/type_utils.h"
 
-using std::string;
-using std::vector;
-
 namespace ge {
 Status MultiBatchPass::Run(ComputeGraphPtr graph) {
   GELOGD("MultiBatchPass Enter");
@@ -53,7 +50,7 @@ Status MultiBatchPass::Run(ComputeGraphPtr graph) {
     return FAILED;
   }
   std::vector<std::vector<int64_t>> batch_shape;
-  vector<vector<int64_t>> combined_batch;
+  std::vector<std::vector<int64_t>> combined_batch;
   if (!CheckSwitchN(batch_shape, combined_batch)) {
     GELOGE(FAILED, "CheckSwitchN failed.");
     return FAILED;
@@ -104,6 +101,7 @@ Status MultiBatchPass::ClearStatus() {
 ///
 Status MultiBatchPass::SetCaseLabel(const ComputeGraphPtr &graph, const NodePtr &case_node) {
   const auto &func_desc = case_node->GetOpDesc();
+  GE_CHECK_NOTNULL(func_desc);
   if (!func_desc->HasAttr(ATTR_NAME_BATCH_NUM)) {
     GELOGD("Graph: %s Not multi-batch, Node: %s", graph->GetName().c_str(), case_node->GetName().c_str());
     return SUCCESS;
@@ -114,7 +112,7 @@ Status MultiBatchPass::SetCaseLabel(const ComputeGraphPtr &graph, const NodePtr
     const auto &subgraph = graph->GetSubgraph(dynamic_branch_names[i]);
     GE_CHECK_NOTNULL(subgraph);
 
-    const string batch_label = "Batch_" + std::to_string(i);
+    const std::string batch_label = "Batch_" + std::to_string(i);
     for (const auto &node : subgraph->GetDirectNode()) {
       (void)AttrUtils::SetStr(node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, batch_label);
     }
@@ -139,12 +137,12 @@ Status MultiBatchPass::FindPredValue(const ComputeGraphPtr &graph, OutDataAnchor
       continue;
     }
 
-    InDataAnchorPtr in_data_anchor = node->GetInDataAnchor(SWITCH_PRED_INPUT);
+    const auto &in_data_anchor = node->GetInDataAnchor(SWITCH_PRED_INPUT);
     if (in_data_anchor == nullptr) {
       GELOGE(FAILED, "FindPredInput failed, in_data_anchor is null, node:%s.", node->GetName().c_str());
       return FAILED;
     }
-    OutDataAnchorPtr pred_input = in_data_anchor->GetPeerOutAnchor();
+    const auto &pred_input = in_data_anchor->GetPeerOutAnchor();
     if (pred_input == nullptr) {
       GELOGE(FAILED, "FindPredInput failed, pred_input is null, node:%s.", node->GetName().c_str());
       return FAILED;
@@ -178,12 +176,10 @@ Status MultiBatchPass::FindPredValue(const ComputeGraphPtr &graph, OutDataAnchor
 /// @return Status
 ///
 Status MultiBatchPass::GetDynamicType() {
-  for (const auto &switchn : switch_n_nodes_) {
-    auto switchn_desc = switchn->GetOpDesc();
-    GE_CHECK_NOTNULL(switchn_desc);
+  for (const auto &switch_n : switch_n_nodes_) {
     int32_t dynamic_type = static_cast<int32_t>(FIXED);
-    if (!AttrUtils::GetInt(switchn_desc, ATTR_DYNAMIC_TYPE, dynamic_type)) {
-      GELOGE(FAILED, "Get attr ATTR_DYNAMIC_TYPE of node: %s failed.", switchn->GetName().c_str());
+    if (!AttrUtils::GetInt(switch_n->GetOpDesc(), ATTR_DYNAMIC_TYPE, dynamic_type)) {
+      GELOGE(FAILED, "Get attr ATTR_DYNAMIC_TYPE of node: %s failed.", switch_n->GetName().c_str());
       return FAILED;
     }
     if (dynamic_type == static_cast<int32_t>(FIXED)) {
@@ -191,7 +187,7 @@ Status MultiBatchPass::GetDynamicType() {
       return FAILED;
     }
     if (dynamic_type_ != static_cast<int32_t>(FIXED) && dynamic_type_ != dynamic_type) {
-      GELOGE(FAILED, "Attr ATTR_DYNAMIC_TYPE of all switchn node should be same, while one is %d and another is %d.",
+      GELOGE(FAILED, "Attr ATTR_DYNAMIC_TYPE of all switch_n node should be same, while one is %d and another is %d.",
              dynamic_type, dynamic_type_);
       return FAILED;
     }
@@ -212,21 +208,19 @@ Status MultiBatchPass::GetDynamicType() {
 Status MultiBatchPass::GetUserDesignateShape() {
   data_name_order_.clear();
   bool first_check = true;
-  for (const auto &switchn : switch_n_nodes_) {
-    auto switchn_desc = switchn->GetOpDesc();
-    GE_CHECK_NOTNULL(switchn_desc);
-    vector<string> cur_switchn_data_name_order;
-    if (!AttrUtils::GetListStr(switchn_desc, ATTR_USER_DESIGNEATE_SHAPE_ORDER, cur_switchn_data_name_order)) {
-      GELOGE(FAILED, "Get attr ATTR_USER_DESIGNEATE_SHAPE_ORDER of node: %s failed.", switchn->GetName().c_str());
+  for (const auto &switch_n : switch_n_nodes_) {
+    std::vector<std::string> cur_data_name_order;
+    if (!AttrUtils::GetListStr(switch_n->GetOpDesc(), ATTR_USER_DESIGNEATE_SHAPE_ORDER, cur_data_name_order)) {
+      GELOGE(FAILED, "Get attr ATTR_USER_DESIGNEATE_SHAPE_ORDER of node: %s failed.", switch_n->GetName().c_str());
       return FAILED;
     }
     if (first_check) {
-      data_name_order_ = cur_switchn_data_name_order;
+      data_name_order_ = cur_data_name_order;
       first_check = false;
     } else {
-      if (data_name_order_ != cur_switchn_data_name_order) {
+      if (data_name_order_ != cur_data_name_order) {
         GELOGE(FAILED, "The ATTR_USER_DESIGNEATE_SHAPE_ORDER of switchN must be same: %s failed.",
-               switchn->GetName().c_str());
+               switch_n->GetName().c_str());
         return FAILED;
       }
     }
@@ -245,7 +239,8 @@ Status MultiBatchPass::GetUserDesignateShape() {
 /// @param [out] combined_batch
 /// @return bool
 ///
-bool MultiBatchPass::CheckSwitchN(vector<vector<int64_t>> &batch_shape, vector<vector<int64_t>> &combined_batch) {
+bool MultiBatchPass::CheckSwitchN(std::vector<std::vector<int64_t>> &batch_shape,
+                                  std::vector<std::vector<int64_t>> &combined_batch) {
   // Check if output_num of different SwitchN is same
   uint32_t batch_num = 0;
   for (const NodePtr &node : switch_n_nodes_) {
@@ -281,7 +276,8 @@ bool MultiBatchPass::CheckSwitchN(vector<vector<int64_t>> &batch_shape, vector<v
     }
     size_t tmp_combined_dim_num = combined_batch[i].size();
     if (combined_dim_num != tmp_combined_dim_num) {
-      GELOGE(FAILED, "Dim num of combined_batch not equal, batch_0:%zu, batch_%u:%zu.", dim_num, i, tmp_dim_num);
+      GELOGE(FAILED, "Dim num of combined_batch not equal, batch_0:%zu, batch_%u:%zu.",
+             combined_dim_num, i, tmp_combined_dim_num);
       return false;
     }
   }
@@ -296,11 +292,11 @@ bool MultiBatchPass::CheckSwitchN(vector<vector<int64_t>> &batch_shape, vector<v
 /// @param [out] combined_batch
 /// @return bool
 ///
-bool MultiBatchPass::GetBatchInfo(uint32_t batch_num, vector<vector<int64_t>> &batch_shape,
-                                  vector<vector<int64_t>> &combined_batch) {
+bool MultiBatchPass::GetBatchInfo(uint32_t batch_num, std::vector<std::vector<int64_t>> &batch_shape,
+                                  std::vector<std::vector<int64_t>> &combined_batch) {
   // Check if output_shape of different SwitchN is same
-  vector<vector<int64_t>> idx_batch_shape;
-  vector<vector<int64_t>> idx_combined_batch;
+  std::vector<std::vector<int64_t>> idx_batch_shape;
+  std::vector<std::vector<int64_t>> idx_combined_batch;
   for (uint32_t i = 0; i < batch_num; i++) {
     idx_batch_shape.clear();
     idx_combined_batch.clear();
@@ -310,7 +306,7 @@ bool MultiBatchPass::GetBatchInfo(uint32_t batch_num, vector<vector<int64_t>> &b
         GELOGE(FAILED, "CheckDims failed, get op_desc failed, node: %s.", node->GetName().c_str());
         return false;
       }
-      vector<int64_t> output_dims;
+      std::vector<int64_t> output_dims;
       if (!AttrUtils::GetListInt(op_desc->GetOutputDesc(i), ATTR_NAME_SWITCHN_PRED_VALUE, output_dims)) {
         GELOGE(FAILED, "CheckDims failed, get attr ATTR_NAME_SWITCHN_PRED_VALUE failed, batch_index=%u.", i);
         return false;
@@ -385,8 +381,8 @@ Status MultiBatchPass::FindSwitchOutNodes(uint32_t batch_num) {
 /// @return Status
 ///
 Status MultiBatchPass::ReplaceSwitchN(const ComputeGraphPtr &graph, const OutDataAnchorPtr &pred_value,
-                                      const vector<vector<int64_t>> &batch_shape,
-                                      const vector<vector<int64_t>> &combined_batch) {
+                                      const std::vector<std::vector<int64_t>> &batch_shape,
+                                      const std::vector<std::vector<int64_t>> &combined_batch) {
   NodePtr pred_value_node = pred_value->GetOwnerNode();
   // Create SwitchCase node
   const std::string &switch_case_name = pred_value_node->GetName() + "_" + STREAMSWITCHN;
@@ -429,31 +425,11 @@ bool MultiBatchPass::CheckDims(const std::vector<std::vector<int64_t>> &output_s
     return false;
   }
 
-  size_t num = output_shape.size();
-  size_t dim_num = output_shape[0].size();
-  for (size_t i = 1; i < num; i++) {
-    size_t tmp_dim_num = output_shape[i].size();
-    if (dim_num != tmp_dim_num) {
-      GELOGE(FAILED, "CheckDims failed: dim_num not equal, output_0:%zu, output_%zu:%zu.", dim_num, i, tmp_dim_num);
+  for (auto iter = output_shape.begin() + 1; iter != output_shape.end(); ++iter) {
+    if (output_shape[0] != *iter) {
       return false;
     }
   }
-
-  if (dim_num == 0) {
-    return true;
-  }
-
-  for (size_t i = 0; i < dim_num; i++) {
-    int64_t dim_value = output_shape[0][i];
-    for (size_t j = 1; j < num; j++) {
-      int64_t tmp_dim_value = output_shape[j][i];
-      if (dim_value != tmp_dim_value) {
-        GELOGE(FAILED, "CheckDims failed: dim_value not equal, dim_index=%zu, dim_value_0:%ld, dim_value_%zu:%ld.", i,
-               dim_value, j, tmp_dim_value);
-        return false;
-      }
-    }
-  }
   return true;
 }
 
@@ -468,8 +444,8 @@ bool MultiBatchPass::CheckDims(const std::vector<std::vector<int64_t>> &output_s
 ///
 NodePtr MultiBatchPass::CreateSwitchCaseNode(const ComputeGraphPtr &graph, const std::string &name,
                                              const OutDataAnchorPtr &pred_value,
-                                             const vector<vector<int64_t>> &batch_shape,
-                                             const vector<vector<int64_t>> &combined_batch) {
+                                             const std::vector<std::vector<int64_t>> &batch_shape,
+                                             const std::vector<std::vector<int64_t>> &combined_batch) {
   OpDescPtr op_desc = MakeShared<OpDesc>(name, STREAMSWITCHN);
   if (op_desc == nullptr) {
     GELOGE(FAILED, "Create op_desc failed, StreamSwitchN:%s.", name.c_str());
@@ -512,7 +488,7 @@ NodePtr MultiBatchPass::CreateSwitchCaseNode(const ComputeGraphPtr &graph, const
       GELOGE(FAILED, "set attr ATTR_NAME_PRED_VALUE failed, StreamSwitchN:%s.", name.c_str());
       return nullptr;
     }
-    const string &attr_combined_batch = ATTR_NAME_COMBINED_BATCH + "_" + std::to_string(i);
+    const std::string &attr_combined_batch = ATTR_NAME_COMBINED_BATCH + "_" + std::to_string(i);
     if (!AttrUtils::SetListInt(op_desc, attr_combined_batch, combined_batch[i])) {
       GELOGE(FAILED, "set attr ATTR_NAME_COMBINED_BATCH failed, StreamSwitchN:%s.", name.c_str());
       return nullptr;
diff --git a/ge/graph/passes/next_iteration_pass.cc b/ge/graph/passes/next_iteration_pass.cc
index d8c4779d..cf46f09d 100644
--- a/ge/graph/passes/next_iteration_pass.cc
+++ b/ge/graph/passes/next_iteration_pass.cc
@@ -19,6 +19,8 @@
 #include "common/ge/ge_util.h"
 #include "graph/common/omg_util.h"
 
+using std::string;
+
 namespace ge {
 Status NextIterationPass::Run(ComputeGraphPtr graph) {
   GELOGD("NextIterationPass Enter");
@@ -35,10 +37,6 @@ Status NextIterationPass::Run(ComputeGraphPtr graph) {
       return INTERNAL_ERROR;
     }
   }
-  if (GroupWithNoBatch(graph) != SUCCESS) {
-    GELOGE(INTERNAL_ERROR, "Group enter_nodes failed without batch_label attr.");
-    return INTERNAL_ERROR;
-  }
 
   if (FindWhileGroups() != SUCCESS) {
     GELOGE(INTERNAL_ERROR, "Find while groups failed.");
@@ -73,75 +71,22 @@ Status NextIterationPass::GroupEnterNode(const NodePtr &enter_node) {
     return FAILED;
   }
 
-  std::string batch_label;
-  (void)ge::AttrUtils::GetStr(enter_desc, ATTR_NAME_BATCH_LABEL, batch_label);
-  if (batch_label.empty()) {
-    auto frame_iter = frame_enter_map_.find(frame_name);
-    if (frame_iter == frame_enter_map_.end()) {
-      std::vector<NodePtr> enter_nodes;
-      enter_nodes.emplace_back(enter_node);
-      frame_enter_map_[frame_name] = enter_nodes;
-    } else {
-      frame_iter->second.emplace_back(enter_node);
-    }
-    return SUCCESS;
+  string batch_label;
+  if (ge::AttrUtils::GetStr(enter_desc, ATTR_NAME_BATCH_LABEL, batch_label)) {
+    frame_name += batch_label;
   }
 
-  auto group_iter = loop_group_map_.find(frame_name);
-  if (group_iter == loop_group_map_.end()) {
+  auto iter = loop_group_map_.find(frame_name);
+  if (iter == loop_group_map_.end()) {
     LoopCondGroupPtr loop_group = MakeShared<LoopCondGroup>();
     if (loop_group == nullptr) {
       GELOGE(FAILED, "MakeShared for LoopCondGroup failed.");
       return FAILED;
     }
     loop_group->enter_nodes.emplace_back(enter_node);
-    loop_group_map_[frame_name][batch_label] = loop_group;
+    loop_group_map_[frame_name] = loop_group;
   } else {
-    auto batch_iter = group_iter->second.find(batch_label);
-    if (batch_iter == group_iter->second.end()) {
-      LoopCondGroupPtr loop_group = MakeShared<LoopCondGroup>();
-      if (loop_group == nullptr) {
-        GELOGE(FAILED, "MakeShared for LoopCondGroup failed.");
-        return FAILED;
-      }
-      loop_group->enter_nodes.emplace_back(enter_node);
-      group_iter->second[batch_label] = loop_group;
-    } else {
-      batch_iter->second->enter_nodes.emplace_back(enter_node);
-    }
-  }
-
-  return SUCCESS;
-}
-
-///
-/// @brief Group Enter nodes without batch_label attr
-/// @param [in] compute_graph
-/// @return Status
-///
-Status NextIterationPass::GroupWithNoBatch(const ComputeGraphPtr &graph) {
-  if (frame_enter_map_.empty()) {
-    GELOGI("All enter nodes in graph %s has batch_label attr.", graph->GetName().c_str());
-    return SUCCESS;
-  }
-  for (const auto &item : frame_enter_map_) {
-    const std::string &frame_name = item.first;
-    auto iter = loop_group_map_.find(frame_name);
-    if (iter == loop_group_map_.end()) {
-      LoopCondGroupPtr loop_group = MakeShared<LoopCondGroup>();
-      if (loop_group == nullptr) {
-        GELOGE(FAILED, "MakeShared for LoopCondGroup failed.");
-        return FAILED;
-      }
-      loop_group->enter_nodes = item.second;
-      loop_group_map_[frame_name][""] = loop_group;
-    } else {
-      for (auto &batch_item : iter->second) {
-        for (const auto &enter_node : item.second) {
-          batch_item.second->enter_nodes.emplace_back(enter_node);
-        }
-      }
-    }
+    iter->second->enter_nodes.emplace_back(enter_node);
   }
 
   return SUCCESS;
@@ -154,55 +99,39 @@ Status NextIterationPass::GroupWithNoBatch(const ComputeGraphPtr &graph) {
 Status NextIterationPass::FindWhileGroups() {
   for (const auto &loop_group_iter : loop_group_map_) {
     const std::string &frame_name = loop_group_iter.first;
-    for (const auto &batch_iter : loop_group_iter.second) {
-      const std::string &batch_label = batch_iter.first;
-      for (const auto &enter_node : batch_iter.second->enter_nodes) {
-        for (const auto &out_node : enter_node->GetOutAllNodes()) {
-          GELOGI("Find while_group for enter_node %s, frame_name:%s, batch_label:%s.", enter_node->GetName().c_str(),
-                 frame_name.c_str(), batch_label.c_str());
-          if ((out_node->GetType() != MERGE) && (out_node->GetType() != REFMERGE)) {
-            continue;
-          }
-          std::string tmp_label;
-          GE_CHECK_NOTNULL(out_node->GetOpDesc());
-          (void)AttrUtils::GetStr(out_node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, tmp_label);
-          bool need_skip = !(batch_label.empty() || tmp_label.empty() || (batch_label == tmp_label));
-          if (need_skip) {
-            continue;
-          }
-
-          NodePtr next_node = nullptr;
-          if (FindTargetNode(out_node, NEXTITERATION, true, batch_label, next_node) != SUCCESS) {
-            GELOGE(INTERNAL_ERROR,
-                   "Get NextIteration node failed: inputs of Merge should be Enter/NextIteration, current_Merge=%s",
-                   out_node->GetName().c_str());
-            return INTERNAL_ERROR;
-          }
-          batch_iter.second->merge_next_pairs.emplace_back(std::make_pair(out_node, next_node));
-
-          NodePtr switch_node = nullptr;
-          if (FindTargetNode(out_node, SWITCH, false, batch_label, switch_node) != SUCCESS) {
-            GELOGE(INTERNAL_ERROR, "Get Switch node failed: output of Merge should be Switch, current_Merge=%s",
-                   out_node->GetName().c_str());
-            return INTERNAL_ERROR;
-          }
-          if (switch_node == nullptr) {
-            continue;
-          }
-
-          NodePtr loop_cond = nullptr;
-          if (FindTargetNode(switch_node, LOOPCOND, true, batch_label, loop_cond) != SUCCESS) {
-            GELOGE(INTERNAL_ERROR,
-                   "Get LoopCond node failed: pred input of Switch should be LoopCond, current_Switch=%s",
-                   switch_node->GetName().c_str());
-            return INTERNAL_ERROR;
-          }
-          if (batch_iter.second->loop_cond == nullptr) {
-            batch_iter.second->loop_cond = loop_cond;
-          } else if (batch_iter.second->loop_cond != loop_cond) {
-            GELOGE(FAILED, "Multi LoopCond nodes exist.");
-            return FAILED;
-          }
+    for (const auto &enter_node : loop_group_iter.second->enter_nodes) {
+      for (const auto &out_node : enter_node->GetOutAllNodes()) {
+        const string &type = out_node->GetType();
+        if ((type != MERGE) && (type != REFMERGE)) {
+          continue;
+        }
+
+        NodePtr next_node = nullptr;
+        if (FindTargetNode(out_node, NEXTITERATION, true, next_node) != SUCCESS) {
+          GELOGE(INTERNAL_ERROR, "Get NextIteration node failed, frame_name: %s", frame_name.c_str());
+          return INTERNAL_ERROR;
+        }
+        loop_group_iter.second->merge_next_pairs.emplace_back(std::make_pair(out_node, next_node));
+
+        NodePtr switch_node = nullptr;
+        if (FindTargetNode(out_node, SWITCH, false, switch_node) != SUCCESS) {
+          GELOGE(INTERNAL_ERROR, "Get Switch node failed, frame_name: %s.", frame_name.c_str());
+          return INTERNAL_ERROR;
+        }
+        if (switch_node == nullptr) {
+          continue;
+        }
+
+        NodePtr loop_cond = nullptr;
+        if (FindTargetNode(switch_node, LOOPCOND, true, loop_cond) != SUCCESS) {
+          GELOGE(INTERNAL_ERROR, "Get LoopCond node failed, frame_name: %s.", frame_name.c_str());
+          return INTERNAL_ERROR;
+        }
+        if (loop_group_iter.second->loop_cond == nullptr) {
+          loop_group_iter.second->loop_cond = loop_cond;
+        } else if (loop_group_iter.second->loop_cond != loop_cond) {
+          GELOGE(FAILED, "Multi LoopCond nodes exist, frame_name: %s.", frame_name.c_str());
+          return FAILED;
         }
       }
     }
@@ -223,18 +152,16 @@ bool NextIterationPass::VerifyWhileGroup() {
       GELOGE(INTERNAL_ERROR, "Verify while group failed, frame_name is empty.");
       return false;
     }
-    for (const auto &batch_iter : loop_group_iter.second) {
-      if (batch_iter.second->loop_cond == nullptr) {
-        GELOGE(INTERNAL_ERROR, "Verify while group failed, LoopCond is null, frame_name: %s.", frame_name.c_str());
-        return false;
-      }
+    if (loop_group_iter.second->loop_cond == nullptr) {
+      GELOGE(INTERNAL_ERROR, "Verify while group failed, LoopCond is null, frame_name: %s.", frame_name.c_str());
+      return false;
+    }
 
-      for (const auto &pair_iter : batch_iter.second->merge_next_pairs) {
-        if ((pair_iter.first == nullptr) || (pair_iter.second == nullptr)) {
-          GELOGE(INTERNAL_ERROR, "Verify while group failed, merge_node/next_node is null, frame_name: %s.",
-                 frame_name.c_str());
-          return false;
-        }
+    for (const auto &pair_iter : loop_group_iter.second->merge_next_pairs) {
+      if ((pair_iter.first == nullptr) || (pair_iter.second == nullptr)) {
+        GELOGE(INTERNAL_ERROR, "Verify while group failed, merge_node/next_node is null, frame_name: %s.",
+               frame_name.c_str());
+        return false;
       }
     }
   }
@@ -249,56 +176,53 @@ bool NextIterationPass::VerifyWhileGroup() {
 ///
 Status NextIterationPass::HandleWhileGroup(ComputeGraphPtr &graph) {
   for (const auto &loop_cond_iter : loop_group_map_) {
-    for (const auto &batch_iter : loop_cond_iter.second) {
-      const std::string &cond_name = batch_iter.second->loop_cond->GetName();
-      GELOGI("Handle while group, LoopCond node: %s.", cond_name.c_str());
-
-      // Create Active node, Enter->Active->Merge, NextIteration->Active->Merge
-      NodePtr enter_active = CreateActiveNode(graph, cond_name + "_Enter_" + STREAMACTIVE);
-      NodePtr next_active = CreateActiveNode(graph, cond_name + "_Next_" + STREAMACTIVE);
-      if ((enter_active == nullptr) || (next_active == nullptr)) {
-        GELOGE(INTERNAL_ERROR, "Create active node failed, cond_name: %s.", cond_name.c_str());
+    const std::string &cond_name = loop_cond_iter.second->loop_cond->GetName();
+    GELOGI("Handle while group, LoopCond node: %s.", cond_name.c_str());
+
+    // Create Active node, Enter->Active->Merge, NextIteration->Active->Merge
+    NodePtr enter_active = CreateActiveNode(graph, cond_name + "_Enter_" + STREAMACTIVE);
+    NodePtr next_active = CreateActiveNode(graph, cond_name + "_Next_" + STREAMACTIVE);
+    if ((enter_active == nullptr) || (next_active == nullptr)) {
+      GELOGE(INTERNAL_ERROR, "Create active node failed, cond_name: %s.", cond_name.c_str());
+      return INTERNAL_ERROR;
+    }
+
+    for (const auto &enter_node : loop_cond_iter.second->enter_nodes) {
+      // Enter --> Active
+      if (GraphUtils::AddEdge(enter_node->GetOutControlAnchor(), enter_active->GetInControlAnchor()) != GRAPH_SUCCESS) {
+        GELOGE(INTERNAL_ERROR, "Add control edge from %s to %s failed.", enter_node->GetName().c_str(),
+               enter_active->GetName().c_str());
         return INTERNAL_ERROR;
       }
+    }
 
-      for (const auto &enter_node : batch_iter.second->enter_nodes) {
-        // Enter --> Active
-        if (GraphUtils::AddEdge(enter_node->GetOutControlAnchor(), enter_active->GetInControlAnchor()) !=
-            GRAPH_SUCCESS) {
-          GELOGE(INTERNAL_ERROR, "Add control edge failed.");
-          return INTERNAL_ERROR;
-        }
+    for (const auto &pair : loop_cond_iter.second->merge_next_pairs) {
+      NodePtr merge_node = pair.first;
+      NodePtr next_node = pair.second;
+      // Active --> Merge
+      if (GraphUtils::AddEdge(enter_active->GetOutControlAnchor(), merge_node->GetInControlAnchor()) != GRAPH_SUCCESS) {
+        GELOGE(INTERNAL_ERROR, "Add control edge failed.");
+        return INTERNAL_ERROR;
       }
 
-      for (const auto &pair : batch_iter.second->merge_next_pairs) {
-        NodePtr merge_node = pair.first;
-        NodePtr next_node = pair.second;
-        // Active --> Merge
-        if (GraphUtils::AddEdge(enter_active->GetOutControlAnchor(), merge_node->GetInControlAnchor()) !=
-            GRAPH_SUCCESS) {
-          GELOGE(INTERNAL_ERROR, "Add control edge failed.");
-          return INTERNAL_ERROR;
-        }
-
-        // NextIteration --> Active
-        if (GraphUtils::AddEdge(next_node->GetOutControlAnchor(), next_active->GetInControlAnchor()) != GRAPH_SUCCESS) {
-          GELOGE(INTERNAL_ERROR, "Add control edge failed.");
-          return INTERNAL_ERROR;
-        }
-
-        // break link between NextIteration and Merge
-        if (BreakNextIteration(next_node, merge_node) != SUCCESS) {
-          GELOGE(INTERNAL_ERROR, "Break NextIteration failed");
-          return INTERNAL_ERROR;
-        }
+      // NextIteration --> Active
+      if (GraphUtils::AddEdge(next_node->GetOutControlAnchor(), next_active->GetInControlAnchor()) != GRAPH_SUCCESS) {
+        GELOGE(INTERNAL_ERROR, "Add control edge failed.");
+        return INTERNAL_ERROR;
       }
 
-      if ((SetActiveLabelList(enter_active, {cond_name}) != SUCCESS) ||
-          (SetActiveLabelList(next_active, {cond_name}) != SUCCESS)) {
-        GELOGE(INTERNAL_ERROR, "Set attr ACTIVE_LABEL_LIST failed.");
+      // break link between NextIteration and Merge
+      if (BreakNextIteration(next_node, merge_node) != SUCCESS) {
+        GELOGE(INTERNAL_ERROR, "Break NextIteration failed");
         return INTERNAL_ERROR;
       }
     }
+
+    if ((SetActiveLabelList(enter_active, {cond_name}) != SUCCESS) ||
+        (SetActiveLabelList(next_active, {cond_name}) != SUCCESS)) {
+      GELOGE(INTERNAL_ERROR, "Set attr ACTIVE_LABEL_LIST failed.");
+      return INTERNAL_ERROR;
+    }
   }
 
   return SUCCESS;
@@ -365,12 +289,11 @@ Status NextIterationPass::BreakNextIteration(const NodePtr &next_node, NodePtr &
 /// @param [in] node
 /// @param [in] target_type
 /// @param [in] is_input
-/// @param [in] batch_label
 /// @param [out] target_node
 /// @return Status
 ///
 Status NextIterationPass::FindTargetNode(const NodePtr &node, const std::string &target_type, bool is_input,
-                                         const std::string &batch_label, NodePtr &target_node) {
+                                         NodePtr &target_node) {
   if (node == nullptr) {
     GELOGE(PARAM_INVALID, "node is null.");
     return PARAM_INVALID;
@@ -387,12 +310,6 @@ Status NextIterationPass::FindTargetNode(const NodePtr &node, const std::string
   }
 
   for (const auto &tmp_node : nodes) {
-    std::string tmp_label;
-    (void)AttrUtils::GetStr(tmp_node->GetOpDesc(), ATTR_NAME_BATCH_LABEL, tmp_label);
-    bool need_skip = !(batch_label.empty() || tmp_label.empty() || (batch_label == tmp_label));
-    if (need_skip) {
-      continue;
-    }
     const std::string type = tmp_node->GetType();
     if ((target_type == LOOPCOND) && (type == target_type)) {
       target_node = tmp_node;
@@ -415,7 +332,6 @@ Status NextIterationPass::FindTargetNode(const NodePtr &node, const std::string
 /// @return SUCCESS
 ///
 Status NextIterationPass::ClearStatus() {
-  frame_enter_map_.clear();
   loop_group_map_.clear();
   return SUCCESS;
 }
diff --git a/ge/graph/passes/next_iteration_pass.h b/ge/graph/passes/next_iteration_pass.h
index f8223c20..3266254d 100755
--- a/ge/graph/passes/next_iteration_pass.h
+++ b/ge/graph/passes/next_iteration_pass.h
@@ -47,13 +47,6 @@ class NextIterationPass : public GraphPass {
   Status GroupEnterNode(const NodePtr &enter_node);
 
   ///
-  /// @brief Group Enter nodes without batch_label attr
-  /// @param [in] compute_graph
-  /// @return Status
-  ///
-  Status GroupWithNoBatch(const ComputeGraphPtr &graph);
-
-  ///
   /// @brief Find while groups
   /// @return Status
   ///
@@ -97,13 +90,10 @@ class NextIterationPass : public GraphPass {
   /// @param [out] target_node
   /// @return Status
   ///
-  Status FindTargetNode(const NodePtr &node, const std::string &target_type, bool is_input,
-                        const std::string &batch_label, NodePtr &target_node);
+  Status FindTargetNode(const NodePtr &node, const std::string &target_type, bool is_input, NodePtr &target_node);
 
-  // map<frame_name, vector<enter_node>>
-  std::unordered_map<std::string, std::vector<NodePtr>> frame_enter_map_;
-  // map<frame_name, map<batch_label, LoopCondGroup>>
-  std::unordered_map<std::string, std::unordered_map<std::string, LoopCondGroupPtr>> loop_group_map_;
+  // map<frame_name, LoopCondGroup>
+  std::unordered_map<std::string, LoopCondGroupPtr> loop_group_map_;
 };
 }  // namespace ge
 #endif  // GE_GRAPH_PASSES_NEXT_ITERATION_PASS_H_
diff --git a/ge/graph/passes/pass_utils.cc b/ge/graph/passes/pass_utils.cc
index 5359ff63..3adfbde3 100644
--- a/ge/graph/passes/pass_utils.cc
+++ b/ge/graph/passes/pass_utils.cc
@@ -37,10 +37,6 @@
 #include "graph/utils/type_utils.h"
 
 namespace ge {
-namespace {
-const uint32_t kShapeDimSize = 1;
-const uint32_t DIM_SIZE_TWO = 2;
-}  // namespace
 
 Status PassUtils::ConstructTensorDescWithData(const GeTensorDesc &out_desc, std::vector<int64_t> &data,
                                               std::vector<GeTensorPtr> &v_output, const bool scalar_output) {
diff --git a/ge/graph/passes/remove_same_const_pass.cc b/ge/graph/passes/remove_same_const_pass.cc
new file mode 100644
index 00000000..e75a4553
--- /dev/null
+++ b/ge/graph/passes/remove_same_const_pass.cc
@@ -0,0 +1,106 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "remove_same_const_pass.h"
+
+#include <sstream>
+#include <string>
+#include <set>
+
+#include "common/base64.h"
+#include "ge_local_engine/engine/host_cpu_engine.h"
+#include "graph/utils/node_utils.h"
+
+namespace ge {
+namespace {
+std::string GetCseKey(const NodePtr &node) {
+  std::stringstream ss;
+  ss << node->GetType() << "control-inputs-";
+  std::set<std::string> control_in_node_names;
+  for (auto &src_node : node->GetInControlNodes()) {
+    control_in_node_names.insert(src_node->GetName());
+  }
+  for (auto &name : control_in_node_names) {
+    ss << name << "-";
+  }
+
+  ss << "attrs-" << AttrUtils::GetAllAttrsStr(node->GetOpDesc());
+
+  return ss.str();
+}
+
+bool IsConstType(const NodePtr &node) { return (node->GetType() == CONSTANT || node->GetType() == CONSTANTOP); }
+}  // namespace
+Status RemoveSameConstPass::Run(ComputeGraphPtr graph) {
+  GELOGD("Begin to run RemoveSameConstPass on the graph");
+  GE_CHECK_NOTNULL(graph);
+  std::map<std::string, NodePtr> keys_to_node;
+  for (const auto &node : graph->GetDirectNode()) {
+    GE_CHECK_NOTNULL(node);
+    if (!IsConstType(node)) {
+      continue;
+    }
+    bool is_unknown = false;
+    auto ret = NodeUtils::GetNodeUnknownShapeStatus(*node, is_unknown);
+    if (ret != GRAPH_SUCCESS) {
+      GELOGW("Get node unknown status failed, node name:%s, type:%s.",
+             node->GetName().c_str(), node->GetType().c_str());
+      continue;
+    }
+    if (is_unknown) {
+      GELOGI("Current node %s, type %s is unknown shape which should be skip.",
+             node->GetName().c_str(), node->GetType().c_str());
+      continue;
+    }
+    auto key = GetCseKey(node);
+    GELOGD("The const node %s cse key %s", node->GetName().c_str(), ge::base64::EncodeToBase64(key).c_str());
+    auto iter = keys_to_node.find(key);
+    if (iter == keys_to_node.end()) {
+      keys_to_node[key] = node;
+      continue;
+    }
+
+    if (node->GetAllOutDataAnchorsSize() != iter->second->GetAllOutDataAnchorsSize()) {
+      GELOGW("The const node %s and %s have the same CSE key, but different output anchor count, skip to fusion them",
+             iter->second->GetName().c_str(), node->GetName().c_str());
+      continue;
+    }
+
+    std::vector<int> output_map(node->GetAllOutDataAnchorsSize());
+    for (size_t i = 0; i < node->GetAllOutDataAnchorsSize(); ++i) {
+      output_map[i] = i;
+    }
+
+    ret = GraphUtils::ReplaceNodeAnchors(iter->second, node, {}, output_map);
+    if (ret != GRAPH_SUCCESS) {
+      GELOGE(INTERNAL_ERROR, "Failed to replace node %s by node %s", node->GetName().c_str(),
+             iter->second->GetName().c_str(), ret);
+      return INTERNAL_ERROR;
+    }
+
+    NodeUtils::UnlinkAll(*node);
+
+    ret = GraphUtils::RemoveNodeWithoutRelink(graph, node);
+    if (ret != GRAPH_SUCCESS) {
+      GELOGE(INTERNAL_ERROR, "Failed to remove node %s from graph", node->GetName().c_str());
+      return INTERNAL_ERROR;
+    }
+
+    GELOGI("Remove const node %s by RemoveSameConstPass, replace it with node %s", node->GetName().c_str(),
+           iter->second->GetName().c_str());
+  }
+  return SUCCESS;
+}
+}  // namespace ge
diff --git a/ge/graph/passes/remove_same_const_pass.h b/ge/graph/passes/remove_same_const_pass.h
new file mode 100644
index 00000000..08905bd2
--- /dev/null
+++ b/ge/graph/passes/remove_same_const_pass.h
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef GE_GRAPH_PASSES_REMOVE_SAME_CONST_PASS_H_
+#define GE_GRAPH_PASSES_REMOVE_SAME_CONST_PASS_H_
+
+#include "graph/types.h"
+#include "inc/graph_pass.h"
+
+namespace ge {
+class RemoveSameConstPass : public GraphPass {
+ public:
+  Status Run(ge::ComputeGraphPtr graph) override ;
+};
+}  // namespace ge
+#endif //GE_GRAPH_PASSES_REMOVE_SAME_CONST_PASS_H_
diff --git a/ge/graph/passes/same_transdata_breadth_fusion_pass.cc b/ge/graph/passes/same_transdata_breadth_fusion_pass.cc
index 5709dcb7..ad8819e5 100644
--- a/ge/graph/passes/same_transdata_breadth_fusion_pass.cc
+++ b/ge/graph/passes/same_transdata_breadth_fusion_pass.cc
@@ -28,7 +28,6 @@
 #include "init/gelib.h"
 
 namespace {
-const char *const kRemainNode = "node_remain";
 const int kNoTransOp = 1;
 }  // namespace
 
diff --git a/ge/graph/passes/subgraph_const_migration_pass.cc b/ge/graph/passes/subgraph_const_migration_pass.cc
index 579b2424..d2effd44 100644
--- a/ge/graph/passes/subgraph_const_migration_pass.cc
+++ b/ge/graph/passes/subgraph_const_migration_pass.cc
@@ -20,11 +20,12 @@
 #include "graph/passes/folding_pass.h"
 
 namespace ge {
-constexpr uint32_t kDataOutIndex = 0;
+constexpr uint32_t kZeroIndex = 0;
 constexpr uint32_t kCaseInputBase = 1;
 constexpr uint32_t kInvalidParent = 0x7fffffffU;
+const string kMbatchNodeNameMark = "_ascend_mbatch_batch_";
 
-bool IsSameOpNode(const NodePtr &src_node, const NodePtr &dst_node) {
+bool IsSameConstNode(const NodePtr &src_node, const NodePtr &dst_node) {
   if ((src_node == nullptr) && (dst_node == nullptr)) {
     return true;
   }
@@ -37,35 +38,9 @@ bool IsSameOpNode(const NodePtr &src_node, const NodePtr &dst_node) {
     return false;
   }
 
-  if ((src_node->GetInControlNodes().size() != dst_node->GetInControlNodes().size()) ||
-      (src_node->GetOutDataNodesSize() != dst_node->GetOutDataNodesSize())) {
-    return false;
-  }
-
-  set<uint32_t> related_parent;
-  const auto in_nodes = src_node->GetInControlNodes();
-  for (uint32_t i = 0; i < in_nodes.size(); ++i) {
-    const auto owner_node = in_nodes.at(i);
-    uint32_t parent_index = 0;
-    if (!AttrUtils::GetInt(owner_node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) {
-      return false;
-    }
-
-    related_parent.insert(parent_index);
-  }
-
-  for (const auto &in_node : dst_node->GetInControlNodes()) {
-    uint32_t parent_index = 0;
-    if (!AttrUtils::GetInt(in_node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) {
-      return false;
-    }
-
-    if (related_parent.count(parent_index) == 0) {
-      return false;
-    }
-  }
-
-  return true;
+  const GeTensorDesc &src_desc = src_node->GetOpDesc()->GetOutputDesc(kZeroIndex);
+  const GeTensorDesc &dst_desc = dst_node->GetOpDesc()->GetOutputDesc(kZeroIndex);
+  return (src_desc == dst_desc);
 }
 
 /***********************************************************************************************************************
@@ -89,12 +64,12 @@ bool IsSameOpNode(const NodePtr &src_node, const NodePtr &dst_node) {
  +-----------+ +-----------+ +-----------+ +-----------+ +-----------+    +-----------+    +-----------+
  |   Data    | |   Data    | |   Data    | |   Data    | |   Data    |    |   Data    |    |  Conv2D   |
  +-----------+ +-----------+ +-----------+ +-----------+ +-----------+    +-----------+    +-----------+
-        \                 \        |        /                  /                |                |
-         \                 \       |       /                  /                 |                |
-          \                 \      |      /                  /                  |                |
-           \                 \     |     /                  /                   |                |
-            \                +-----------+                 /                    |          +-----------+
-             +---------------|   Const   |----------------+                     |          |  Pooling  |
+        \                 \        |        /                  /                |                |         +-----------+
+         \                 \       |       /                  /                 |                |         |   Const   |
+          \                 \      |      /                  /                  |                |         +-----------+
+           \                 \     |     /                  /                   |                |             /
+            \                +-----------+                 /                    |          +-----------+      /
+             +---------------|   Const   |----------------+                     |          |  Pooling  |-----+
                              +-----------+                                      |          +-----------+
                                    \                                            |               /
                                     \                                           |              /
@@ -126,28 +101,26 @@ Status SubgraphConstMigrationPass::Run(ComputeGraphPtr graph) {
       continue;
     }
 
-    do {
-      migration_append_ = false;
-      map<ComputeGraphPtr, map<uint32_t, NodePtr>> graph_datas;
-      if (ClassifyDataNodes(graph, func_desc, graph_datas) != SUCCESS) {
-        return FAILED;
-      }
+    map<ComputeGraphPtr, map<string, NodePtr>> all_const_nodes;
+    map<ComputeGraphPtr, map<uint32_t, NodePtr>> all_data_nodes;
+    if (ClassifyGraphNodes(graph, func_desc, all_const_nodes, all_data_nodes) != SUCCESS) {
+      return FAILED;
+    }
 
-      if (graph_datas.empty()) {
-        GELOGW("Graph: %s subgraph is empty", graph->GetName().c_str());
-        break;
-      }
+    if (all_const_nodes.empty()) {
+      GELOGW("Graph: %s subgraph is empty", graph->GetName().c_str());
+      break;
+    }
 
-      // {subgraph0, {{1, Data}, {2, Data}, {3, Data}, {4, Data}, ..., {n, Data}}}
-      // {subgraph1, {{1, Data}, {2, Data}, {3, Data}, {4, Data}, ..., {n, Data}}}
-      // {subgraph2, {{1, Data}, {2, Data}, {3, Data}, {4, Data}, ..., {n, Data}}}
-      const auto base_nodes = graph_datas.begin()->second;  // Need copy.
-      for (const auto &node_item : base_nodes) {
-        if (GraphNodeMigration(graph, node, graph_datas, node_item.second, node_item.first) != SUCCESS) {
-          return FAILED;
-        }
+    // {subgraph0, {{key1, Const}, {key2, Const}, {key3, Const}, {key4, Const}, ..., {keyn, Const}}}
+    // {subgraph1, {{key1, Const}, {key2, Const}, {key3, Const}, {key4, Const}, ..., {keyn, Const}}}
+    // {subgraph2, {{key1, Const}, {key2, Const}, {key3, Const}, {key4, Const}, ..., {keyn, Const}}}
+    const auto &const_nodes = all_const_nodes.begin()->second;
+    for (const auto &item : const_nodes) {
+      if (GraphNodeMigration(graph, node, all_const_nodes, all_data_nodes, item.second, item.first) != SUCCESS) {
+        return FAILED;
       }
-    } while (migration_append_);
+    }
   }
 
   return SUCCESS;
@@ -155,14 +128,16 @@ Status SubgraphConstMigrationPass::Run(ComputeGraphPtr graph) {
 
 ///
 /// @ingroup ge
-/// @brief Get all Data nodes for all subgraph.
+/// @brief Get all Const/Data nodes for all subgraph.
 /// @param [in] graph: Root compute graph.
 /// @param [in] func_desc: functional OpDesc of Case.
-/// @param [out] graph_datas: Data groups of subgraph.
+/// @param [out] all_const_nodes: Const groups of subgraph.
+/// @param [out] all_data_nodes: Data groups of subgraph.
 /// @return 0: SUCCESS / others: FAILED
 ///
-Status SubgraphConstMigrationPass::ClassifyDataNodes(const ComputeGraphPtr &graph, const OpDescPtr &func_desc,
-                                                     map<ComputeGraphPtr, map<uint32_t, NodePtr>> &graph_datas) {
+Status SubgraphConstMigrationPass::ClassifyGraphNodes(const ComputeGraphPtr &graph, const OpDescPtr &func_desc,
+                                                      map<ComputeGraphPtr, map<string, NodePtr>> &all_const_nodes,
+                                                      map<ComputeGraphPtr, map<uint32_t, NodePtr>> &all_data_nodes) {
   for (const auto &name : func_desc->GetSubgraphInstanceNames()) {
     const auto &subgraph = graph->GetSubgraph(name);
     if (subgraph == nullptr) {
@@ -170,32 +145,63 @@ Status SubgraphConstMigrationPass::ClassifyDataNodes(const ComputeGraphPtr &grap
       return GE_GRAPH_EMPTY_SUBGRAPH;
     }
 
-    auto &data_nodes = graph_datas[subgraph];
-    for (auto &data : subgraph->GetDirectNode()) {
-      if (data->GetType() != DATA) {
-        continue;
-      }
+    set<NodePtr> ctrl_only_const_nodes;
+    auto &data_nodes = all_data_nodes[subgraph];
+    auto &const_nodes = all_const_nodes[subgraph];
+    for (auto &node : subgraph->GetDirectNode()) {
+      if (node->GetType() == DATA) {
+        uint32_t parent_index = kInvalidParent;
+        if (!AttrUtils::GetInt(node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) {
+          return FAILED;
+        }
 
-      uint32_t parent_index = 0;
-      if (!AttrUtils::GetInt(data->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) {
-        GELOGE(FAILED, "Parent index not found, name: %s", data->GetName().c_str());
-        return FAILED;
-      }
+        data_nodes[parent_index] = node;
+        GELOGD("%s, index: %u, Data: %s", subgraph->GetName().c_str(), parent_index, node->GetName().c_str());
+      } else if ((node->GetType() == CONSTANT) && (node->GetOutDataAnchor(kZeroIndex) != nullptr)) {
+        set<string> peer_name_list;
+        const auto &out_anchor = node->GetOutDataAnchor(kZeroIndex);
+        for (const auto &in_anchor : out_anchor->GetPeerInDataAnchors()) {
+          const auto &peer_node = in_anchor->GetOwnerNode();
+          // Trim subgraph node name prefix.
+          string node_full_name = peer_node->GetName();
+          size_t pos = node_full_name.find(kMbatchNodeNameMark);
+          if (pos == string::npos) {
+            GELOGE(FAILED, "find: %s of multi-batch in node: %s", kMbatchNodeNameMark.c_str(), node_full_name.c_str());
+            return FAILED;
+          }
+
+          string fixed_name = node_full_name.substr(0, pos);
+          pos = node_full_name.find("_", pos + kMbatchNodeNameMark.length());
+          if (pos != string::npos) {
+            fixed_name += node_full_name.substr(pos);
+          }
+
+          peer_name_list.insert(fixed_name + ":" + std::to_string(in_anchor->GetIdx()));
+        }
+
+        if (peer_name_list.empty()) {
+          GELOGI("%s, Const: %s, no data output", subgraph->GetName().c_str(), node->GetName().c_str());
+          const auto in_all_nodes = node->GetInAllNodes();
+          if (in_all_nodes.empty() || std::all_of(in_all_nodes.begin(), in_all_nodes.end(),
+                                                  [](const NodePtr &n) { return n->GetType() == DATA; })) {
+            ctrl_only_const_nodes.insert(node);
+          }
+          continue;
+        }
 
-      data_nodes[parent_index] = data;
-      GELOGD("%s, Parent index: %u, Data: %s", subgraph->GetName().c_str(), parent_index, data->GetName().c_str());
+        string key_of_const;
+        for (const string &name : peer_name_list) {
+          key_of_const += (key_of_const.empty() ? name : "_" + name);
+        }
+
+        const_nodes[key_of_const] = node;
+        GELOGD("%s, Const: %s, Key: %s", subgraph->GetName().c_str(), node->GetName().c_str(), key_of_const.c_str());
+      }
     }
-  }
 
-  auto iter = graph_datas.begin();
-  if (iter == graph_datas.end()) {
-    return SUCCESS;
-  }
-  for (const auto &data_nodes : graph_datas) {
-    if (data_nodes.second.size() != iter->second.size()) {
-      GELOGE(FAILED, "Subgraph %s has invalid Data nodes[%zu != %zu]",
-             data_nodes.first->GetName().c_str(), data_nodes.second.size(), iter->second.size());
-      return FAILED;
+    for (auto &node : ctrl_only_const_nodes) {
+      GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveNodeWithoutRelink(subgraph, node),
+          "Remove node without relink failed, node: %s", node->GetName().c_str());
     }
   }
 
@@ -204,36 +210,27 @@ Status SubgraphConstMigrationPass::ClassifyDataNodes(const ComputeGraphPtr &grap
 
 ///
 /// @ingroup ge
-/// @brief Get all Data nodes for all subgraph.
-/// @param [in] node: Const node of subgraph.
-/// @param [out] inputs: parent index to Const.
-/// @param [out] outputs: Data groups of subgraph.
+/// @brief Get parent_index for Const node migration.
+/// @param [in] all_data_nodes: Data groups of subgraph.
+/// @param [in] const_node: Const node will process.
+/// @param [out] parent_index: parent index for replace Data.
 /// @return true: SUCCESS / false: FAILED
 ///
-bool SubgraphConstMigrationPass::GetAssociatedNodes(const NodePtr &node, map<uint32_t, uint32_t> &inputs,
-                                                    map<uint32_t, uint32_t> &outputs) {
-  for (uint32_t i = 0; i < node->GetAllOutDataAnchorsSize(); ++i) {
-    outputs[i] = kInvalidParent;
-  }
-
-  uint32_t out_index = 0;
-  const auto in_nodes = node->GetInAllNodes();
-  for (size_t i = 0; i < in_nodes.size(); ++i) {
-    const auto owner_node = in_nodes.at(i);
-    if (owner_node->GetType() != DATA) {
+bool SubgraphConstMigrationPass::GetAssociatedNodes(const map<ComputeGraphPtr, map<uint32_t, NodePtr>> &all_data_nodes,
+                                                    const NodePtr &const_node, uint32_t &parent_index) {
+  for (const auto in_node : const_node->GetInAllNodes()) {
+    if (in_node->GetType() != DATA) {
       return false;
     }
 
-    uint32_t parent_index = 0;
-    if (!AttrUtils::GetInt(owner_node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) {
+    uint32_t node_index = 0;
+    if (!AttrUtils::GetInt(in_node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, node_index)) {
       return false;
     }
 
     // Input Data feed other Node, need add new Data.
-    inputs[i] = parent_index;
-    if ((out_index == outputs.size()) && owner_node->GetOutDataNodes().empty()) {
-      outputs[out_index] = parent_index;
-      ++out_index;
+    if ((parent_index == kInvalidParent) && in_node->GetOutDataNodes().empty()) {
+      parent_index = node_index;
     }
   }
 
@@ -242,43 +239,26 @@ bool SubgraphConstMigrationPass::GetAssociatedNodes(const NodePtr &node, map<uin
 
 ///
 /// @ingroup ge
-/// @brief Get all Data nodes for all subgraph.
-/// @param [in] graph_nodes: Data groups of subgraph.
-/// @param [in] data_base: Data Node for migration.
-/// @param [in] data_idx: Data groups of subgraph.
-/// @param [in] data_idx: Data groups of subgraph.
+/// @brief Check parallel node is same for all subgraph.
+/// @param [in] all_const_nodes: Const groups of subgraph.
+/// @param [in] const_node: Const Node for migration.
+/// @param [in] node_key: Key of Const node.
 /// @return true: Same / false: not same
 ///
-bool SubgraphConstMigrationPass::IsParallelNodeSame(const map<ComputeGraphPtr, map<uint32_t, NodePtr>> &graph_datas,
-                                                    const NodePtr &const_node, uint32_t parent_index, size_t index) {
-  auto it = graph_datas.begin();
-  for (++it; it != graph_datas.end(); ++it) {
-    const auto &data_nodes = it->second;
-    auto data_it = data_nodes.find(parent_index);
-    if (data_it == data_nodes.end()) {
-      GELOGE(FAILED, "Data: %s not fount, index: %u", const_node->GetName().c_str(), parent_index);
-      return false;
-    }
-
-    const auto &work_data = data_it->second;
-    const auto &out_anchor = work_data->GetOutControlAnchor();
-    const auto &in_anchors = out_anchor->GetPeerInControlAnchors();
-    if (in_anchors.size() <= index || in_anchors.at(index) == nullptr) {
-      GELOGW("Node anchors not same, Data: %s -> %s anchor size: %zu, index: %zu",
-             work_data->GetName().c_str(), const_node->GetName().c_str(), in_anchors.size(), index);
-      return false;
-    }
-
-    const auto &in_anchor = in_anchors.at(index);
-    const auto &work_node = in_anchor->GetOwnerNode();
-    if (work_node == nullptr) {
-      GELOGE(FAILED, "Data: %s not found, parent: %u, index: %zu", const_node->GetName().c_str(), parent_index, index);
+bool SubgraphConstMigrationPass::IsParallelNodeSame(const map<ComputeGraphPtr, map<string, NodePtr>> &all_const_nodes,
+                                                    const NodePtr &const_node, const string &node_key) {
+  auto it = all_const_nodes.begin();
+  for (++it; it != all_const_nodes.end(); ++it) {
+    const auto &const_nodes = it->second;
+    auto node_it = const_nodes.find(node_key);
+    if (node_it == const_nodes.end()) {
+      GELOGW("Const node: %s not fount, key: %s", const_node->GetName().c_str(), node_key.c_str());
       return false;
     }
 
-    if (!IsSameOpNode(const_node, work_node)) {
-      GELOGI("OpDesc not same: %s %s, parent: %u, index: %zu",
-             const_node->GetName().c_str(), work_node->GetName().c_str(), parent_index, index);
+    const auto &work_node = node_it->second;
+    if (!IsSameConstNode(const_node, work_node)) {
+      GELOGI("Not same: %s %s, key: %s", const_node->GetName().c_str(), work_node->GetName().c_str(), node_key.c_str());
       return false;
     }
   }
@@ -291,51 +271,34 @@ bool SubgraphConstMigrationPass::IsParallelNodeSame(const map<ComputeGraphPtr, m
 /// @brief Migration subgraph Node to Root
 /// @param [in] graph: Root compute graph.
 /// @param [in] func_node: functional Node of Case.
-/// @param [in] graph_nodes: Data groups of subgraph.
-/// @param [in] data_base: Data Node for migration.
-/// @param [in] data_idx: Data groups of subgraph.
+/// @param [in] all_const_nodes: Const groups of subgraph.
+/// @param [in] all_data_nodes: Data groups of subgraph.
+/// @param [in] const_node: Const Node for migration.
+/// @param [in] node_key: Key of Const node for migration.
 /// @return 0: SUCCESS / others: FAILED
 ///
 Status SubgraphConstMigrationPass::GraphNodeMigration(const ComputeGraphPtr &graph, const NodePtr &func_node,
-                                                      map<ComputeGraphPtr, map<uint32_t, NodePtr>> &graph_datas,
-                                                      const NodePtr &data_node, uint32_t parent_index) {
-  bool can_extrapolation = false;
-  do {
-    can_extrapolation = false;
-    const auto &out_anchor = data_node->GetOutControlAnchor();
-    const auto &in_anchors = out_anchor->GetPeerInControlAnchors();
-    for (size_t i = in_anchors.size(); i > 0; --i) {
-      const auto &in_anchor = in_anchors.at(i - 1);
-      const auto &work_node = in_anchor->GetOwnerNode();
-      GELOGD("Data: %s, node: %s, parent: %u, index: %zu",
-             data_node->GetName().c_str(), work_node->GetName().c_str(), parent_index, i);
-      if (work_node->GetType() != CONSTANT) {
-        continue;
-      }
-
-      // Get associated Data, if Data feed other nodes, need append new Data.
-      map<uint32_t, uint32_t> inputs;
-      map<uint32_t, uint32_t> outputs;
-      if (!GetAssociatedNodes(work_node, inputs, outputs)) {
-        continue;
-      }
+                                                      const map<ComputeGraphPtr, map<string, NodePtr>> &all_const_nodes,
+                                                      map<ComputeGraphPtr, map<uint32_t, NodePtr>> &all_data_nodes,
+                                                      const NodePtr &const_node, const string &node_key) {
+  if (!IsParallelNodeSame(all_const_nodes, const_node, node_key)) {
+    return SUCCESS;
+  }
 
-      if (!IsParallelNodeSame(graph_datas, work_node, parent_index, i - 1)) {
-        continue;
-      }
+  // Get associated Data, if Data feed other nodes, need append new Data.
+  uint32_t parent_index = kInvalidParent;
+  if (!GetAssociatedNodes(all_data_nodes, const_node, parent_index)) {
+    return SUCCESS;
+  }
 
-      GELOGI("Move node: %s, parent: %u, index: %zu", work_node->GetName().c_str(), parent_index, i);
-      if (AppendParallelNode(graph_datas, func_node, outputs) != SUCCESS) {
-        return FAILED;
-      }
+  GELOGI("Move node: %s, parent index: %u", const_node->GetName().c_str(), parent_index);
+  if (AppendParallelNode(func_node, parent_index, all_data_nodes) != SUCCESS) {
+    return FAILED;
+  }
 
-      if (MoveNodeToParent(graph, func_node, graph_datas, parent_index, i - 1, inputs, outputs) != SUCCESS) {
-        return FAILED;
-      }
-      can_extrapolation = true;
-      break;
-    }
-  } while (can_extrapolation);
+  if (MoveNodeToParent(graph, func_node, all_const_nodes, all_data_nodes, node_key, parent_index) != SUCCESS) {
+    return FAILED;
+  }
 
   return SUCCESS;
 }
@@ -343,114 +306,101 @@ Status SubgraphConstMigrationPass::GraphNodeMigration(const ComputeGraphPtr &gra
 ///
 /// @ingroup ge
 /// @brief Append Input Tensor for functional node.
-/// @param [in] graph_nodes: Data groups of subgraph.
 /// @param [in] func_node: functional Node of Case.
-/// @param [in] outputs: Parent index of Node output.
+/// @param [in/out] parent_index: Parent index for migration.
+/// @param [in/out] all_data_nodes: Data groups of subgraph.
 /// @return 0: SUCCESS / others: FAILED
 ///
-Status SubgraphConstMigrationPass::AppendParallelNode(map<ComputeGraphPtr, map<uint32_t, NodePtr>> &graph_datas,
-                                                      const NodePtr &func_node, map<uint32_t, uint32_t> &outputs) {
+Status SubgraphConstMigrationPass::AppendParallelNode(const NodePtr &func_node, uint32_t &parent_index,
+                                                      map<ComputeGraphPtr, map<uint32_t, NodePtr>> &all_data_nodes) {
   // If outputs index invalid, add Data and Input Tensor.
-  for (auto &item : outputs) {
-    if (item.second != kInvalidParent) {
-      continue;
-    }
-
-    // Add Data to subgraph.
-    map<ComputeGraphPtr, uint32_t> append_num;
-    for (auto &groups : graph_datas) {
-      const auto &subgraph = groups.first;
-      auto &data_nodes = groups.second;
-
-      item.second = func_node->GetAllInDataAnchorsSize() + append_num[subgraph]; // Update to valid parent index.
-      const auto data_name = subgraph->GetName() + "_data_" + std::to_string(item.second);
-
-      OpDescBuilder op_builder(data_name, DATA);
-      const OpDescPtr op_desc = op_builder.AddInput("x").AddOutput("y").Build();
-      if (op_desc == nullptr) {
-        GELOGE(OUT_OF_MEMORY, "Create multi-batch subgraph data desc failed");
-        return OUT_OF_MEMORY;
-      }
+  if (parent_index != kInvalidParent) {
+    return SUCCESS;
+  }
 
-      uint32_t data_index = item.second - kCaseInputBase;
-      if (!AttrUtils::SetInt(op_desc, ATTR_NAME_INDEX, data_index)) {
-        GELOGE(FAILED, "Parent index not found, name: %s", op_desc->GetName().c_str());
-        return FAILED;
-      }
+  // Add Data to subgraph.
+  parent_index = func_node->GetAllInDataAnchorsSize();  // Update to valid parent index.
+  for (auto &item : all_data_nodes) {
+    const auto &subgraph = item.first;
+    const auto data_name = subgraph->GetName() + "_data_" + std::to_string(parent_index);
+    OpDescBuilder op_builder(data_name, DATA);
+    const auto op_desc = op_builder.AddInput("x").AddOutput("y").Build();
+    if (op_desc == nullptr) {
+      GELOGE(OUT_OF_MEMORY, "Create multi-batch subgraph data desc failed");
+      return OUT_OF_MEMORY;
+    }
 
-      if (!AttrUtils::SetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, item.second)) {
-        GELOGE(FAILED, "Parent index not found, name: %s", op_desc->GetName().c_str());
-        return FAILED;
-      }
+    uint32_t data_index = parent_index - kCaseInputBase;
+    if (!AttrUtils::SetInt(op_desc, ATTR_NAME_INDEX, data_index)) {
+      GELOGE(FAILED, "Parent index not found, name: %s", op_desc->GetName().c_str());
+      return FAILED;
+    }
 
-      append_num[subgraph]++;
-      data_nodes[item.second] = subgraph->AddNode(op_desc);
-      GELOGI("Add Node: %s, parent index: %u", op_desc->GetName().c_str(), item.second);
+    if (!AttrUtils::SetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) {
+      GELOGE(FAILED, "Parent index not found, name: %s", op_desc->GetName().c_str());
+      return FAILED;
     }
 
-    // Add InputTensor to functional Node.
-    NodeUtils::AppendInputAnchor(func_node, item.second + 1);
+    item.second[parent_index] = subgraph->AddNode(op_desc);
+    GELOGI("Add Node: %s, parent index: %u", op_desc->GetName().c_str(), parent_index);
   }
 
+  // Add InputTensor to functional Node.
+  NodeUtils::AppendInputAnchor(func_node, parent_index + 1);
   return SUCCESS;
 }
 
 ///
 /// @ingroup ge
-/// @brief Delete Node from all subgraph.
-/// @param [in] graph_nodes: Data groups of subgraph.
-/// @param [in] detach: Node will move to parent.
-/// @param [in] outputs: Parent index of Node output.
+/// @brief Delete Node from subgraph.
+/// @param [in] graph: subgraph for process.
+/// @param [in] const_node: Node will move to parent.
+/// @param [in] data_node: Place holder for Const.
 /// @return 0: SUCCESS / others: FAILED
 ///
-Status SubgraphConstMigrationPass::DetachParallelNode(const map<uint32_t, NodePtr> &graph_datas, const NodePtr &detach,
-                                                      const map<uint32_t, uint32_t> &outputs) {
+Status SubgraphConstMigrationPass::DetachParallelNode(const ComputeGraphPtr &graph, const NodePtr &const_node,
+                                                      const NodePtr &data_node) {
   // Break Data and Move node.
-  const auto &in_anchor = detach->GetInControlAnchor();
-  const auto &out_anchors = in_anchor->GetPeerOutControlAnchors();
-  for (size_t i = out_anchors.size(); i > 0; --i) {
-    const auto &out_anchor = out_anchors.at(i - 1);
+  const auto &in_anchor = const_node->GetInControlAnchor();
+  const auto out_anchors = in_anchor->GetPeerOutControlAnchors();
+  for (const auto out_anchor : out_anchors) {
     GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveEdge(out_anchor, in_anchor), "Remove edge failed");
-    const auto &owner_node = out_anchor->GetOwnerNode();
-    GELOGI("Remove Edge: %s %s", owner_node->GetName().c_str(), detach->GetName().c_str());
-  }
-
-  // Break Move and follow, Link Data and follow.
-  for (uint32_t i = 0; i < detach->GetAllOutDataAnchorsSize(); ++i) {
-    auto it_idx = outputs.find(i);
-    if (it_idx == outputs.end()) {
-      GELOGE(FAILED, "Node: %s parent index %u not found", detach->GetName().c_str(), i);
-      return FAILED;
-    }
-
-    auto it_data = graph_datas.find(it_idx->second);
-    if (it_data == graph_datas.end()) {
-      GELOGE(FAILED, "Node: %s parent index %u not found", detach->GetName().c_str(), i);
-      return FAILED;
+    const auto owner_node = out_anchor->GetOwnerNode();
+    GELOGI("Remove Edge: %s %s", owner_node->GetName().c_str(), const_node->GetName().c_str());
+    if (owner_node->GetInAllNodes().empty() && owner_node->GetOutAllNodes().empty() && owner_node != data_node) {
+      GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveNodeWithoutRelink(graph, owner_node),
+          "Remove node without relink failed, node: %s", owner_node->GetName().c_str());
     }
+  }
 
-    const auto &data_node = it_data->second;
-    const auto &out_anchor = detach->GetOutDataAnchor(i);
+  const auto &ctrl_anchor = const_node->GetOutControlAnchor();
+  const auto ctrl_anchors = ctrl_anchor->GetPeerInControlAnchors();
+  for (const auto in_anchor : ctrl_anchors) {
+    GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveEdge(ctrl_anchor, in_anchor), "Remove edge failed");
+    GELOGI("Remove Edge: %s %s", const_node->GetName().c_str(), in_anchor->GetOwnerNode()->GetName().c_str());
 
-    const auto &out_desc = detach->GetOpDesc()->GetOutputDesc(i);
-    const auto &data_desc = data_node->GetOpDesc();
-    (void)data_desc->UpdateInputDesc(kDataOutIndex, out_desc);    // Set Data Input to new connect Node.
-    (void)data_desc->UpdateOutputDesc(kDataOutIndex, out_desc);   // Set Data Output to new connect Node.
+    GE_CHK_GRAPH_STATUS_RET(GraphUtils::AddEdge(data_node->GetOutControlAnchor(), in_anchor), "Add edge failed");
+    GELOGI("Add Edge: %s %s", data_node->GetName().c_str(), in_anchor->GetOwnerNode()->GetName().c_str());
+  }
 
-    for (const auto &in_anchor : out_anchor->GetPeerInDataAnchors()) {
-      if (in_anchor == nullptr) {
-          continue;
-      }
-      GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveEdge(out_anchor, in_anchor), "Remove edge failed");
-      const auto &owner_node = in_anchor->GetOwnerNode();
-      GELOGI("Remove Edge: %s %s", detach->GetName().c_str(), owner_node->GetName().c_str());
+  // Break Move and follow, Link Data and follow.
+  const auto &out_anchor = const_node->GetOutDataAnchor(kZeroIndex);
+  const auto in_anchors =out_anchor->GetPeerInDataAnchors();
+  for (const auto in_anchor : in_anchors) {
+    GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveEdge(out_anchor, in_anchor), "Remove edge failed");
+    GELOGI("Remove Edge: %s %s", const_node->GetName().c_str(), in_anchor->GetOwnerNode()->GetName().c_str());
 
-      const auto &data_out_anchor = data_node->GetOutDataAnchor(kDataOutIndex);
-      GE_CHK_GRAPH_STATUS_RET(GraphUtils::AddEdge(data_out_anchor, in_anchor), "Add edge failed");
-      GELOGI("Add Edge: %s %s", data_node->GetName().c_str(), owner_node->GetName().c_str());
-    }
+    GE_CHK_GRAPH_STATUS_RET(GraphUtils::AddEdge(data_node->GetOutDataAnchor(kZeroIndex), in_anchor), "Add edge failed");
+    GELOGI("Add Edge: %s %s", data_node->GetName().c_str(), in_anchor->GetOwnerNode()->GetName().c_str());
   }
 
+  // Update Data op DataType.
+  const auto &const_desc = const_node->GetOpDesc();
+  const auto &tensor_desc = const_desc->GetOutputDesc(kZeroIndex);
+  const auto &data_desc = data_node->GetOpDesc();
+  (void)data_desc->UpdateInputDesc(kZeroIndex, tensor_desc);    // Set Data Input to new connect Node.
+  (void)data_desc->UpdateOutputDesc(kZeroIndex, tensor_desc);   // Set Data Output to new connect Node.
+
   return SUCCESS;
 }
 
@@ -459,47 +409,38 @@ Status SubgraphConstMigrationPass::DetachParallelNode(const map<uint32_t, NodePt
 /// @brief Move Node to Parent Graph.
 /// @param [in] graph: Parent compute graph.
 /// @param [in] func_node: functional Node of Case.
-/// @param [in] attach: Node will move to parent.
-/// @param [in] inputs: Parent index of Node input.
-/// @param [in] outputs: Parent index of Node output.
+/// @param [in] const_node: Node will move to parent.
+/// @param [in] parent_index: Parent index of Node input.
 /// @return 0: SUCCESS / others: FAILED
 ///
 Status SubgraphConstMigrationPass::AttachParallelNode(const ComputeGraphPtr &graph, const NodePtr &func_node,
-                                                      const NodePtr &attach, const map<uint32_t, uint32_t> &inputs,
-                                                      const map<uint32_t, uint32_t> &outputs) {
-  GE_CHECK_NOTNULL(attach);
-  for (const auto item : inputs) {
-    if (item.second == kInvalidParent) {   // Not connect, Skip.
-      continue;
-    }
-
-    const auto &in_anchor = func_node->GetInDataAnchor(item.second);
-    const auto &out_anchor = in_anchor->GetPeerOutAnchor();
-    const auto &owner_node = out_anchor->GetOwnerNode();
-    const auto &in_control = attach->GetInControlAnchor();
-    GE_CHK_GRAPH_STATUS_RET(GraphUtils::AddEdge(owner_node->GetOutControlAnchor(), in_control), "Add edge failed");
-    GELOGI("Add Edge: %s %s", owner_node->GetName().c_str(), attach->GetName().c_str());
+                                                      const NodePtr &const_node, uint32_t parent_index) {
+  GE_CHECK_NOTNULL(const_node);
+  if (parent_index == kInvalidParent) {
+    return INTERNAL_ERROR;
   }
 
-  for (const auto &item : outputs) {
-    const auto &func_desc = func_node->GetOpDesc();
-    const auto &out_desc = attach->GetOpDesc()->GetOutputDesc(item.second);
-    (void)func_desc->UpdateInputDesc(item.second, out_desc);    // Set Data Input to new connect Node.
-
-    const auto &in_anchor = func_node->GetInDataAnchor(item.second);
-    const auto &out_anchor = in_anchor->GetPeerOutAnchor();
-    if (out_anchor != nullptr) {
-      GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveEdge(out_anchor, in_anchor), "Remove edge failed");
-      const auto &owner_node = out_anchor->GetOwnerNode();
-      GELOGI("Remove Edge: %s %s", owner_node->GetName().c_str(), func_node->GetName().c_str());
+  const auto &func_desc = func_node->GetOpDesc();
+  const auto &tensor_desc = const_node->GetOpDesc()->GetOutputDesc(kZeroIndex);
+  (void)func_desc->UpdateInputDesc(parent_index, tensor_desc);    // Set Data Input to new connect Node.
+
+  const auto &in_anchor = func_node->GetInDataAnchor(parent_index);
+  const auto &out_anchor = in_anchor->GetPeerOutAnchor();
+  if (out_anchor != nullptr) {  // Break useless old link.
+    GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveEdge(out_anchor, in_anchor), "Remove edge failed");
+    const auto owner_node = out_anchor->GetOwnerNode();
+    GELOGI("Remove Edge: %s %s", owner_node->GetName().c_str(), func_node->GetName().c_str());
+    if (owner_node->GetInAllNodes().empty() && owner_node->GetOutAllNodes().empty()) {
+      GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveNodeWithoutRelink(graph, owner_node),
+          "Remove node without relink failed, node: %s", owner_node->GetName().c_str());
     }
-    GE_CHK_GRAPH_STATUS_RET(GraphUtils::AddEdge(attach->GetOutDataAnchor(item.first), in_anchor), "Add edge failed");
-    GELOGI("Add Edge: %s %s", attach->GetName().c_str(), func_node->GetName().c_str());
   }
+  GE_CHK_GRAPH_STATUS_RET(GraphUtils::AddEdge(const_node->GetOutDataAnchor(kZeroIndex), in_anchor), "Add edge failed");
+  GELOGI("Add Edge: %s %s, index: %u", const_node->GetName().c_str(), func_node->GetName().c_str(), parent_index);
 
-  (void)graph->AddNode(attach);
-  (void)attach->SetOwnerComputeGraph(graph);
-  GELOGI("Add Node: %s %s", graph->GetName().c_str(), attach->GetName().c_str());
+  (void)graph->AddNode(const_node);
+  (void)const_node->SetOwnerComputeGraph(graph);
+  GELOGI("Add Node: %s %s", graph->GetName().c_str(), const_node->GetName().c_str());
   return SUCCESS;
 }
 
@@ -515,55 +456,50 @@ Status SubgraphConstMigrationPass::AttachParallelNode(const ComputeGraphPtr &gra
 /// @return 0: SUCCESS / others: FAILED
 ///
 Status SubgraphConstMigrationPass::MoveNodeToParent(const ComputeGraphPtr &graph, const NodePtr &func_node,
-                                                    const map<ComputeGraphPtr, map<uint32_t, NodePtr>> &graph_datas,
-                                                    uint32_t parent_index, uint32_t index,
-                                                    const map<uint32_t, uint32_t> &inputs,
-                                                    const map<uint32_t, uint32_t> &outputs) {
-  if (inputs.empty()) {
-    GELOGE(FAILED, "Graph: %s, inputs is empty", graph->GetName().c_str());
+                                                    const map<ComputeGraphPtr, map<string, NodePtr>> &all_const_nodes,
+                                                    const map<ComputeGraphPtr, map<uint32_t, NodePtr>> &all_data_nodes,
+                                                    const string &node_key, uint32_t parent_index) {
+  if (node_key.empty() || parent_index == kInvalidParent) {
+    GELOGE(FAILED, "Graph: %s, node key: %s, parent index: %u invalid",
+           graph->GetName().c_str(), node_key.c_str(), parent_index);
     return FAILED;
   }
 
   NodePtr move_node;
-  for (auto &groups : graph_datas) {
-    const auto &subgraph = groups.first;
-    const auto &data_nodes = groups.second;
-    auto it = data_nodes.find(parent_index);
-    if (it == data_nodes.end()) {
-      GELOGE(FAILED, "Graph: %s, Data: %u node not found", subgraph->GetName().c_str(), parent_index);
+  for (auto &item : all_const_nodes) {
+    const auto &subgraph = item.first;
+    const auto it_const = item.second.find(node_key);
+    if (it_const == item.second.end()) {
+      GELOGE(FAILED, "Graph: %s, Const: %s node not found", subgraph->GetName().c_str(), node_key.c_str());
       return FAILED;
     }
+    move_node = it_const->second;
 
-    const auto &base_data = it->second;
-    const auto &out_anchor = base_data->GetOutControlAnchor();
-    const auto &in_anchors = out_anchor->GetPeerInControlAnchors();
-    if (in_anchors.size() <= index || in_anchors.at(index) == nullptr) {
-      GELOGE(FAILED, "Data: %s, anchor size: %zu, index: %u not found",
-             base_data->GetName().c_str(), in_anchors.size(), index);
+    const auto it_nodes = all_data_nodes.find(subgraph);
+    if (it_nodes == all_data_nodes.end()) {
+      GELOGE(FAILED, "Graph: %s, Const: %s node not found", subgraph->GetName().c_str(), node_key.c_str());
       return FAILED;
     }
-
-    const auto &in_anchor = in_anchors.at(index);
-    move_node = in_anchor->GetOwnerNode();
-    if (move_node == nullptr) {
-      GELOGE(FAILED, "Data: %s not found, index: %u", base_data->GetName().c_str(), parent_index);
+    const auto it_data = it_nodes->second.find(parent_index);
+    if (it_data == it_nodes->second.end()) {
+      GELOGE(FAILED, "Graph: %s, Const: %s node not found", subgraph->GetName().c_str(), node_key.c_str());
       return FAILED;
     }
 
-    if (DetachParallelNode(data_nodes, move_node, outputs) != SUCCESS) {
-      GELOGE(FAILED, "Data: %s not found, index: %u", base_data->GetName().c_str(), parent_index);
+    if (DetachParallelNode(subgraph, move_node, it_data->second) != SUCCESS) {
+      GELOGE(FAILED, "Data: %s not found, index: %u", move_node->GetName().c_str(), parent_index);
       return FAILED;
     }
 
-    GE_CHK_GRAPH_STATUS_RET(subgraph->RemoveNode(move_node), "Remove node failed");
+    GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveNodeWithoutRelink(subgraph, move_node),
+        "Remove node without relink failed, node: %s", move_node->GetName().c_str());
     GELOGI("Remove Node: %s %s", subgraph->GetName().c_str(), move_node->GetName().c_str());
   }
 
-  if (AttachParallelNode(graph, func_node, move_node, inputs, outputs) != SUCCESS) {
+  if (AttachParallelNode(graph, func_node, move_node, parent_index) != SUCCESS) {
     return FAILED;
   }
 
-  migration_append_ = true;
   return SUCCESS;
 }
 }  // namespace ge
diff --git a/ge/graph/passes/subgraph_const_migration_pass.h b/ge/graph/passes/subgraph_const_migration_pass.h
index 3c087852..d93da839 100755
--- a/ge/graph/passes/subgraph_const_migration_pass.h
+++ b/ge/graph/passes/subgraph_const_migration_pass.h
@@ -36,50 +36,54 @@ class SubgraphConstMigrationPass : public GraphPass {
  private:
   ///
   /// @ingroup ge
-  /// @brief Get all Data nodes for all subgraph.
+  /// @brief Get all Const/Data nodes for all subgraph.
   /// @param [in] graph: Root compute graph.
   /// @param [in] func_desc: functional OpDesc of Case.
-  /// @param [out] graph_datas: Data groups of subgraph.
+  /// @param [out] all_const_nodes: Const groups of subgraph.
+  /// @param [out] all_data_nodes: Data groups of subgraph.
   /// @return 0: SUCCESS / others: FAILED
   ///
-  Status ClassifyDataNodes(const ComputeGraphPtr &graph, const OpDescPtr &func_desc,
-                           map<ComputeGraphPtr, map<uint32_t, NodePtr>> &graph_datas);
+  Status ClassifyGraphNodes(const ComputeGraphPtr &graph, const OpDescPtr &func_desc,
+                            map<ComputeGraphPtr, map<string, NodePtr>> &all_const_nodes,
+                            map<ComputeGraphPtr, map<uint32_t, NodePtr>> &all_data_nodes);
 
   ///
   /// @ingroup ge
-  /// @brief Get all Data nodes for all subgraph.
-  /// @param [in] node: Const node of subgraph.
-  /// @param [in] func_desc: functional OpDesc of Case.
-  /// @param [out] graph_nodes: Data groups of subgraph.
+  /// @brief Get parent_index for Const node migration.
+  /// @param [in] all_data_nodes: Data groups of subgraph.
+  /// @param [in] const_node: Const node will process.
+  /// @param [out] parent_index: parent index for replace Data.
   /// @return true: SUCCESS / false: FAILED
   ///
-  bool GetAssociatedNodes(const NodePtr &node, map<uint32_t, uint32_t> &inputs, map<uint32_t, uint32_t> &outputs);
+  bool GetAssociatedNodes(const map<ComputeGraphPtr, map<uint32_t, NodePtr>> &all_data_nodes,
+                          const NodePtr &const_node, uint32_t &parent_index);
 
   ///
   /// @ingroup ge
-  /// @brief Get all Data nodes for all subgraph.
-  /// @param [in] graph_nodes: Data groups of subgraph.
-  /// @param [in] data_base: Data Node for migration.
-  /// @param [in] data_idx: Data groups of subgraph.
-  /// @param [in] data_idx: Data groups of subgraph.
+  /// @brief Check parallel node is same for all subgraph.
+  /// @param [in] all_const_nodes: Const groups of subgraph.
+  /// @param [in] const_node: Const Node for migration.
+  /// @param [in] node_key: Key of Const node.
   /// @return true: Same / false: not same
   ///
-  bool IsParallelNodeSame(const map<ComputeGraphPtr, map<uint32_t, NodePtr>> &graph_nodes,
-                          const NodePtr &const_node, uint32_t parent_index, size_t index);
+  bool IsParallelNodeSame(const map<ComputeGraphPtr, map<string, NodePtr>> &all_const_nodes,
+                          const NodePtr &const_node, const string &node_key);
 
   ///
   /// @ingroup ge
   /// @brief Migration subgraph Node to Root
   /// @param [in] graph: Root compute graph.
   /// @param [in] func_node: functional Node of Case.
-  /// @param [in] graph_nodes: Data groups of subgraph.
-  /// @param [in] data_base: Data Node for migration.
-  /// @param [in] data_idx: Data groups of subgraph.
+  /// @param [in] all_const_nodes: Const groups of subgraph.
+  /// @param [in] all_data_nodes: Data groups of subgraph.
+  /// @param [in] const_node: Const Node for migration.
+  /// @param [in] node_key: Key of Const node for migration.
   /// @return 0: SUCCESS / others: FAILED
   ///
   Status GraphNodeMigration(const ComputeGraphPtr &graph, const NodePtr &func_node,
-                            map<ComputeGraphPtr, map<uint32_t, NodePtr>> &graph_nodes,
-                            const NodePtr &data_base, uint32_t data_idx);
+                            const map<ComputeGraphPtr, map<string, NodePtr>> &all_const_nodes,
+                            map<ComputeGraphPtr, map<uint32_t, NodePtr>> &all_data_nodes,
+                            const NodePtr &const_node, const string &node_key);
 
   ///
   /// @ingroup ge
@@ -93,46 +97,42 @@ class SubgraphConstMigrationPass : public GraphPass {
   /// @return 0: SUCCESS / others: FAILED
   ///
   Status MoveNodeToParent(const ComputeGraphPtr &graph, const NodePtr &func_node,
-                          const map<ComputeGraphPtr, map<uint32_t, NodePtr>> &graph_nodes,
-                          uint32_t parent_index, uint32_t anchor_idx,
-                          const map<uint32_t, uint32_t> &inputs, const map<uint32_t, uint32_t> &outputs);
+                          const map<ComputeGraphPtr, map<string, NodePtr>> &all_const_nodes,
+                          const map<ComputeGraphPtr, map<uint32_t, NodePtr>> &all_data_nodes,
+                          const string &node_key, uint32_t parent_index);
 
   ///
   /// @ingroup ge
   /// @brief Append Input Tensor for functional node.
-  /// @param [in] graph_nodes: Data groups of subgraph.
-  /// @param [in] func_node: functional Node of Case.
-  /// @param [in] outputs: Parent index of Node output.
+  /// @param [in] graph_nodes: Const groups of subgraph.
+  /// @param [in/out] parent_index: Parent index for migration.
+  /// @param [in/out] all_data_nodes: Data groups of subgraph.
   /// @return 0: SUCCESS / others: FAILED
   ///
-  Status AppendParallelNode(map<ComputeGraphPtr, map<uint32_t, NodePtr>> &graph_nodes,
-                            const NodePtr &func_node, map<uint32_t, uint32_t> &outputs);
+  Status AppendParallelNode(const NodePtr &func_node, uint32_t &parent_index,
+                            map<ComputeGraphPtr, map<uint32_t, NodePtr>> &all_data_nodes);
 
   ///
   /// @ingroup ge
-  /// @brief Delete Node from all subgraph.
-  /// @param [in] graph_nodes: Data groups of subgraph.
-  /// @param [in] detach: Node will move to parent.
-  /// @param [in] outputs: Parent index of Node output.
+  /// @brief Delete Node from subgraph.
+  /// @param [in] graph: subgraph for process.
+  /// @param [in] const_node: Node will move to parent.
+  /// @param [in] data_node: Place holder for Const.
   /// @return 0: SUCCESS / others: FAILED
   ///
-  Status DetachParallelNode(const map<uint32_t, NodePtr> &graph_datas, const NodePtr &detach,
-                            const map<uint32_t, uint32_t> &outputs);
+  Status DetachParallelNode(const ComputeGraphPtr &graph, const NodePtr &const_node, const NodePtr &data_node);
 
   ///
   /// @ingroup ge
   /// @brief Move Node to Parent Graph.
   /// @param [in] graph: Parent compute graph.
   /// @param [in] func_node: functional Node of Case.
-  /// @param [in] attach: Node will move to parent.
-  /// @param [in] inputs: Parent index of Node input.
-  /// @param [in] outputs: Parent index of Node output.
+  /// @param [in] const_node: Node will move to parent.
+  /// @param [in] parent_index: Parent index of Node input.
   /// @return 0: SUCCESS / others: FAILED
   ///
-  Status AttachParallelNode(const ComputeGraphPtr &graph, const NodePtr &func_node, const NodePtr &attach,
-                            const map<uint32_t, uint32_t> &inputs, const map<uint32_t, uint32_t> &outputs);
-
-  bool migration_append_{false};
+  Status AttachParallelNode(const ComputeGraphPtr &graph, const NodePtr &func_node,
+                            const NodePtr &const_node, uint32_t parent_index);
 };
 }  // namespace ge
 #endif  // GE_COMMON_SUBGRAPH_CONST_MIGRATION_H_
\ No newline at end of file
diff --git a/ge/graph/passes/subgraph_pass.cc b/ge/graph/passes/subgraph_pass.cc
index 88e661a7..d1111d52 100755
--- a/ge/graph/passes/subgraph_pass.cc
+++ b/ge/graph/passes/subgraph_pass.cc
@@ -149,10 +149,10 @@ Status SubgraphPass::SubgraphOutputNode(const ComputeGraphPtr &graph, const Node
     //   5. While->NetOutput in known subgraph
     std::string op_type;
     bool insert_flag = NodeUtils::GetConstOpType(in_node, op_type) ||
-                       IsAtomicRequired(in_node, peer_out_anchor->GetIdx()) || IsOutputContinuesRequired(in_node) ||
-                       ((in_node->GetType() == DATA) && (kWhileOpTypes.count(graph->GetParentNode()->GetType()) == 0)) ||
-                       (!graph->GetGraphUnknownFlag() && NodeUtils::IsDynamicShape(node) &&
-                        (kWhileOpTypes.count(in_node->GetType()) != 0));
+        IsAtomicRequired(in_node, peer_out_anchor->GetIdx()) || IsOutputContinuesRequired(in_node) ||
+        ((in_node->GetType() == DATA) && (kWhileOpTypes.count(graph->GetParentNode()->GetType()) == 0)) ||
+        (!graph->GetGraphUnknownFlag() && NodeUtils::IsDynamicShape(node) &&
+            (kWhileOpTypes.count(in_node->GetType()) != 0));
     if (insert_flag) {
       GELOGD("Insert MemcpyAsync node between %s and %s.", in_node->GetName().c_str(), node->GetName().c_str());
       std::string name = node->GetName() + "_input_" + std::to_string(in_data_anchor->GetIdx()) + "_Memcpy";
diff --git a/ge/graph/passes/switch_dead_branch_elimination.cc b/ge/graph/passes/switch_dead_branch_elimination.cc
index 9358c9c3..70105aea 100644
--- a/ge/graph/passes/switch_dead_branch_elimination.cc
+++ b/ge/graph/passes/switch_dead_branch_elimination.cc
@@ -18,8 +18,6 @@
 
 #include <string>
 #include <vector>
-#include "common/ge_inner_error_codes.h"
-#include "common/types.h"
 #include "framework/common/debug/ge_log.h"
 #include "graph/common/omg_util.h"
 #include "graph/passes/pass_utils.h"
diff --git a/ge/graph/passes/switch_to_stream_switch_pass.cc b/ge/graph/passes/switch_to_stream_switch_pass.cc
index 529480a6..392968e7 100644
--- a/ge/graph/passes/switch_to_stream_switch_pass.cc
+++ b/ge/graph/passes/switch_to_stream_switch_pass.cc
@@ -17,13 +17,8 @@
 #include "graph/passes/switch_to_stream_switch_pass.h"
 #include <stack>
 #include "common/ge/ge_util.h"
-#include "framework/common/debug/ge_log.h"
-#include "framework/common/debug/log.h"
-#include "framework/common/ge_inner_error_codes.h"
-#include "framework/common/types.h"
 #include "ge/ge_api_types.h"
 #include "graph/common/omg_util.h"
-#include "graph/debug/ge_attr_define.h"
 #include "graph/ge_context.h"
 #include "graph/utils/type_utils.h"
 
@@ -72,25 +67,26 @@ Status SwitchToStreamSwitchPass::CheckCycleDependence(const ComputeGraphPtr &gra
   std::unordered_map<NodePtr, std::vector<NodePtr>> cond_switch_map;
   for (const NodePtr &node : graph->GetDirectNode()) {
     GE_CHK_STATUS_RET(GetOriginalType(node, type), "Get node type failed.");
-    if ((type == SWITCH) || (type == REFSWITCH)) {
-      InDataAnchorPtr in_cond_anchor = node->GetInDataAnchor(SWITCH_PRED_INPUT);
-      GE_CHECK_NOTNULL(in_cond_anchor);
-      OutDataAnchorPtr peer_out_anchor = in_cond_anchor->GetPeerOutAnchor();
-      GE_CHECK_NOTNULL(peer_out_anchor);
-      if (FindSwitchCondInput(true, peer_out_anchor) != SUCCESS) {
-        GELOGE(FAILED, "Find pred_input for switch_node %s failed.", node->GetName().c_str());
-        return FAILED;
-      }
+    if ((type != SWITCH) && (type != REFSWITCH)) {
+      continue;
+    }
+    InDataAnchorPtr in_cond_anchor = node->GetInDataAnchor(SWITCH_PRED_INPUT);
+    GE_CHECK_NOTNULL(in_cond_anchor);
+    OutDataAnchorPtr peer_out_anchor = in_cond_anchor->GetPeerOutAnchor();
+    GE_CHECK_NOTNULL(peer_out_anchor);
+    if (FindSwitchCondInput(peer_out_anchor) != SUCCESS) {
+      GELOGE(FAILED, "Find pred_input for switch_node %s failed.", node->GetName().c_str());
+      return FAILED;
+    }
 
-      NodePtr cond_node = peer_out_anchor->GetOwnerNode();
-      auto iter = cond_switch_map.find(cond_node);
-      if (iter == cond_switch_map.end()) {
-        cond_switch_map[cond_node] = { node };
-      } else {
-        iter->second.emplace_back(node);
-      }
-      switch_nodes_.emplace_back(node);
+    NodePtr cond_node = peer_out_anchor->GetOwnerNode();
+    auto iter = cond_switch_map.find(cond_node);
+    if (iter == cond_switch_map.end()) {
+      cond_switch_map[cond_node] = { node };
+    } else {
+      iter->second.emplace_back(node);
     }
+    switch_nodes_.emplace_back(node);
   }
 
   MarkCycleDependence(cond_switch_map);
@@ -124,12 +120,13 @@ void SwitchToStreamSwitchPass::MarkCycleDependence(
       if (visited.count(tmp_node) > 0) {
         continue;
       }
-      GELOGD("MarkCycleDependence: tmp_node=%s.", tmp_node->GetName().c_str());
       for (const NodePtr &out_node : tmp_node->GetOutAllNodes()) {
         if (switch_nodes.find(out_node) == switch_nodes.end()) {
           out_nodes.push(out_node);
           continue;
         }
+        GELOGD("MarkCycleDependence: tmp_node=%s, switch_node=%s.",
+               tmp_node->GetName().c_str(), out_node->GetName().c_str());
         GE_IF_BOOL_EXEC(SetCyclicDependenceFlag(out_node) != SUCCESS,
                         GELOGW("set cyclic dependence attr failed."); return );
         auto map_iter = switch_cyclic_map_.find(out_node);
@@ -241,10 +238,6 @@ Status SwitchToStreamSwitchPass::BypassSwitchNode(const NodePtr &switch_node, Ou
     if (idx == SWITCH_DATA_INPUT) {
       peer_data_anchor = peer_out_anchor;
     } else {
-      if (FindSwitchCondInput(false, peer_out_anchor) != SUCCESS) {
-        GELOGE(FAILED, "Find pred_input for switch_node %s failed.", switch_node->GetName().c_str());
-        return FAILED;
-      }
       peer_cond_anchor = peer_out_anchor;
     }
   }
@@ -254,15 +247,14 @@ Status SwitchToStreamSwitchPass::BypassSwitchNode(const NodePtr &switch_node, Ou
 
 ///
 /// @brief Find Switch cond input
-/// @param [in] pass_switch_flag
 /// @param [out] peer_cond_anchor
 /// @return Status
 ///
-Status SwitchToStreamSwitchPass::FindSwitchCondInput(bool pass_switch_flag, OutDataAnchorPtr &peer_cond_anchor) {
+Status SwitchToStreamSwitchPass::FindSwitchCondInput(OutDataAnchorPtr &peer_cond_anchor) {
   NodePtr tmp_node = nullptr;
-  string type;
-  bool need_pass_type = true;
-  while (need_pass_type) {
+  std::string type;
+  bool pass_flag = true;
+  while (pass_flag) {
     if (tmp_node == nullptr) {
       tmp_node = peer_cond_anchor->GetOwnerNode();
     } else {
@@ -274,7 +266,7 @@ Status SwitchToStreamSwitchPass::FindSwitchCondInput(bool pass_switch_flag, OutD
     }
 
     GE_CHK_STATUS_RET(GetOriginalType(tmp_node, type), "Get node type failed.");
-    need_pass_type = (pass_switch_flag && ((type == SWITCH) || (type == REFSWITCH)));
+    pass_flag = ((type == SWITCH) || (type == REFSWITCH));
   }
 
   return SUCCESS;
@@ -369,7 +361,7 @@ Status SwitchToStreamSwitchPass::MarkBranches(const OutDataAnchorPtr &peer_cond_
     }
   } else {
     int64_t switch_group_id = GetGroupId(stream_switch);
-    map<int64_t, std::vector<std::list<NodePtr>>> switch_group_map;
+    std::map<int64_t, std::vector<std::list<NodePtr>>> switch_group_map;
     std::list<NodePtr> false_node_list;
     std::list<NodePtr> true_node_list;
     std::list<NodePtr> &node_list = true_branch_flag ? true_node_list : false_node_list;
@@ -389,7 +381,7 @@ Status SwitchToStreamSwitchPass::MarkBranches(const OutDataAnchorPtr &peer_cond_
 /// @return group_id
 ///
 int64_t SwitchToStreamSwitchPass::GetGroupId(const NodePtr &node) {
-  string tailing_optimization_option;
+  std::string tailing_optimization_option;
   bool is_tailing_optimization = false;
   if (GetContext().GetOption(OPTION_EXEC_ENABLE_TAILING_OPTIMIZATION, tailing_optimization_option) == GRAPH_SUCCESS) {
     // "1" means it's True from frontend option
@@ -400,7 +392,7 @@ int64_t SwitchToStreamSwitchPass::GetGroupId(const NodePtr &node) {
     return 0;
   }
 
-  string hccl_group_id;
+  std::string hccl_group_id;
   if (!AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_HCCL_FUSED_GROUP, hccl_group_id)) {
     GELOGI("Node %s can not find hccl group id.", node->GetName().c_str());
     return 0;
@@ -432,6 +424,7 @@ Status SwitchToStreamSwitchPass::CombineSwitchNode(const ComputeGraphPtr &graph)
       same_cond_switch.insert(true_switch_list.begin(), true_switch_list.end());
 
       OutDataAnchorPtr peer_cond_anchor = iter->first;
+      GE_CHECK_NOTNULL(peer_cond_anchor);
       NodePtr cond_node = peer_cond_anchor->GetOwnerNode();
       GELOGI("CombineSwitchNode: cond_node=%s.", cond_node->GetName().c_str());
 
@@ -549,6 +542,7 @@ NodePtr SwitchToStreamSwitchPass::CreateCastOp(const ComputeGraphPtr &graph, con
 
   NodePtr cast_node = graph->AddNode(cast_desc);
   GE_CHK_BOOL_EXEC(cast_node != nullptr, return nullptr, "Create cast_node failed.");
+  // Cast node has and only has one input
   GE_CHK_STATUS(GraphUtils::AddEdge(peer_cond_anchor, cast_node->GetInDataAnchor(0)), "Cast add data edge failed.");
 
   return cast_node;
@@ -604,7 +598,7 @@ Status SwitchToStreamSwitchPass::AddConstNode(const ComputeGraphPtr &graph, cons
 ///
 Status SwitchToStreamSwitchPass::ModifySwitchInCtlEdges(const NodePtr &switch_node, const NodePtr &cast_node,
                                                         const std::set<NodePtr> &same_cond_switch) {
-  GELOGI("ModifySwitchInCtlEdges: switch_node=%s, active_node=%s", switch_node->GetName().c_str(),
+  GELOGD("ModifySwitchInCtlEdges: switch_node=%s, cast_node=%s", switch_node->GetName().c_str(),
          cast_node->GetName().c_str());
   std::string orig_switch_name = switch_node->GetName();
   OpDescPtr switch_desc = switch_node->GetOpDesc();
@@ -614,24 +608,24 @@ Status SwitchToStreamSwitchPass::ModifySwitchInCtlEdges(const NodePtr &switch_no
     return INTERNAL_ERROR;
   }
 
-  for (const NodePtr &in_ctl_node : switch_node->GetInControlNodes()) {
-    GE_CHK_STATUS(GraphUtils::RemoveEdge(in_ctl_node->GetOutControlAnchor(), switch_node->GetInControlAnchor()),
+  for (const NodePtr &in_ctrl_node : switch_node->GetInControlNodes()) {
+    GE_CHK_STATUS(GraphUtils::RemoveEdge(in_ctrl_node->GetOutControlAnchor(), switch_node->GetInControlAnchor()),
                   "Remove ctl edge failed.");
-    GE_IF_BOOL_EXEC(!in_ctl_node->GetOutControlAnchor()->IsLinkedWith(cast_node->GetInControlAnchor()), {
-      GE_CHK_STATUS(GraphUtils::AddEdge(in_ctl_node->GetOutControlAnchor(), cast_node->GetInControlAnchor()),
+    GE_IF_BOOL_EXEC(!in_ctrl_node->GetOutControlAnchor()->IsLinkedWith(cast_node->GetInControlAnchor()), {
+      GE_CHK_STATUS(GraphUtils::AddEdge(in_ctrl_node->GetOutControlAnchor(), cast_node->GetInControlAnchor()),
                     "Add ctl edge failed.");
     });
 
-    GE_IF_BOOL_EXEC(in_ctl_node->GetType() != STREAMSWITCH, continue);
-    if (same_cond_switch.count(in_ctl_node) > 0) {
-      GE_CHK_STATUS(GraphUtils::RemoveEdge(in_ctl_node->GetOutControlAnchor(), cast_node->GetInControlAnchor()),
+    GE_IF_BOOL_EXEC(in_ctrl_node->GetType() != STREAMSWITCH, continue);
+    if (same_cond_switch.count(in_ctrl_node) > 0) {
+      GE_CHK_STATUS(GraphUtils::RemoveEdge(in_ctrl_node->GetOutControlAnchor(), cast_node->GetInControlAnchor()),
                     "Remove ctl edge failed.");
       continue;
     }
 
-    auto find_res1 = switch_node_map_.find(in_ctl_node);
+    auto find_res1 = switch_node_map_.find(in_ctrl_node);
     GE_IF_BOOL_EXEC(find_res1 == switch_node_map_.end(), {
-      GELOGE(INTERNAL_ERROR, "StreamSwitch node %s not found in switch_node_map_.", in_ctl_node->GetName().c_str());
+      GELOGE(INTERNAL_ERROR, "StreamSwitch node %s not found in switch_node_map_.", in_ctrl_node->GetName().c_str());
       return INTERNAL_ERROR;
     });
     auto find_res2 = find_res1->second.find(orig_switch_name);
@@ -655,7 +649,7 @@ Status SwitchToStreamSwitchPass::ModifySwitchInCtlEdges(const NodePtr &switch_no
 ///
 Status SwitchToStreamSwitchPass::ModifySwitchOutCtlEdges(const NodePtr &switch_node, const NodePtr &stream_switch,
                                                          const NodePtr &active_node) {
-  GELOGI("ModifySwitchOutCtlEdges: switch_node=%s, stream_switch=%s, active_node=%s", switch_node->GetName().c_str(),
+  GELOGD("ModifySwitchOutCtlEdges: switch_node=%s, stream_switch=%s, active_node=%s", switch_node->GetName().c_str(),
          stream_switch->GetName().c_str(), active_node->GetName().c_str());
   auto find_res = switch_node_map_.find(switch_node);
   GE_IF_BOOL_EXEC(find_res == switch_node_map_.end(), {
diff --git a/ge/graph/passes/switch_to_stream_switch_pass.h b/ge/graph/passes/switch_to_stream_switch_pass.h
index 48725230..05628871 100644
--- a/ge/graph/passes/switch_to_stream_switch_pass.h
+++ b/ge/graph/passes/switch_to_stream_switch_pass.h
@@ -42,9 +42,9 @@ namespace ge {
   +-----------+                +-----------+
   |   Const   |                | VariableV2|
   +-----------+                +-----------+
-*/
 
-/* Switch branch op optimize, Switches in same case merge to one StreamSwitch, update following nodes' input
+
+  Switch branch op optimize, Switches in same case merge to one StreamSwitch, update following nodes' input
 
                                             +-----------+
                                           / |   task2   | \
@@ -131,11 +131,10 @@ class SwitchToStreamSwitchPass : public GraphPass {
 
   ///
   /// @brief Find Switch cond input
-  /// @param [in] pass_switch_flag
   /// @param [out] peer_cond_anchor
   /// @return Status
   ///
-  Status FindSwitchCondInput(bool pass_switch_flag, OutDataAnchorPtr &peer_cond_anchor);
+  Status FindSwitchCondInput(OutDataAnchorPtr &peer_cond_anchor);
 
   ///
   /// @brief Create StreamSwitch Node
diff --git a/ge/graph/passes/transop_breadth_fusion_pass.cc b/ge/graph/passes/transop_breadth_fusion_pass.cc
index 689510f0..654c3822 100644
--- a/ge/graph/passes/transop_breadth_fusion_pass.cc
+++ b/ge/graph/passes/transop_breadth_fusion_pass.cc
@@ -70,8 +70,10 @@ std::string TransOpBreadthFusionPass::GetNodeId(const int anchor_index, const No
     trans_data_type = true;
     trans_format = true;
     trans_shape = true;
-  } else if (node->GetType() == RESHAPE) {
+  } else if (node->GetType() == RESHAPE || node->GetType() == EXPANDDIMS || node->GetType() == SQUEEZE) {
     trans_shape = true;
+  } else if (node->GetType() == REFORMAT) {
+    trans_format = true;
   }
 
   id << node->GetType() << '-' << anchor_index;
diff --git a/ge/graph/passes/transop_without_reshape_fusion_pass.cc b/ge/graph/passes/transop_without_reshape_fusion_pass.cc
index d2b3f1b1..6bea9edc 100644
--- a/ge/graph/passes/transop_without_reshape_fusion_pass.cc
+++ b/ge/graph/passes/transop_without_reshape_fusion_pass.cc
@@ -63,7 +63,7 @@ void TransOpWithoutReshapeFusionPass::SetRemainNode(
       continue;
     }
     GELOGI("SetRemainNode node is %s", op_desc->GetName().c_str());
-    GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(kRemainNode, true), GELOGE(INTERNAL_ERROR, "set ext attr failed"); return );
+    GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(kRemainNode, true), GELOGE(INTERNAL_ERROR, "set ext attr failed"); return);
   }
 }
 
@@ -594,7 +594,7 @@ void TransOpWithoutReshapeFusionPass::GetBeginOutDescAndEndInDesc(const int inde
   auto out_owner_node = out_peer_anchor->GetOwnerNode();
   GE_CHECK_NOTNULL_JUST_RETURN(out_owner_node);
   auto out_peer_op_desc = out_owner_node->GetOpDesc();
-  GE_IF_BOOL_EXEC(out_peer_op_desc == nullptr, GELOGE(INTERNAL_ERROR, "out_peer_op_desc is nullptr"); return );
+  GE_IF_BOOL_EXEC(out_peer_op_desc == nullptr, GELOGE(INTERNAL_ERROR, "out_peer_op_desc is nullptr"); return);
   out_desc = out_peer_op_desc->GetInputDesc(out_peer_anchor->GetIdx());
 
   auto in_peer_anchor = nodes_anchor.back().first;
@@ -602,7 +602,7 @@ void TransOpWithoutReshapeFusionPass::GetBeginOutDescAndEndInDesc(const int inde
   auto in_owner_node = in_peer_anchor->GetOwnerNode();
   GE_CHECK_NOTNULL_JUST_RETURN(in_owner_node);
   auto in_peer_op_desc = in_owner_node->GetOpDesc();
-  GE_IF_BOOL_EXEC(in_peer_op_desc == nullptr, GELOGE(INTERNAL_ERROR, "in_peer_op_desc is nullptr"); return );
+  GE_IF_BOOL_EXEC(in_peer_op_desc == nullptr, GELOGE(INTERNAL_ERROR, "in_peer_op_desc is nullptr"); return);
   in_desc = in_peer_op_desc->GetOutputDesc(in_peer_anchor->GetIdx());
 }
 
@@ -734,10 +734,14 @@ void TransOpWithoutReshapeFusionPass::RemoveNousedNodes(const ComputeGraphPtr &g
         continue;
       }
 
-      GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(kRemainNode, true), GELOGE(INTERNAL_ERROR, "set ext attr failed"); return );
+      GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(kRemainNode, true), GELOGE(INTERNAL_ERROR, "set ext attr failed"); return);
       GELOGI("remove node:%s", node->GetName().c_str());
-      if (graph->RemoveNode(node) != GRAPH_SUCCESS) {
-        GELOGW("remove node failed!node:%s", node->GetName().c_str());
+      if (GraphUtils::IsolateNode(node, {0}) != GRAPH_SUCCESS) {
+        GELOGW("Isolate node: %s failed.", node->GetName().c_str());
+        continue;
+      }
+      if (GraphUtils::RemoveNodeWithoutRelink(graph, node) != GRAPH_SUCCESS) {
+        GELOGW("Remove node: %s failed.", node->GetName().c_str());
         continue;
       }
     }
diff --git a/ge/graph/passes/transpose_transdata_pass.cc b/ge/graph/passes/transpose_transdata_pass.cc
index 7348f143..2178eac7 100644
--- a/ge/graph/passes/transpose_transdata_pass.cc
+++ b/ge/graph/passes/transpose_transdata_pass.cc
@@ -217,11 +217,11 @@ void TransposeTransDataPass::CopyInputEdges(NodePtr &origin_node, NodePtr &new_n
   }
   OutDataAnchorPtr out_anchor = origin_node->GetInDataAnchor(0)->GetPeerOutAnchor();
   new_in_data_anchor->UnlinkAll();
-  GE_IF_BOOL_EXEC(new_in_data_anchor->LinkFrom(out_anchor) != GRAPH_SUCCESS, GELOGW("Link failed"); return );
+  GE_IF_BOOL_EXEC(new_in_data_anchor->LinkFrom(out_anchor) != GRAPH_SUCCESS, GELOGW("Link failed"); return);
 
   // control anchor only link to control anchor
   GE_IF_BOOL_EXEC(
-    GraphUtils::CopyInCtrlEdges(origin_node, new_node) != GRAPH_SUCCESS, GELOGW("Copy in ctrl edges failed"); return );
+    GraphUtils::CopyInCtrlEdges(origin_node, new_node) != GRAPH_SUCCESS, GELOGW("Copy in ctrl edges failed"); return);
 }
 
 bool TransposeTransDataPass::TransDataCheckAccuracySupported(const OpDescPtr &op_desc) {
diff --git a/ge/graph/passes/unused_args_clean_pass.cc b/ge/graph/passes/unused_args_clean_pass.cc
index 83fd0438..ec66b129 100755
--- a/ge/graph/passes/unused_args_clean_pass.cc
+++ b/ge/graph/passes/unused_args_clean_pass.cc
@@ -204,6 +204,10 @@ Status UnusedArgsCleanPass::RemoveInputTensor(const map<ComputeGraphPtr, map<uin
   GE_CHK_GRAPH_STATUS_RET(GraphUtils::RemoveEdge(out_anchor, old_anchor), "Remove edge failed");
   GELOGI("Remove edge: %s %s", out_node->GetName().c_str(), func_node->GetName().c_str());
 
+  if (out_node->GetInDataNodes().size() == 0 && out_node->GetOutAllNodes().size() == 0) {
+    GE_CHK_GRAPH_STATUS_RET(out_node->GetOwnerComputeGraph()->RemoveNode(out_node), "Remove node failed: %s",
+                            out_node->GetName().c_str());
+  }
   return SUCCESS;
 }
 }  // namespace ge
\ No newline at end of file
diff --git a/ge/graph/passes/useless_control_out_remove_pass.cc b/ge/graph/passes/useless_control_out_remove_pass.cc
new file mode 100644
index 00000000..4d74d582
--- /dev/null
+++ b/ge/graph/passes/useless_control_out_remove_pass.cc
@@ -0,0 +1,51 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "graph/passes/useless_control_out_remove_pass.h"
+
+#include "graph/debug/ge_attr_define.h"
+#include "graph/utils/graph_utils.h"
+#include "framework/common/debug/ge_log.h"
+#include "framework/common/debug/log.h"
+
+namespace ge {
+Status UselessControlOutRemovePass::Run(NodePtr &node) {
+  GE_CHECK_NOTNULL(node);
+
+  if ((node->GetType() != CONSTANT) && (node->GetType() != CONSTANTOP)) {
+    return SUCCESS;
+  }
+  GELOGD("UselessControlOutRemovePass running, node: %s.", node->GetName().c_str());
+
+  // const has no control input
+  if (node->GetInControlNodes().empty()) {
+    if (node->GetOutDataNodes().empty()) {
+      // It is an isolated const, just remove it.
+      GELOGI("Delete isolated const: %s.", node->GetName().c_str());
+      GE_CHK_STATUS_RET(IsolateAndDeleteNode(node, {}))
+      AddNodeDeleted(node);
+    } else {
+      auto out_ctrl_anchor = node->GetOutControlAnchor();
+      if (out_ctrl_anchor != nullptr && !out_ctrl_anchor->GetPeerAnchors().empty()) {
+        GELOGI("Node: %s unlink all out control edge.", node->GetName().c_str());
+        out_ctrl_anchor->UnlinkAll();
+      }
+    }
+  }
+
+  return SUCCESS;
+}
+}  // namespace ge
\ No newline at end of file
diff --git a/ge/graph/passes/useless_control_out_remove_pass.h b/ge/graph/passes/useless_control_out_remove_pass.h
new file mode 100644
index 00000000..d84b918f
--- /dev/null
+++ b/ge/graph/passes/useless_control_out_remove_pass.h
@@ -0,0 +1,29 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GE_GRAPH_PASSES_USELESS_CONTROL_OUT_REMOVE_PASS_H_
+#define GE_GRAPH_PASSES_USELESS_CONTROL_OUT_REMOVE_PASS_H_
+
+#include "graph/passes/base_pass.h"
+
+namespace ge {
+class UselessControlOutRemovePass : public BaseNodePass {
+ public:
+  Status Run(NodePtr &node) override;
+};
+}  // namespace ge
+
+#endif  // GE_GRAPH_PASSES_USELESS_CONTROL_OUT_REMOVE_PASS_H_
\ No newline at end of file
diff --git a/ge/graph/passes/variable_op_pass_bak.cc b/ge/graph/passes/variable_op_pass_bak.cc
index 3e40e686..c9218296 100644
--- a/ge/graph/passes/variable_op_pass_bak.cc
+++ b/ge/graph/passes/variable_op_pass_bak.cc
@@ -252,7 +252,6 @@ Status VariableOpPass::RenewTransRoadDesc(const NodePtr &var, VarTransRoad &fusi
   // case 2: suppose input format of transdata not equal with out format
   // and input format not equal with var
   // so we make input format equal with var
-
   for (auto &cur_trans : fusion_road) {
     if (cur_trans.input.GetFormat() == cur_trans.output.GetFormat()) {
       cur_trans.output.SetFormat(prev_node_info.output.GetFormat());
@@ -319,8 +318,8 @@ Status VariableOpPass::FusionIfNeed(const NodePtr &var, VarTransRoad &fusion_roa
 }
 
 Status VariableOpPass::UpdateTransRoad(VarTransRoad &fusion_road, vector<std::string> &first_path_trans_order,
-                                       map<std::string,std::pair<std::string, bool>> &trans_type_to_changed_desc,
-                                       map<std::string,vector<NodePtr>> &trans_type_to_trans_ops){
+                                       map<std::string, std::pair<std::string, bool>> &trans_type_to_changed_desc,
+                                       map<std::string, vector<NodePtr>> &trans_type_to_trans_ops){
   vector<std::string> delete_trans_type;
   for (auto &trans_type : first_path_trans_order) {
     if (trans_type_to_changed_desc.find(trans_type) == trans_type_to_changed_desc.end()) {
diff --git a/ge/graph/passes/variable_op_pass_bak.h b/ge/graph/passes/variable_op_pass_bak.h
index b9fbb90e..fccd063b 100644
--- a/ge/graph/passes/variable_op_pass_bak.h
+++ b/ge/graph/passes/variable_op_pass_bak.h
@@ -45,8 +45,8 @@ class VariableOpPass : public GraphPass {
 
  private:
   Status UpdateTransRoad(VarTransRoad &fusion_road, vector<string> &trans_road_order,
-                         map<string,pair<string, bool>> &trans_type_to_changed_desc,
-                         map<string,vector<NodePtr>> &trans_type_to_trans_ops);
+                         map<string, pair<string, bool>> &trans_type_to_changed_desc,
+                         map<string, vector<NodePtr>> &trans_type_to_trans_ops);
 
   Status DealFusion(const ge::NodePtr &var_node, VarTransRoad &fusion_road,
                     map<string, pair<string, bool>> trans_type_to_changed_desc,
diff --git a/ge/graph/preprocess/graph_preprocess.cc b/ge/graph/preprocess/graph_preprocess.cc
index b899ee83..91fab280 100644
--- a/ge/graph/preprocess/graph_preprocess.cc
+++ b/ge/graph/preprocess/graph_preprocess.cc
@@ -18,9 +18,7 @@
 #include <map>
 #include <set>
 #include <string>
-#include <utility>
 #include "common/formats/format_transfers/format_transfer_fractal_nz.h"
-#include "common/formats/format_transfers/format_transfer_fractal_z.h"
 #include "common/formats/format_transfers/format_transfer_nchw_nc1hwc0.h"
 #include "common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.h"
 #include "common/formats/format_transfers/format_transfer_transpose.h"
@@ -28,13 +26,9 @@
 #include "common/helper/model_helper.h"
 #include "common/math/math_util.h"
 #include "common/op/ge_op_utils.h"
-#include "common/util/error_manager/error_manager.h"
-#include "common/formats/utils/formats_trans_utils.h"
-#include "framework/common/debug/ge_log.h"
 #include "graph/common/ge_call_wrapper.h"
 #include "graph/common/local_context.h"
 #include "graph/common/transop_util.h"
-#include "graph/debug/ge_attr_define.h"
 #include "graph/ge_context.h"
 #include "graph/shape_refiner.h"
 #include "graph/manager/graph_var_manager.h"
@@ -43,30 +37,23 @@
 #include "graph/passes/addn_pass.h"
 #include "graph/passes/aicpu_constant_folding_pass.h"
 #include "graph/passes/assert_pass.h"
-#include "graph/passes/assign_pass.h"
-#include "graph/passes/base_pass.h"
+#include "ge/ge_api_types.h"
 #include "graph/passes/common_subexpression_elimination_pass.h"
 #include "graph/passes/cond_pass.h"
 #include "graph/passes/cond_remove_pass.h"
 #include "graph/passes/constant_folding_pass.h"
-#include "graph/passes/constant_fuse_same_pass.h"
-#include "graph/passes/control_trigger_pass.h"
 #include "graph/passes/dimension_adjust_pass.h"
 #include "graph/passes/dimension_compute_pass.h"
 #include "graph/passes/dropout_pass.h"
 #include "graph/passes/enter_pass.h"
-#include "graph/passes/flow_ctrl_pass.h"
 #include "graph/passes/for_pass.h"
-#include "graph/passes/get_original_format_pass.h"
 #include "graph/passes/guarantee_const_pass.h"
 #include "graph/passes/hccl_group_pass.h"
 #include "graph/passes/hccl_memcpy_pass.h"
 #include "graph/passes/identity_pass.h"
 #include "graph/passes/infershape_pass.h"
-#include "graph/passes/iterator_op_pass.h"
 #include "graph/passes/merge_pass.h"
 #include "graph/passes/net_output_pass.h"
-#include "graph/passes/next_iteration_pass.h"
 #include "graph/passes/no_use_reshape_remove_pass.h"
 #include "graph/passes/parallel_concat_start_op_pass.h"
 #include "graph/passes/placeholder_with_default_pass.h"
@@ -81,45 +68,19 @@
 #include "graph/passes/shape_operate_op_remove_pass.h"
 #include "graph/passes/snapshot_pass.h"
 #include "graph/passes/stop_gradient_pass.h"
-#include "graph/passes/subgraph_pass.h"
-#include "graph/passes/switch_data_edges_bypass.h"
 #include "graph/passes/switch_dead_branch_elimination.h"
-#include "graph/passes/switch_logic_remove_pass.h"
-#include "graph/passes/merge_to_stream_merge_pass.h"
-#include "graph/passes/switch_to_stream_switch_pass.h"
-#include "graph/passes/attach_stream_label_pass.h"
 #include "graph/passes/unused_const_pass.h"
-#include "graph/passes/unused_op_remove_pass.h"
 #include "graph/passes/var_is_initialized_op_pass.h"
 #include "graph/passes/variable_prepare_op_pass.h"
 #include "graph/preprocess/insert_op/util_insert_aipp_op.h"
-#include "graph/types.h"
-#include "graph/utils/tensor_utils.h"
 #include "graph/utils/type_utils.h"
 #include "inc/pass_manager.h"
 #include "init/gelib.h"
 #include "multi_batch_copy_graph.h"
-#include "runtime/dev.h"
 
-#include "graph/passes/dimension_adjust_pass.h"
-#include "graph/passes/link_gen_mask_nodes_pass.h"
-#include "graph/passes/permute_pass.h"
-#include "graph/passes/reshape_remove_pass.h"
-#include "graph/passes/same_transdata_breadth_fusion_pass.h"
-#include "graph/passes/transop_breadth_fusion_pass.h"
-#include "graph/passes/transop_depth_fusion_pass.h"
-#include "graph/passes/transop_nearby_allreduce_fusion_pass.h"
-
-#include "graph/passes/cast_remove_pass.h"
 #include "graph/passes/data_pass.h"
-#include "graph/passes/transop_without_reshape_fusion_pass.h"
-#include "graph/passes/transpose_transdata_pass.h"
-#include "graph/passes/variable_op_pass.h"
-#include "graph/passes/variable_prepare_op_pass.h"
-#include "graph/passes/variable_ref_delete_op_pass.h"
 #include "graph/passes/mark_agnostic_pass.h"
 
-
 namespace ge {
 namespace {
 static std::map<std::string, ge::DataType> output_type_str_to_datatype = {
@@ -938,6 +899,160 @@ Status ProcessNetoutputNodeDynShape(NodePtr &node) {
   }
   return SUCCESS;
 }
+long StringToLongNoThrow(const string &str) {
+  try {
+    return std::stol(str);
+  } catch (const std::invalid_argument) {
+    GELOGE(PARAM_INVALID,
+           "Parse shape range of input failed when transfer from string to int64. Given %s, while correct example: "
+           "\"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"",
+           str.c_str());
+    return PARAM_INVALID;
+  } catch (const std::out_of_range) {
+    GELOGE(PARAM_INVALID,
+           "Parse shape range of input failed when transfer from string to int64. Given %s, while correct example: "
+           "\"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"",
+           str.c_str());
+    return PARAM_INVALID;
+  }
+}
+/**
+ * Parser shape_range from string to vector
+ * shape_range from option normally is "[1~20,3,3~6,-1],[1~20,3,3~6,-1]"
+ * @param shape_range
+ */
+Status ParseDynamicInputShapeRange(const std::string &shape_range,
+                                   std::vector<std::vector<std::pair<int64_t, int64_t>>> &range) {
+  if (shape_range.size() < 2) {
+    GELOGE(PARAM_INVALID, "Shape range %s is invalid.", shape_range.c_str());
+    return PARAM_INVALID;
+  }
+  // different shape_range of single input are split by ']'
+  vector<string> shape_range_set = ge::StringUtils::Split(shape_range, ']');
+  if (shape_range_set.empty()) {
+    GELOGE(PARAM_INVALID, "Shape range %s is not valid. Correct example: \"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"",
+           shape_range.c_str());
+    return PARAM_INVALID;
+  }
+  for (auto &shape_range_str : shape_range_set) {
+    if (shape_range_str.empty()) {
+      continue;
+    }
+    // trim start bytes, after that, single input should be "1~20,3,3~6,-1"
+    if (ge::StringUtils::StartWith(shape_range_str, "[")) {
+      shape_range_str = shape_range_str.substr(1, shape_range_str.size());
+    }
+    if (ge::StringUtils::StartWith(shape_range_str, ",")) {
+      shape_range_str = shape_range_str.substr(2, shape_range_str.size());
+    }
+
+    // parse shape_range of single input. eg. "1~20,3,3~6,-1"
+    std::vector<std::pair<int64_t, int64_t>> range_of_single_input;
+    vector<string> dim_range_set = ge::StringUtils::Split(shape_range_str, ',');
+    for (const auto &range_pair_str : dim_range_set) {
+      vector<string> range_pair_set = ge::StringUtils::Split(range_pair_str, '~');
+      pair<int64_t, int64_t> range_pair;
+      if (range_pair_set.size() == 1) {
+        // fix dim
+        auto range_value = StringToLongNoThrow(range_pair_set.at(0).c_str());
+        if (range_value < 0) {
+          range_pair = std::make_pair(0, range_value);
+        } else {
+          range_pair = std::make_pair(range_value, range_value);
+        }
+      } else if (range_pair_set.size() == 2) {
+        // unknown dim, should get range.
+        auto range_left = StringToLongNoThrow(range_pair_set.at(0).c_str());
+        auto range_right = StringToLongNoThrow(range_pair_set.at(1).c_str());
+        range_pair = std::make_pair(range_left, range_right);
+      } else {
+        GELOGE(PARAM_INVALID,
+               "Shape range of input is invalid. Given %s, while correct example: \"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"",
+               shape_range.c_str());
+        return PARAM_INVALID;
+      }
+      range_of_single_input.emplace_back(range_pair);
+    }
+    range.emplace_back(range_of_single_input);
+  }
+  return SUCCESS;
+}
+
+Status GetDynamicInputShapeRange(const std::vector<GeTensor> &user_input, const std::map<string, string> &graph_option,
+                                 vector<vector<std::pair<int64_t, int64_t>>> &range_vec) {
+  auto mode_iter = graph_option.find(OPTION_EXEC_DYNAMIC_EXECUTE_MODE);
+  if (mode_iter == graph_option.end()) {
+    GELOGD("Graph Option: Can not find %s option in graph options.", OPTION_EXEC_DYNAMIC_EXECUTE_MODE);
+    return SUCCESS;
+  }
+  GELOGD("Graph Option: dynamic_input_mode value is %s.", mode_iter->second.c_str());
+  if (mode_iter->second != "dynamic_execute") {
+    return SUCCESS;
+  }
+  auto iter = graph_option.find(OPTION_EXEC_DATA_INPUTS_SHAPE_RANGE);
+  if (iter == graph_option.end()) {
+    GELOGE(PARAM_INVALID, "Graph option %s is required when %s is dynamic_execute", OPTION_EXEC_DATA_INPUTS_SHAPE_RANGE,
+           OPTION_EXEC_DYNAMIC_EXECUTE_MODE);
+    return PARAM_INVALID;
+  }
+  GELOGD("GraphOption: dynamic_inputs_shape_range value is %s.", iter->second.c_str());
+  auto ret = ParseDynamicInputShapeRange(iter->second, range_vec);
+  GE_CHK_STATUS_RET(ret, "Parse dynamic input shape range failed.");
+  if (range_vec.size() != user_input.size()) {
+    GELOGE(PARAM_INVALID, "Dynamic input shape range size is %zu, inputs size is %zu. Not match.", range_vec.size(),
+           user_input.size());
+    return PARAM_INVALID;
+  }
+  return SUCCESS;
+}
+
+Status UpdateDynamicInputShapeRange(const ge::GeAttrValue::INT index,
+                                    const vector<vector<std::pair<int64_t, int64_t>>> &range_vec, OpDescPtr &op,
+                                    GeTensorDesc &desc) {
+  auto origin_shape = desc.GetShape();
+  auto current_shape_range_vec = range_vec.at(index);
+  if (current_shape_range_vec.size() != origin_shape.GetDimNum()) {
+    GELOGE(PARAM_INVALID, "Given shape_range dim num is %zu, current dim num is %zu, not match.Pleace Check.",
+           current_shape_range_vec.size(), origin_shape.GetDimNum());
+    return PARAM_INVALID;
+  }
+  for (size_t i = 0; i < origin_shape.GetDimNum(); ++i) {
+    if (current_shape_range_vec.at(i).first == current_shape_range_vec.at(i).second) {
+      // given shape_range is known dim, check is same as origin or not
+      if (origin_shape.GetDim(i) != current_shape_range_vec.at(i).first) {
+        GELOGE(PARAM_INVALID, "Given shape range is %ld, current dim shape is %ld, not match.Pleace Check.",
+              current_shape_range_vec.at(i).first, origin_shape.GetDim(i));
+        return PARAM_INVALID;
+      }
+      origin_shape.SetDim(i, current_shape_range_vec.at(i).first);
+    } else {
+      origin_shape.SetDim(i, -1);
+    }
+  }
+  desc.SetShape(origin_shape);
+  desc.SetShapeRange(current_shape_range_vec);
+
+  int64_t dynamic_shape_size = 1;
+  for (const auto range_pair : range_vec.at(index)) {
+    FMK_INT64_MULCHECK(dynamic_shape_size, range_pair.second);
+    dynamic_shape_size *= range_pair.second;
+  }
+  auto data_type_size = GetSizeByDataType(desc.GetDataType());
+  if (data_type_size < 0) {
+    GELOGE(PARAM_INVALID, "Input data type is %s, is not supported.",
+           TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str());
+    return PARAM_INVALID;
+  }
+  FMK_INT64_MULCHECK(dynamic_shape_size, data_type_size);
+  dynamic_shape_size *= data_type_size;
+  GELOGI("In dynamic_execute mode ,set input %s shape range size %ld", op->GetName().c_str(), dynamic_shape_size);
+  ge::TensorUtils::SetSize(desc, dynamic_shape_size);
+  graphStatus graph_ret = op->UpdateInputDesc(0, desc);
+  GE_CHK_STATUS_RET(graph_ret, "UpdateInputDesc fail, graph ret: %u", graph_ret);
+  graph_ret = op->UpdateOutputDesc(0, desc);
+  GE_CHK_STATUS_RET(graph_ret, "UpdateInputDesc fail, graph ret: %u", graph_ret);
+  return SUCCESS;
+}
 }  // namespace
 
 GraphPrepare::GraphPrepare() : compute_graph_(nullptr) {}
@@ -1142,7 +1257,11 @@ Status GraphPrepare::AdjustDataOpOutput(const NodePtr &node) {
   return SUCCESS;
 }
 
-Status GraphPrepare::UpdateInput(const std::vector<GeTensor> &user_input) {
+Status GraphPrepare::UpdateInput(const std::vector<GeTensor> &user_input, const std::map<string,string> &graph_option) {
+  // Get shape range of input in dynamic_execute mode
+  vector<vector<std::pair<int64_t,int64_t>>> dynamic_shape_range_vec;
+  auto ret = GetDynamicInputShapeRange(user_input, graph_option, dynamic_shape_range_vec);
+  GE_CHK_STATUS_RET(ret, "Graph option is not right on Dynamic execute mode.");
   compute_graph_->SaveDataFormat(ge::TypeUtils::DomiFormatToFormat(GetLocalOmgContext().format));
   for (NodePtr &input_node : compute_graph_->GetDirectNode()) {
     GE_CHECK_NOTNULL(input_node);
@@ -1225,6 +1344,12 @@ Status GraphPrepare::UpdateInput(const std::vector<GeTensor> &user_input) {
         return graph_ret;
       }
 
+      if (!dynamic_shape_range_vec.empty()) {
+        ret = UpdateDynamicInputShapeRange(index, dynamic_shape_range_vec, op, desc);
+        GE_CHK_STATUS_RET(ret, "Fail to update dynamic input shape range on %s.", op->GetName().c_str());
+        continue;
+      }
+
       if (!options_.train_graph_flag) {
         Status ret = AdjustDataOpOutput(input_node);
         GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "AdjustDataOpOutput fail, ret:%u", ret); return ret);
@@ -1398,17 +1523,17 @@ Status GraphPrepare::SaveOriginalGraphToOmModel() {
     GELOGI("Prepare %s on graph %s success.", name, compute_graph->GetName().c_str()); \
   } while (0)
 
-Status GraphPrepare::PrepareDynShape(ConstGraphPtr graph, const std::vector<GeTensor> &user_input,
+Status GraphPrepare::PrepareDynShape(const GraphNodePtr &graph_node, const std::vector<GeTensor> &user_input,
                                      ge::ComputeGraphPtr &compute_graph, uint64_t session_id) {
-  GE_CHECK_NOTNULL(graph);
+  GE_CHECK_NOTNULL(graph_node->GetGraph());
   GE_CHECK_NOTNULL(compute_graph);
 
   GetLocalOmgContext().type = static_cast<domi::FrameworkType>(options_.framework_type);
-  const Graph &const_graph = *graph;
+  const Graph &const_graph = *graph_node->GetGraph();
 
   PP_RUN("Init", Init, const_graph, session_id);
   PP_RUN("SetRtContext", SetRtContext, rtContext_t(), RT_CTX_GEN_MODE);
-  PP_RUN_AND_DUMP("CheckAndUpdateInput", CheckAndUpdateInput, user_input);
+  PP_RUN_AND_DUMP("CheckAndUpdateInput", CheckAndUpdateInput, user_input, graph_node->GetOptions());
   PP_RUN_AND_DUMP("GraphEquivalentTransformation", GraphEquivalentTransformation);
   PP_RUN_AND_DUMP("ProcessOutput", ProcessNetOutput);
   PP_RUN_AND_DUMP("ProcessMultiBatch", multibatch::ProcessMultiBatch, compute_graph_);
@@ -1621,7 +1746,8 @@ Status GraphPrepare::CheckUserInput(const std::vector<GeTensor> &user_input) {
 
       for (size_t i = 0; i < desc.GetShape().GetDimNum(); ++i) {
         if (desc.GetShape().GetDim(i) < 0) {
-          std::string situation = "data dim[" + std::to_string(i) + "][" + std::to_string(desc.GetShape().GetDim(i)) + "]" ;
+          std::string situation = "data dim[" + std::to_string(i) + "][" +
+                  std::to_string(desc.GetShape().GetDim(i)) + "]" ;
           std::string reason = "it need >= 0";
           ErrorManager::GetInstance().ATCReportErrMessage("E19025", {"situation", "reason"}, {situation, reason});
           GELOGE(GE_GRAPH_INIT_FAILED, "data dim %zu is not supported, need >= 0, real:%ld.", i,
@@ -1645,6 +1771,10 @@ Status GraphPrepare::InferShapeForPreprocess() {
   if (!options_.train_graph_flag) {
     names_to_passes.emplace_back("AssertPass", &assert_pass);
   }
+  SwitchDeadBranchElimination switch_dead_branch_elimination;
+  names_to_passes.emplace_back("SwitchDeadBranchElimination", &switch_dead_branch_elimination);
+  MergePass merge_pass;
+  names_to_passes.emplace_back("MergePass", &merge_pass);
   InferShapePass infer_shape_pass;
   names_to_passes.emplace_back("InferShapePass", &infer_shape_pass);
   ReplaceWithEmptyConstPass replace_with_empty_const_pass;
@@ -1701,7 +1831,7 @@ Status GraphPrepare::PrepareOptimize() {
   try {
     (void)original_graph_passes.AddPass("PrepareOptimize::ShapeOperateOpRemovePass", new ShapeOperateOpRemovePass);
     (void)original_graph_passes.AddPass("PrepareOptimize::ReplaceTransShapePass", new ReplaceTransShapePass);
-    (void)original_graph_passes.AddPass("PrepareOptimize::MarkAgnosticPass" , new MarkAgnosticPass);
+    (void)original_graph_passes.AddPass("PrepareOptimize::MarkAgnosticPass", new MarkAgnosticPass);
   } catch (std::bad_alloc &e) {
     GELOGE(INTERNAL_ERROR, "Add pass failed, bad memory allocation occurs.");
     return INTERNAL_ERROR;
@@ -1738,7 +1868,6 @@ Status GraphPrepare::PrepareOptimize() {
   VarIsInitializedOpPass var_is_initialized_pass;
   ParallelConcatStartOpPass parallel_concat_start_op_pass;
   IdentityPass identity_pass(false);
-  AssignPass assign_pass;
   SnapshotPass snapshot_pass;
   if (!options_.train_graph_flag) {
     names_to_passes.emplace_back("DropOutPass", &dropout_pass);
@@ -1753,9 +1882,6 @@ Status GraphPrepare::PrepareOptimize() {
   names_to_passes.emplace_back("VarIsInitializedOpPass", &var_is_initialized_pass);
   names_to_passes.emplace_back("ParallelConcatStartOpPass", &parallel_concat_start_op_pass);
   names_to_passes.emplace_back("IdentityPass", &identity_pass);
-  if (GetContext().GetHostExecFlag()) {
-    names_to_passes.emplace_back("AssignPass", &assign_pass);
-  }
   GE_TIMESTAMP_START(names_to_passes);
   ret = ge_passes.Run(names_to_passes);
   GE_TIMESTAMP_END(names_to_passes, "GraphPrepare::NamesToPasses");
@@ -1796,6 +1922,16 @@ Status GraphPrepare::PrepareOptimize() {
 }
 
 void GraphPrepare::TypeConversionOfConstant() {
+  bool is_acl_compile = false;
+  for (ge::NodePtr &n : compute_graph_->GetAllNodes()) {
+    // This can ensure that n is not a null pointer
+    // No Conversion when called by aclOpCompile
+    (void)AttrUtils::GetBool(n->GetOpDesc(), ATTR_DYNAMIC_SHAPE_SINGLE_AICPU, is_acl_compile);
+    if (is_acl_compile) {
+      return;
+    }
+  }
+
   if (options_.train_graph_flag) {
     GELOGD("trans CONSTANT to CONSTANTOP in train.");
     for (ge::NodePtr &n : compute_graph_->GetAllNodes()) {
@@ -1858,7 +1994,7 @@ Status GraphPrepare::ProcessNetOutput() {
   return SUCCESS;
 }
 
-Status GraphPrepare::CheckAndUpdateInput(const std::vector<GeTensor> &user_input) {
+Status GraphPrepare::CheckAndUpdateInput(const std::vector<GeTensor> &user_input,const std::map<string,string> &graph_option) {
   compute_graph_->SetInputSize(user_input.size());
   if (user_input.empty()) {
     return SUCCESS;
@@ -1870,7 +2006,7 @@ Status GraphPrepare::CheckAndUpdateInput(const std::vector<GeTensor> &user_input
     return ret;
   }
 
-  ret = UpdateInput(user_input);
+  ret = UpdateInput(user_input, graph_option);
   if (ret != SUCCESS) {
     GELOGE(ret, "UpdateInput fail, ret:%u", ret);
     return ret;
diff --git a/ge/graph/preprocess/graph_preprocess.h b/ge/graph/preprocess/graph_preprocess.h
index a3bbf433..de755418 100755
--- a/ge/graph/preprocess/graph_preprocess.h
+++ b/ge/graph/preprocess/graph_preprocess.h
@@ -45,7 +45,7 @@ class GraphPrepare {
   virtual ~GraphPrepare();
   GraphPrepare(const GraphPrepare &in) = delete;
   GraphPrepare &operator=(const GraphPrepare &in) = delete;
-  Status PrepareDynShape(ConstGraphPtr graph,
+  Status PrepareDynShape(const GraphNodePtr &graph_node,
                          const std::vector<GeTensor> &user_input,
                          ge::ComputeGraphPtr &compute_graph,
                          uint64_t session_id = 0);
@@ -63,8 +63,8 @@ class GraphPrepare {
   Status CheckRefOp();
   Status SetRtContext(rtContext_t rt_context, rtCtxMode_t mode);
   Status AdjustDataOpOutput(const NodePtr &node);
-  Status UpdateInput(const std::vector<GeTensor> &user_input);
-  Status CheckAndUpdateInput(const std::vector<GeTensor> &user_input);
+  Status UpdateInput(const std::vector<GeTensor> &user_input, const std::map<string,string> &graph_option);
+  Status CheckAndUpdateInput(const std::vector<GeTensor> &user_input, const std::map<string,string> &graph_option);
   Status CheckConstOp();
   Status VerifyConstOp(const NodePtr &node);
   Status CheckUserInput(const std::vector<GeTensor> &user_input);
diff --git a/ge/graph/preprocess/insert_op/ge_aipp_op.cc b/ge/graph/preprocess/insert_op/ge_aipp_op.cc
index 98712a82..7c8d9073 100755
--- a/ge/graph/preprocess/insert_op/ge_aipp_op.cc
+++ b/ge/graph/preprocess/insert_op/ge_aipp_op.cc
@@ -408,7 +408,7 @@ Status AippOp::ConvertRelatedInputNameToRank() {
   GE_CHECK_NOTNULL(aipp_params_);
 
   string related_input_name = aipp_params_->related_input_name();
-  if(related_input_name.empty()) {
+  if (related_input_name.empty()) {
     return SUCCESS;
   }
 
diff --git a/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc b/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc
index 1b926e4b..3b37003f 100755
--- a/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc
+++ b/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc
@@ -470,7 +470,7 @@ Status InsertNewOpUtil::UpdateDataBySwitchN(const NodePtr &switchn, const NodePt
     }
   }
   if (max_index >= switchn->GetOpDesc()->GetOutputsSize()) {
-    string error_msg = "No max size found from switchn node[" + switchn->GetName()+ "]";
+    string error_msg = "No max size found from switchn node[" + switchn->GetName() + "]";
     GE_ERRORLOG_AND_ERRORMSG(INTERNAL_ERROR, error_msg.c_str());
     return INTERNAL_ERROR;
   }
diff --git a/ge/graph/preprocess/multi_batch_copy_graph.cc b/ge/graph/preprocess/multi_batch_copy_graph.cc
index 9ab74d70..5506435e 100644
--- a/ge/graph/preprocess/multi_batch_copy_graph.cc
+++ b/ge/graph/preprocess/multi_batch_copy_graph.cc
@@ -44,6 +44,8 @@
 using std::set;
 using std::string;
 using std::vector;
+using std::map;
+using std::queue;
 
 namespace ge {
 namespace multibatch {
@@ -57,10 +59,15 @@ const int kDataInIndex = 0;
 const int kMergeDataOutIndex = 0;
 const int kStaticOutput = -1;
 const int kDivisionConst = 2;
+const int32_t kOneInDataNode = 1;
+const int32_t kFindNoMatch = 0;
 
 
 inline bool IsDataLikeType(const std::string &node_type) { return (node_type == DATA) || (node_type == AIPP); }
 
+inline bool IsEnterType(const string &node_type) { return (node_type == ENTER) || (node_type == REFENTER); }
+const set<string> unchange_types({CONSTANT, CONSTANTOP, ENTER, REFENTER});
+
 inline bool IsGetNextType(const NodePtr &node) {
   std::string original_type;
   GE_IF_BOOL_EXEC(GetOriginalType(node, original_type) != SUCCESS,
@@ -218,12 +225,6 @@ Status MultiBatchGraphCopyer::CopyGraph() {
     return ret;
   }
 
-  ret = InsertIdentityAfterSwitchN();
-  if (ret != SUCCESS) {
-    GELOGE(INTERNAL_ERROR, "Failed to insert identity nodes after switchn node.");
-    return INTERNAL_ERROR;
-  }
-
   GELOGI("Begin to remove useless nodes by prune pass after copy process");
   PrunePass prune_pass;
   ret = prune_pass.Run(graph_);
@@ -240,6 +241,18 @@ Status MultiBatchGraphCopyer::Init() {
     return ret;
   }
 
+  ret = RelinkConstCtrlEdge();
+  if (ret != SUCCESS) {
+    GELOGE(FAILED, "Relink const's control edge failed.");
+    return FAILED;
+  }
+
+  ret = ExtractUnchangedStructureOutofCycle();
+  if (ret != SUCCESS) {
+    GELOGE(FAILED, "Extract unchanged structure out of cycle failed.");
+    return FAILED;
+  }
+
   for (auto &node : graph_->GetAllNodes()) {
     origin_all_nodes_.emplace_back(node);
     if (IsDataLikeType(node->GetType())) {
@@ -252,6 +265,281 @@ Status MultiBatchGraphCopyer::Init() {
   return SUCCESS;
 }
 
+Status MultiBatchGraphCopyer::RelinkConstCtrlEdge() {
+  for (auto &node : graph_->GetAllNodes()) {
+    GE_CHECK_NOTNULL(node);
+    if ((node->GetType() == CONSTANT) || (node->GetType() == CONSTANTOP)) {
+      if (node->GetOutDataNodes().empty()) {
+        continue;
+      }
+      if (!node->GetInControlNodes().empty()) {
+        auto in_ctrl_nodes = node->GetInControlNodes();
+        auto out_nodes = node->GetOutAllNodes();
+        bool has_merge_out = false;
+        for (const auto &out_node : out_nodes) {
+          GE_CHECK_NOTNULL(out_node);
+          if (out_node->GetType() == MERGE || out_node->GetType() == REFMERGE) {
+            has_merge_out = true;
+            break;
+          }
+        }
+        if (has_merge_out) {
+          continue;
+        }
+        auto in_ctrl_anchor = node->GetInControlAnchor();
+        GE_CHECK_NOTNULL(in_ctrl_anchor);
+        in_ctrl_anchor->UnlinkAll();
+        for (auto &in_ctrl_node : in_ctrl_nodes) {
+          auto out_ctrl_anchor_of_in_ctrl_node = in_ctrl_node->GetOutControlAnchor();
+          GE_CHECK_NOTNULL(out_ctrl_anchor_of_in_ctrl_node);
+          for (auto &out_node : out_nodes) {
+            if (IsEnterType(out_node->GetType())) {
+              continue;
+            }
+            if (!out_ctrl_anchor_of_in_ctrl_node->IsLinkedWith(out_node->GetInControlAnchor())) {
+              GE_CHK_STATUS_RET(out_ctrl_anchor_of_in_ctrl_node->LinkTo(out_node->GetInControlAnchor()))
+            }
+          }
+        }
+      }
+      auto out_ctrl_anchor = node->GetOutControlAnchor();
+      if (out_ctrl_anchor != nullptr) {
+        out_ctrl_anchor->UnlinkAll();
+      }
+    }
+  }
+
+  return SUCCESS;
+}
+
+Status MultiBatchGraphCopyer::ExtractUnchangedStructureOutofCycle() {
+  map<string, vector<NodePtr>> frame_enter;
+  if (GetEnterNodesGroupByFrame(frame_enter) != SUCCESS) {
+    GELOGE(FAILED, "Get enter nodes grouped by frame_name failed.");
+    return FAILED;
+  }
+
+  queue<NodePtr> nodes_to_extract;
+  if (GetNodeNeedExtract(frame_enter, nodes_to_extract) != SUCCESS) {
+    GELOGE(FAILED, "Get nodes needed to extract failed.");
+    return FAILED;
+  }
+
+  while (!nodes_to_extract.empty()) {
+    auto node = nodes_to_extract.front();
+    nodes_to_extract.pop();
+    OpDescPtr enter_desc = nullptr;
+    if (MoveInEntersInDataAnchorDown(node, enter_desc) != SUCCESS) {
+      GELOGE(FAILED, "Move in enter nodes' in data anchors down of %s failed.", node->GetName().c_str());
+      return FAILED;
+    }
+    set<NodePtr> out_nodes;
+    if (InsertEnterAfterNode(node, enter_desc, out_nodes) != SUCCESS) {
+      GELOGE(FAILED, "Insert enter node after %s failed.", node->GetName().c_str());
+      return FAILED;
+    }
+
+    if (MoveCtrlEdgeToOutNodes(node, out_nodes) != SUCCESS) {
+      GELOGE(FAILED, "Move %s's control edge to out nodes failed.", node->GetName().c_str());
+      return FAILED;
+    }
+
+    for (auto &out_node : out_nodes) {
+      GE_CHECK_NOTNULL(out_node);
+      if (AllInDataNodesUnchangeAndNoMergeOut(out_node)) {
+        nodes_to_extract.push(out_node);
+      }
+    }
+  }
+
+  if (DeleteEnterWithoutDataOut() != SUCCESS) {
+    GELOGE(FAILED, "Delete enter node without out data nodes failed.");
+    return FAILED;
+  }
+
+  return SUCCESS;
+}
+
+Status MultiBatchGraphCopyer::GetEnterNodesGroupByFrame(map<string, vector<NodePtr>> &frame_enter) {
+  for (auto &node : graph_->GetAllNodes()) {
+    GE_CHECK_NOTNULL(node);
+    if (IsEnterType(node->GetType())) {
+      if (!node->GetInControlNodes().empty() || !node->GetOutControlNodes().empty()) {
+        continue;
+      }
+      auto op_desc = node->GetOpDesc();
+      GE_CHECK_NOTNULL(op_desc);
+      string frame_name;
+      if (!AttrUtils::GetStr(op_desc, ENTER_ATTR_FRAME_NAME, frame_name)) {
+        GELOGE(FAILED, "Get attr frame_name of enter[%] failed.", node->GetName().c_str());
+        return FAILED;
+      }
+      frame_enter[frame_name].emplace_back(node);
+    }
+  }
+
+  return SUCCESS;
+}
+
+Status MultiBatchGraphCopyer::GetNodeNeedExtract(const map<string, vector<NodePtr>> &frame_enter,
+                                                 queue<NodePtr> &nodes_to_extract) {
+  for (const auto &one_group : frame_enter) {
+    auto enters = one_group.second;
+    for (const auto &enter : enters) {
+      auto out_data_nodes = enter->GetOutDataNodes();
+      for (const auto &out_data_node : out_data_nodes) {
+        GE_CHECK_NOTNULL(out_data_node);
+        if (AllInDataNodesUnchangeAndNoMergeOut(out_data_node)) {
+          nodes_to_extract.push(out_data_node);
+        }
+      }
+    }
+  }
+
+  return SUCCESS;
+}
+
+bool MultiBatchGraphCopyer::AllInDataNodesUnchangeAndNoMergeOut(const NodePtr &node) {
+  auto out_data_nodes = node->GetOutDataNodes();
+  for (const auto &out_data_node : out_data_nodes) {
+    if (out_data_node == nullptr) {
+      return false;
+    }
+
+    if (out_data_node->GetType() == MERGE || out_data_node->GetType() == REFMERGE) {
+      return false;
+    }
+  }
+
+  auto in_data_nodes = node->GetInDataNodes();
+  if (in_data_nodes.size() == kOneInDataNode) {
+    return true;
+  }
+
+  for (const auto &in_data_node : in_data_nodes) {
+    if (in_data_node == nullptr) {
+      return false;
+    }
+    if (unchange_types.count(in_data_node->GetType()) == kFindNoMatch) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+Status MultiBatchGraphCopyer::MoveInEntersInDataAnchorDown(NodePtr &node, OpDescPtr &enter_desc) {
+  auto in_data_anchors = node->GetAllInDataAnchors();
+  for (auto &in_data_anchor : in_data_anchors) {
+    auto peer_out_data_anchor = in_data_anchor->GetPeerOutAnchor();
+    GE_CHECK_NOTNULL(peer_out_data_anchor);
+    auto peer_in_data_node = peer_out_data_anchor->GetOwnerNode();
+    if (IsEnterType(peer_in_data_node->GetType())) {
+      GE_CHK_STATUS_RET(peer_out_data_anchor->Unlink(in_data_anchor))
+      GELOGD("Unlink data edge from %s to %s.", peer_in_data_node->GetName().c_str(), node->GetName().c_str());
+      auto enter_in_data_anchors = peer_in_data_node->GetAllInDataAnchors();
+      for (auto &enter_in_data_anchor : enter_in_data_anchors) {
+        auto peer_out_data_anchor_of_enter = enter_in_data_anchor->GetPeerOutAnchor();
+        GE_CHECK_NOTNULL(peer_out_data_anchor_of_enter);
+        if (peer_out_data_anchor_of_enter->IsLinkedWith(in_data_anchor)) {
+          continue;
+        }
+        GE_CHK_STATUS_RET(peer_out_data_anchor_of_enter->LinkTo(in_data_anchor))
+        GELOGD("Relink data edge from %s to %s.", peer_out_data_anchor_of_enter->GetOwnerNode()->GetName().c_str(),
+               node->GetName().c_str());
+      }
+      enter_desc = peer_in_data_node->GetOpDesc();
+      GE_CHECK_NOTNULL(enter_desc);
+    }
+  }
+
+  return SUCCESS;
+}
+
+Status MultiBatchGraphCopyer::InsertEnterAfterNode(NodePtr &node, const OpDescPtr &copy_desc, set<NodePtr> &out_nodes) {
+  if (copy_desc == nullptr) {
+    return SUCCESS;
+  }
+  map<OutDataAnchorPtr, vector<std::pair<InDataAnchorPtr, NodePtr>>> outanchors_inanchors_nodes;
+  auto out_data_anchors = node->GetAllOutDataAnchors();
+  for (auto &out_data_anchor : out_data_anchors) {
+    auto peer_in_data_anchors = out_data_anchor->GetPeerInDataAnchors();
+    for (auto peer_in_data_anchor : peer_in_data_anchors) {
+      GE_CHECK_NOTNULL(peer_in_data_anchor);
+      auto peer_in_data_node = peer_in_data_anchor->GetOwnerNode();
+      out_nodes.emplace(peer_in_data_node);
+      outanchors_inanchors_nodes[out_data_anchor].emplace_back(std::make_pair(peer_in_data_anchor, peer_in_data_node));
+    }
+  }
+
+  int32_t i = 0;
+  auto node_desc = node->GetOpDesc();
+  GE_CHECK_NOTNULL(node_desc);
+  // Insert one enter node after node's per out data anchor
+  for (auto &outanchor_inanchors_nodes : outanchors_inanchors_nodes) {
+    string name = node->GetName() + "_" + ENTER + "_" + std::to_string(i++);
+    GELOGD("Create Enter op %s after %s.", name.c_str(), node->GetName().c_str());
+    auto enter_desc = AttrUtils::CopyOpDesc(copy_desc);
+    enter_desc->SetName(name);
+    GE_CHK_STATUS_RET(
+        enter_desc->UpdateInputDesc("x", node_desc->GetOutputDesc(outanchor_inanchors_nodes.first->GetIdx())))
+    GE_CHK_STATUS_RET(
+        enter_desc->UpdateOutputDesc("y", node_desc->GetOutputDesc(outanchor_inanchors_nodes.first->GetIdx())))
+    auto enter_node = graph_->AddNode(enter_desc);
+    GE_CHECK_NOTNULL(enter_node);
+    GE_CHK_STATUS_RET(outanchor_inanchors_nodes.first->LinkTo(enter_node->GetInDataAnchor(kDataInIndex)))
+    GE_CHECK_NOTNULL(enter_node->GetOutDataAnchor(kDataInIndex));
+    for (auto &inanchor_node : outanchor_inanchors_nodes.second) {
+      GE_CHK_STATUS_RET(outanchor_inanchors_nodes.first->Unlink(inanchor_node.first))
+      GE_CHK_STATUS_RET(enter_node->GetOutDataAnchor(kDataInIndex)->LinkTo(inanchor_node.first))
+      GELOGD("Unlink from %s to %s, link from %s to %s then to %s.", node->GetName().c_str(),
+             inanchor_node.second->GetName().c_str(), node->GetName().c_str(), enter_node->GetName().c_str(),
+             inanchor_node.second->GetName().c_str());
+    }
+  }
+
+  return SUCCESS;
+}
+
+// Move node's in control edges to out data nodes
+Status MultiBatchGraphCopyer::MoveCtrlEdgeToOutNodes(NodePtr &node, set<NodePtr> &out_nodes) {
+  auto in_ctrl_anchor = node->GetInControlAnchor();
+  GE_CHECK_NOTNULL(in_ctrl_anchor);
+  auto peer_out_ctrl_anchors = in_ctrl_anchor->GetPeerOutControlAnchors();
+  for (auto &peer_out_ctrl_anchor : peer_out_ctrl_anchors) {
+    GE_CHK_STATUS_RET(peer_out_ctrl_anchor->Unlink(in_ctrl_anchor))
+    GELOGD("Unlink control edge from %s to %s.", peer_out_ctrl_anchor->GetOwnerNode()->GetName().c_str(),
+           node->GetName().c_str());
+    for (auto &out_node : out_nodes) {
+      auto in_ctrl_anchor_of_out_node = out_node->GetInControlAnchor();
+      GE_CHECK_NOTNULL(in_ctrl_anchor_of_out_node);
+      if (!peer_out_ctrl_anchor->IsLinkedWith(in_ctrl_anchor_of_out_node)) {
+        GE_CHK_STATUS_RET(peer_out_ctrl_anchor->LinkTo(in_ctrl_anchor_of_out_node))
+        GELOGD("Link control edge from %s to %s.", peer_out_ctrl_anchor->GetOwnerNode()->GetName().c_str(),
+               out_node->GetName().c_str());
+      }
+    }
+  }
+
+  return SUCCESS;
+}
+
+Status MultiBatchGraphCopyer::DeleteEnterWithoutDataOut() {
+  for (auto &node : graph_->GetAllNodes()) {
+    GE_CHECK_NOTNULL(node);
+    if (IsEnterType(node->GetType())) {
+      auto out_nodes = node->GetOutAllNodes();
+      if (out_nodes.empty()) {
+        GELOGD("Delete enter node: %s which has no output.", node->GetName().c_str());
+        GE_CHK_STATUS_RET(GraphUtils::IsolateNode(node, {}))
+        GE_CHK_STATUS_RET(GraphUtils::RemoveNodeWithoutRelink(graph_, node))
+      }
+    }
+  }
+
+  return SUCCESS;
+}
+
 void MultiBatchGraphCopyer::LabelStatusForData(const NodePtr &data) {
   auto data_shape = NodeUtils::GetOutputDesc(*data, kDataOutIndex).GetShape();
   GELOGI("Label status for %s, shape_dims is %s.", data->GetName().c_str(),
@@ -297,6 +585,9 @@ Status MultiBatchGraphCopyer::LabelInBatchBranchStatus() {
       LabelStatusForGetNextSink(data);
     }
   }
+
+  map<string, vector<NodePtr>> frame_enters;
+  InitStatus(frame_enters);
   bool changed = true;
   // If anyone of in node is kNodeInBatchBranch, it is also kNodeInBatchBranch
   while (changed) {
@@ -306,12 +597,13 @@ Status MultiBatchGraphCopyer::LabelInBatchBranchStatus() {
       if (iter != origin_nodes_status_.end()) {
         continue;
       }
-      for (auto &in_node : node->GetInAllNodes()) {
-        bool is_in_batch = origin_nodes_status_.find(in_node.get()) != origin_nodes_status_.end() &&
-                           origin_nodes_status_[in_node.get()] == kNodeInBatchBranch;
-        if (is_in_batch) {
-          origin_nodes_status_[node.get()] = kNodeInBatchBranch;
-          changed = true;
+      for (auto &in_node : node->GetInDataNodes()) {
+        if (origin_nodes_status_.find(in_node.get()) != origin_nodes_status_.end()) {
+          if (origin_nodes_status_.find(node.get()) == origin_nodes_status_.end()) {
+            origin_nodes_status_[node.get()] == kNodeInBatchBranch;
+            ResetEnterStatus(frame_enters, node);
+            changed = true;
+          }
           break;
         }
       }
@@ -320,6 +612,45 @@ Status MultiBatchGraphCopyer::LabelInBatchBranchStatus() {
   return SUCCESS;
 }
 
+void MultiBatchGraphCopyer::InitStatus(map<string, vector<NodePtr>> &frame_enters) {
+  for (const auto &node : origin_all_nodes_) {
+    if (!IsEnterType(node->GetType())) {
+      continue;
+    }
+    auto op_desc = node->GetOpDesc();
+    if (op_desc == nullptr) {
+      continue;
+    }
+    string frame_name;
+    if (AttrUtils::GetStr(op_desc, ENTER_ATTR_FRAME_NAME, frame_name)) {
+      frame_enters[frame_name].emplace_back(node);
+    }
+  }
+
+  for (const auto &data : origin_data_nodes_) {
+    auto data_shape = NodeUtils::GetOutputDesc(*data, kDataOutIndex).GetShape();
+    if (!IsAllDimsPositive(data_shape.GetDims())) {
+      origin_nodes_status_[data.get()] = kNodeInBatchBranch;
+    }
+  }
+}
+
+void MultiBatchGraphCopyer::ResetEnterStatus(map<string, vector<NodePtr>> &frame_enters, const NodePtr &node) {
+  if (!IsEnterType(node->GetType())) {
+    return;
+  }
+
+  for (const auto &frame_enter : frame_enters) {
+    auto &enters = frame_enter.second;
+    if (std::find(enters.begin(), enters.end(), node) != enters.end()) {
+      for (const auto &enter : enters) {
+        origin_nodes_status_[enter.get()] = kNodeInBatchBranch;
+      }
+      break;
+    }
+  }
+}
+
 Status MultiBatchGraphCopyer::LabelStatus() {
   if (LabelInBatchBranchStatus() != SUCCESS) {
     GELOGE(PARAM_INVALID, "Failed to label no in batch branch");
@@ -1360,55 +1691,9 @@ Status MultiBatchGraphCopyer::LinkToNodeOutBranch(const NodePtr &node) {
   return SUCCESS;
 }
 
-Status MultiBatchGraphCopyer::InsertIdentityAfterSwitchN() {
-  for (auto &node : graph_->GetAllNodes()) {
-    if (node->GetType() != SWITCHN) {
-      continue;
-    }
-    auto switchn_desc = node->GetOpDesc();
-    GE_CHECK_NOTNULL(switchn_desc);
-    size_t i = 0;
-    for (auto &out_data_anchor : node->GetAllOutDataAnchors()) {
-      for (auto &in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) {
-        auto out_node = in_data_anchor->GetOwnerNode();
-        auto op_desc = out_node->GetOpDesc();
-        GE_CHECK_NOTNULL(op_desc);
-        if ((out_node->GetType() == MERGE) && (op_desc->HasAttr(ATTR_INSERT_BY_MBATCH))) {
-          GELOGD("No need to insert identity between %s and %s.", node->GetName().c_str(), out_node->GetName().c_str());
-          continue;
-        }
-
-        auto identity_desc = MakeShared<OpDesc>(node->GetName() + "_identity_" + std::to_string(i), IDENTITY);
-        GE_CHECK_NOTNULL(identity_desc);
-
-        string batch_label;
-        if (AttrUtils::GetStr(op_desc, ATTR_NAME_BATCH_LABEL, batch_label)) {
-          if (!AttrUtils::SetStr(identity_desc, ATTR_NAME_BATCH_LABEL, batch_label)) {
-            GELOGE(FAILED, "Set attr ATTR_NAME_BATCH_LABEL failed, node:%s.", identity_desc->GetName().c_str());
-            return FAILED;
-          }
-        }
-
-        auto data_desc = switchn_desc->GetOutputDesc(i);
-        i++;
-        GE_CHK_STATUS_RET(identity_desc->AddInputDesc("x", data_desc));
-        GE_CHK_STATUS_RET(identity_desc->AddOutputDesc("y", data_desc));
-
-        auto identity_node = graph_->AddNode(identity_desc);
-        GE_CHECK_NOTNULL(identity_node);
-        GE_CHK_STATUS_RET(out_data_anchor->LinkTo(identity_node->GetInDataAnchor(0)));
-        GE_CHECK_NOTNULL(identity_node->GetOutControlAnchor());
-        GE_CHK_STATUS_RET(identity_node->GetOutControlAnchor()->LinkTo(out_node->GetInControlAnchor()));
-      }
-    }
-  }
-
-  return SUCCESS;
-}
-
 Status ProcessMultiBatch(ComputeGraphPtr &graph) {
-  const char *multi_batch_with_case = std::getenv("MULTI_BATCH_WITH_CASE");
-  if (multi_batch_with_case != nullptr) {
+  const char *multi_batch_with_switchn = std::getenv("MULTI_BATCH_WITH_SWITCHN");
+  if (multi_batch_with_switchn == nullptr) {
     PassManager pass_manager;
     GE_CHK_STATUS_RET(pass_manager.AddPass("MultiBatchClonePass", new (std::nothrow) MultiBatchClonePass));
     return pass_manager.Run(graph);
diff --git a/ge/graph/preprocess/multi_batch_copy_graph.h b/ge/graph/preprocess/multi_batch_copy_graph.h
index a0de4413..d51c4c02 100644
--- a/ge/graph/preprocess/multi_batch_copy_graph.h
+++ b/ge/graph/preprocess/multi_batch_copy_graph.h
@@ -18,6 +18,7 @@
 #include <map>
 #include <queue>
 #include <vector>
+#include <set>
 
 #include "external/ge/ge_api_error_codes.h"
 
@@ -64,12 +65,26 @@ class MultiBatchGraphCopyer {
  private:
   Status Init();
   Status CheckArguments();
+  Status RelinkConstCtrlEdge();
+
+  Status ExtractUnchangedStructureOutofCycle();
+  Status GetEnterNodesGroupByFrame(std::map<std::string, std::vector<NodePtr>> &frame_enter);
+  Status GetNodeNeedExtract(const std::map<std::string, std::vector<NodePtr>> &frame_enter,
+                            std::queue<NodePtr> &nodes_to_extract);
+  bool AllInDataNodesUnchangeAndNoMergeOut(const NodePtr &node);
+  Status MoveInEntersInDataAnchorDown(NodePtr &node, OpDescPtr &enter_desc);
+  Status InsertEnterAfterNode(NodePtr &node, const OpDescPtr &enter_desc, std::set<NodePtr> &out_nodes);
+  Status MoveCtrlEdgeToOutNodes(NodePtr &node, std::set<NodePtr> &out_nodes);
+  Status DeleteEnterWithoutDataOut();
 
   // label status for origin_all_nodes_
   Status LabelStatus();
   Status LabelInBatchBranchStatus();
   void LabelStatusForData(const NodePtr &data);
   void LabelStatusForGetNextSink(const NodePtr &data);
+  void InitStatus(std::map<std::string, std::vector<NodePtr>> &frame_enters);
+  void ResetEnterStatus(std::map<std::string, std::vector<NodePtr>> &frame_enters, const NodePtr &node);
+
   // add nodes functions
   Status CreateNewNodes();
 
@@ -81,7 +96,6 @@ class MultiBatchGraphCopyer {
   Status InsertSwitchNForData(const NodePtr &node, const size_t &out_anchor_index, const size_t &peer_in_anchor_index,
                               std::vector<std::pair<Node *, NodePtr>> &dynamic_out_to_switchn);
 
-  Status InsertIdentityAfterSwitchN();
   Status UpdateMaxShapeToData(const NodePtr &node, size_t out_anchor_index);
   Status UpdateShapeOfShapeNode(const NodePtr &node, size_t out_anchor_index);
 
diff --git a/ge/graph/preprocess/multi_batch_options.cc b/ge/graph/preprocess/multi_batch_options.cc
index c26b08bc..8aab0981 100644
--- a/ge/graph/preprocess/multi_batch_options.cc
+++ b/ge/graph/preprocess/multi_batch_options.cc
@@ -37,17 +37,12 @@ constexpr int kDecimal = 10;
 constexpr uint8_t kMaxShapesCount = 100;
 constexpr uint8_t kMinShapesCount = 2;
 const int kDynmaicDims = -1;
-const int kDynamicBatchDynamicDimsNum = 1;
 const int kDynamicImgSizeDynamciDimsNum = 2;
-const size_t kMaxNDDimNum = 4;
-const size_t kMinNDDimNum = 1;
 const size_t kNumOfGetnextNode = 1;
 const int kDivisionConst = 2;
 const char *const kSubstrOfGetNextNosinkName = "IteratorGetNext";
 const char *const kShapeDataName = "ascend_mbatch_shape_data";
 const char *const kGetNextName = "IteratorV2";
-const char *const kExtAttrDataNodes = "data_nodes";
-const char *const kExtAttrGetNextNoSink = "getnext_no_sink";
 
 inline bool IsGetNextType(const NodePtr &node) {
   std::string original_type;
@@ -99,9 +94,8 @@ Status DistinguishGetNextAndData(ComputeGraphPtr &graph, vector<NodePtr> &data_n
   }
   GELOGI("Data count is %zu, getnext nosink count is %zu, getnext sink count is %zu.", data_nodes.size(),
          getnext_nosink_nodes.size(), getnext_sink_nodes.size());
-  GE_IF_BOOL_EXEC(!graph->SetExtAttr(kExtAttrDataNodes, data_nodes), GELOGW("Set data nodes attr failed.");)
-  GE_IF_BOOL_EXEC(!graph->SetExtAttr(kExtAttrGetNextNoSink, getnext_nosink_nodes),
-                  GELOGW("Set getnext nosink nodes attr failed.");)
+  GetLocalOmgContext().data_nodes = data_nodes;
+  GetLocalOmgContext().getnext_nosink_nodes = getnext_nosink_nodes;
   return SUCCESS;
 }
 
diff --git a/ge/host_cpu_engine/CMakeLists.txt b/ge/host_cpu_engine/CMakeLists.txt
index 97b5a0f5..cbd0bd8b 100644
--- a/ge/host_cpu_engine/CMakeLists.txt
+++ b/ge/host_cpu_engine/CMakeLists.txt
@@ -20,6 +20,7 @@ add_library(host_cpu_engine SHARED ${SRC_LIST} ${PROTO_HDRS})
 
 target_compile_options(host_cpu_engine PRIVATE
     -Werror
+    -fno-common
 )
 
 target_compile_definitions(host_cpu_engine PRIVATE
@@ -49,9 +50,7 @@ target_link_libraries(host_cpu_engine PRIVATE
     ascend_protobuf
     c_sec
     graph
-    register
     slog
-    runtime
     -Wl,--as-needed
 )
 
@@ -60,6 +59,7 @@ add_library(atc_host_cpu_engine SHARED ${SRC_LIST} ${PROTO_HDRS})
 
 target_compile_options(atc_host_cpu_engine PRIVATE
     -Werror
+    -fno-common
 )
 
 target_compile_definitions(atc_host_cpu_engine PRIVATE
@@ -90,9 +90,7 @@ target_link_libraries(atc_host_cpu_engine PRIVATE
     ascend_protobuf
     c_sec
     graph
-    register
     slog
-    runtime_compile
     -Wl,--as-needed
 )
 
@@ -106,6 +104,7 @@ add_library(host_cpu_opskernel_builder SHARED ${CPU_OPS_KERNEL_LIST})
 
 target_compile_options(host_cpu_opskernel_builder PRIVATE
     -Werror
+    -fno-common
 )
 
 target_compile_definitions(host_cpu_opskernel_builder PRIVATE
@@ -145,6 +144,7 @@ add_library(atc_host_cpu_opskernel_builder SHARED ${CPU_OPS_KERNEL_LIST})
 
 target_compile_options(atc_host_cpu_opskernel_builder PRIVATE
     -Werror
+    -fno-common
 )
 
 target_compile_definitions(atc_host_cpu_opskernel_builder PRIVATE
@@ -189,10 +189,12 @@ add_library(host_cpu_opskernel_builder_static STATIC ${CPU_OPS_KERNEL_LIST})
 
 target_compile_options(host_cpu_opskernel_builder_static PRIVATE
     -Werror
+    -fno-common
 )
 
 target_compile_definitions(host_cpu_opskernel_builder_static PRIVATE
     google=ascend_private
+    LOG_CPP
 )
 
 target_include_directories(host_cpu_opskernel_builder_static PRIVATE
diff --git a/ge/host_kernels/concat_v2_kernel.cc b/ge/host_kernels/concat_v2_kernel.cc
index a9f0da81..234d8c8a 100644
--- a/ge/host_kernels/concat_v2_kernel.cc
+++ b/ge/host_kernels/concat_v2_kernel.cc
@@ -120,7 +120,7 @@ Status ConcatV2Kernel::ConcatV2PreCompute(const std::vector<ConstGeTensorPtr> &i
                                           int &tidx,
                                           ConstGeTensorPtr &tensor) {
   size_t input_size = input.size();
-  // N >= 2 and N + 1 >= 3
+  // N + 1 is greater than or equal to 3
   if (input_size < kConcatV2InputNum) {
     GELOGI("The number of input for ConcatV2 must not be less than %zu.", kConcatV2InputNum);
     return NOT_CHANGED;
diff --git a/ge/host_kernels/dynamic_stitch_kernel.cc b/ge/host_kernels/dynamic_stitch_kernel.cc
index d26237f4..32611b03 100644
--- a/ge/host_kernels/dynamic_stitch_kernel.cc
+++ b/ge/host_kernels/dynamic_stitch_kernel.cc
@@ -33,6 +33,8 @@ namespace {
 const int kDoubleAttrN = 2;
 const int kFirstOutputDescIdx = 0;
 const int kMergedShapeSecondDim = 1;
+const size_t kNullTensorDimNum = 1;
+const int64_t kNullTensorDimValue = 0;
 const std::set<DataType> kSupportedTypeSet = {DT_INT8,  DT_UINT8, DT_INT16,   DT_UINT16, DT_INT32,
                                               DT_INT64, DT_BOOL,  DT_FLOAT16, DT_FLOAT,  DT_DOUBLE};
 }  // namespace
@@ -177,7 +179,14 @@ Status DynamicStitchKernel::StitchDataFollowIndices(int64_t data_unit, const vec
   int64_t src_offset = 0;
   std::set<int32_t> indices_set;
   for (int i = 0; i < n_; i++) {
-    auto indices_shape_size = input[i]->GetTensorDesc().GetShape().GetShapeSize();
+    GeShape indices_shape = input[i]->GetTensorDesc().GetShape();
+    size_t indices_dim_num = indices_shape.GetDimNum();
+    // skip null indices tensor
+    if (indices_dim_num == kNullTensorDimNum && indices_shape.GetDim(0) == kNullTensorDimValue) {
+      GELOGD("Input indices[%d] has null tensor, skip it.", i);
+      continue;
+    }
+    auto indices_shape_size = indices_shape.GetShapeSize();
     // to normalize logic, assume scalar as vector with shape of [1].
     indices_shape_size = (indices_shape_size == 0) ? 1 : indices_shape_size;
     // all index for input is less than size of input
diff --git a/ge/host_kernels/floordiv_kernel.cc b/ge/host_kernels/floordiv_kernel.cc
index e254af09..df381212 100644
--- a/ge/host_kernels/floordiv_kernel.cc
+++ b/ge/host_kernels/floordiv_kernel.cc
@@ -112,8 +112,8 @@ void FloorDivKernel::ShapeCal(const std::vector<ge::ConstGeTensorPtr> &input, Ge
 template <typename T>
 T FloorDivKernel::DivCal(const T &x_i, const T &y_i) {
   if ((x_i < static_cast<T>(0)) != (y_i < static_cast<T>(0))) {
-    T abs_x_i = std::abs(x_i);
-    T abs_y_i = std::abs(y_i);
+    T abs_x_i = x_i < 0 ? -x_i : x_i;
+    T abs_y_i = y_i < 0 ? -y_i : y_i;
     return static_cast<T>(static_cast<int32_t>(-(abs_x_i + abs_y_i - 1) / abs_y_i));
   } else {
     return static_cast<T>(static_cast<int32_t>(x_i / y_i));
diff --git a/ge/host_kernels/floordiv_kernel.h b/ge/host_kernels/floordiv_kernel.h
index d3dc3ff7..b8f6dd12 100755
--- a/ge/host_kernels/floordiv_kernel.h
+++ b/ge/host_kernels/floordiv_kernel.h
@@ -40,10 +40,6 @@ class FloorDivKernel : public Kernel {
   template <typename T>
   Status DataCal(const std::vector<ConstGeTensorPtr> &input, ge::GeTensorPtr output_ptr);
   Status ComputeByDataType(DataType data_type, const std::vector<ConstGeTensorPtr> &input, GeTensorPtr output_ptr);
-
-  int64_t axis_dim_;
-  int64_t head_dim_;
-  int64_t end_dim_;
 };
 }  // namespace ge
 
diff --git a/ge/host_kernels/gather_v2_kernel.cc b/ge/host_kernels/gather_v2_kernel.cc
index e52b4534..ee73626b 100644
--- a/ge/host_kernels/gather_v2_kernel.cc
+++ b/ge/host_kernels/gather_v2_kernel.cc
@@ -40,6 +40,10 @@ const size_t kGatherV2InpotNum = 3;
 const size_t kMaxIndicatesDims = 1;  // only support scalar and 1 dims indicates_
 const std::set<DataType> supported_type = {DT_FLOAT16, DT_DOUBLE, DT_INT8,   DT_INT16,  DT_INT16, DT_INT32,
                                            DT_INT64,   DT_UINT8,  DT_UINT16, DT_UINT32, DT_UINT64};
+const int64_t DIM_AXIS_0 = 0;
+const int64_t DIM_AXIS_1 = 1;
+const int64_t DIM_AXIS_2 = 2;
+const int64_t DIM_AXIS_3 = 3;
 }  // namespace
 template <typename T>
 Status GatherV2Kernel::ProcessAxis0(ConstGeTensorPtr tensor_x, GeTensorPtr output) {
@@ -191,16 +195,16 @@ Status GatherV2Kernel::GenData(const int64_t data_num, ConstGeTensorPtr tensor_x
 
   Status ret = SUCCESS;
   switch (axis) {
-    case 0:
+    case DIM_AXIS_0:
       ret = ProcessAxis0<T>(tensor_x, output);
       break;
-    case 1:
+    case DIM_AXIS_1:
       ret = ProcessAxis1<T>(tensor_x, output);
       break;
-    case 2:
+    case DIM_AXIS_2:
       ret = ProcessAxis2<T>(tensor_x, output);
       break;
-    case 3:
+    case DIM_AXIS_3:
       ret = ProcessAxis3<T>(tensor_x, output);
       break;
     default:
diff --git a/ge/host_kernels/range_kernel.cc b/ge/host_kernels/range_kernel.cc
index 32a72b47..97254fff 100644
--- a/ge/host_kernels/range_kernel.cc
+++ b/ge/host_kernels/range_kernel.cc
@@ -32,6 +32,9 @@ namespace ge {
 namespace {
 constexpr size_t kRangeInputNum = 3;
 constexpr uint32_t kRangeDimNum = 0;
+constexpr size_t kStartIndex = 0;
+constexpr size_t kLimitIndex = 1;
+constexpr size_t kDeltaIndex = 2;
 const std::set<DataType> kRangeSupportedType = {DT_INT32, DT_FLOAT};
 }  // namespace
 
@@ -53,9 +56,9 @@ Status RangeKernel::Compute(const OpDescPtr op_desc_ptr, const std::vector<Const
     return MEMALLOC_FAILED;
   }
 
-  ConstGeTensorPtr start = input.at(0);
-  ConstGeTensorPtr limit = input.at(1);
-  ConstGeTensorPtr delta = input.at(2);
+  ConstGeTensorPtr start = input.at(kStartIndex);
+  ConstGeTensorPtr limit = input.at(kLimitIndex);
+  ConstGeTensorPtr delta = input.at(kDeltaIndex);
   DataType data_type = delta->GetTensorDesc().GetDataType();
   if (data_type == DT_FLOAT) {
     if (GetRange(*reinterpret_cast<const float *>(start->GetData().data()),
diff --git a/ge/host_kernels/ssd_prior_box_kernel.cc b/ge/host_kernels/ssd_prior_box_kernel.cc
index b3a0fc3e..3661fa9d 100644
--- a/ge/host_kernels/ssd_prior_box_kernel.cc
+++ b/ge/host_kernels/ssd_prior_box_kernel.cc
@@ -180,14 +180,18 @@ Status SsdPriorboxKernel::SetVariance(const vector<float> &variance, const int d
   return SUCCESS;
 }
 
-Status SsdPriorboxKernel::GetNumPriorAndDimSize(uint aspect_ratios_size, uint min_sizes_size, uint max_sizes_size,
-                                                int layer_width, int layer_height, int &num_priors,
+Status SsdPriorboxKernel::GetNumPriorAndDimSize(uint32_t aspect_ratios_size,
+                                                uint32_t min_sizes_size,
+                                                uint32_t max_sizes_size,
+                                                int layer_width,
+                                                int layer_height,
+                                                int &num_priors,
                                                 int &dim_size) const {
   if (ge::CheckUint32MulOverflow(min_sizes_size, aspect_ratios_size) != SUCCESS) {
     return PARAM_INVALID;
   }
 
-  uint tmp_value = aspect_ratios_size * min_sizes_size;
+  uint32_t tmp_value = aspect_ratios_size * min_sizes_size;
   if (ge::CheckUint32AddOverflow(tmp_value, max_sizes_size) != SUCCESS) {
     GELOGW("Failed to get list param.");
     return PARAM_INVALID;
@@ -199,7 +203,7 @@ Status SsdPriorboxKernel::GetNumPriorAndDimSize(uint aspect_ratios_size, uint mi
     return PARAM_INVALID;
   }
   num_priors = static_cast<int>(tmp_value);
-  
+
   if (ge::CheckIntMulOverflow(layer_width, layer_height) != SUCCESS) {
     GELOGW("Failed to get list param.");
     return PARAM_INVALID;
@@ -288,7 +292,7 @@ std::unique_ptr<float[]> SsdPriorboxKernel::BoundaryCalulate(int dim_size, int l
     }
   }
 
-  return std::move(output_data);
+  return output_data;
 }
 
 Status SsdPriorboxKernel::Compute(const NodePtr &node, std::vector<GeTensorPtr> &v_output) {
diff --git a/ge/host_kernels/ssd_prior_box_kernel.h b/ge/host_kernels/ssd_prior_box_kernel.h
index 0ebf221d..c08217e2 100755
--- a/ge/host_kernels/ssd_prior_box_kernel.h
+++ b/ge/host_kernels/ssd_prior_box_kernel.h
@@ -100,8 +100,8 @@ class SsdPriorboxKernel : public Kernel {
    * @return OTHERS:  Execution failed
    * @author
    */
-  Status GetNumPriorAndDimSize(uint aspect_ratios_size, uint min_sizes_size, uint max_sizes_size, int layer_width,
-                               int layer_height, int &num_priors, int &dim_size) const;
+  Status GetNumPriorAndDimSize(uint32_t aspect_ratios_size, uint32_t min_sizes_size, uint32_t max_sizes_size,
+                               int layer_width, int layer_height, int &num_priors, int &dim_size) const;
   void DataCalulate(float x, float y, float box_x, float box_y, int img_x, int img_y, vector<float> &result);
   std::unique_ptr<float[]> BoundaryCalulate(int dim_size, int layer_width, int layer_height, float step_width,
                                             float step_height, int img_width, int img_height, float offset,
diff --git a/ge/host_kernels/strided_slice_kernel.cc b/ge/host_kernels/strided_slice_kernel.cc
index 2fe74415..b1bfb10a 100644
--- a/ge/host_kernels/strided_slice_kernel.cc
+++ b/ge/host_kernels/strided_slice_kernel.cc
@@ -272,6 +272,10 @@ Status StridedSliceKernel::InitParamWithAttrs(const std::vector<ConstGeTensorPtr
 void StridedSliceKernel::ExpandDimsWithNewAxis(const ConstGeTensorPtr &begin_tensor, const size_t x_dims_num,
                                                vector<int64_t> &x_dims) {
   auto begin_data_type_size = GetSizeByDataType(begin_tensor->GetTensorDesc().GetDataType());
+  if (begin_data_type_size == 0) {
+    GELOGW("Param begin_data_type_size should not be zero.");
+    return;
+  }
   size_t begin_vec_size = begin_tensor->GetData().size() / begin_data_type_size;
   auto final_dim_num = x_dims_num < begin_vec_size ? begin_vec_size : x_dims_num;
   for (size_t i = 0; i < final_dim_num; i++) {
@@ -284,8 +288,10 @@ void StridedSliceKernel::ExpandDimsWithNewAxis(const ConstGeTensorPtr &begin_ten
 }
 
 void StridedSliceKernel::ExpandStrideWithEllipsisMask(const size_t x_dims_num, 
-                                    const vector<int64_t> &x_dims, vector<int64_t> &orig_begin_vec,
-                                    vector<int64_t> &orig_end_vec, vector<int64_t> &orig_stride_vec) {
+                                                      const vector<int64_t> &x_dims, 
+                                                      vector<int64_t> &orig_begin_vec,
+                                                      vector<int64_t> &orig_end_vec, 
+                                                      vector<int64_t> &orig_stride_vec) {
   
   if (attr_value_map_.at(STRIDE_SLICE_ATTR_ELLIPSIS_MASK) != 0) {
     auto end_mask = attr_value_map_.at(STRIDE_SLICE_ATTR_END_MASK);
@@ -308,7 +314,7 @@ void StridedSliceKernel::ExpandStrideWithEllipsisMask(const size_t x_dims_num,
         if (orig_begin_vec.size() < x_dims_num) {
           for (size_t j = 1; j < (x_dims_num - orig_begin_vec.size() + 1); ++j) {
             orig_begin_vec.insert((orig_begin_vec.begin() + ellipsis_dim + j), 0);
-            orig_end_vec.insert((orig_end_vec.begin() + ellipsis_dim + j), x_dims.at(ellipsis_dim +j));
+            orig_end_vec.insert((orig_end_vec.begin() + ellipsis_dim + j), x_dims.at(ellipsis_dim + j));
             orig_stride_vec.insert((orig_stride_vec.begin() + ellipsis_dim + j), 1);
           }
         }
diff --git a/ge/hybrid/common/npu_memory_allocator.cc b/ge/hybrid/common/npu_memory_allocator.cc
index f506caec..ccd6a624 100644
--- a/ge/hybrid/common/npu_memory_allocator.cc
+++ b/ge/hybrid/common/npu_memory_allocator.cc
@@ -20,9 +20,12 @@
 #include "graph/manager/graph_caching_allocator.h"
 #include "graph/manager/graph_mem_allocator.h"
 #include "graph/manager/rdma_pool_allocator.h"
+#include "graph/manager/host_mem_allocator.h"
 
 namespace ge {
 namespace hybrid {
+const size_t kPaddingUnit = 2;
+
 size_t kMaxHbmMemorySize = 1024UL * 1024UL * 1024UL * 1024UL; // 1024G
 
 std::map<uint32_t, std::unique_ptr<NpuMemoryAllocator>> NpuMemoryAllocator::allocators_;
@@ -62,7 +65,7 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) {
   if (mem_type == RDMA_HBM) {
     buffer = MemManager::Instance().RdmaPoolInstance(RT_MEMORY_HBM).Malloc(allocate_size, device_id_);
   } else if (mem_type == HOST_DDR) {
-    buffer = malloc(allocate_size);
+    buffer = MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Malloc(allocate_size);
   } else {
     if (allocate_size > kMaxHbmMemorySize) {
       GELOGE(PARAM_INVALID, "Invalid HBM memory size: %zu", allocate_size);
@@ -77,7 +80,7 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) {
       }
     }
     // padding up to multiple of padding, and add extra padding
-    allocate_size = (size + 2 * padding - 1) / padding * padding;
+    allocate_size = (size + kPaddingUnit * padding - 1) / padding * padding;
     GELOGD("Padding size %ld by %d. final size = %zu.", size, padding, allocate_size);
     buffer = MemManager::Instance()
                  .CachingInstance(RT_MEMORY_HBM)
@@ -99,7 +102,7 @@ void NpuMemoryAllocator::Deallocate(void *data, MemStorageType mem_type) {
     if (mem_type == RDMA_HBM) {
       MemManager::Instance().RdmaPoolInstance(RT_MEMORY_HBM).Free(reinterpret_cast<uint8_t *>(data), device_id_);
     } else if (mem_type == HOST_DDR) {
-      free(data);
+      MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Free(data);
     } else {
       MemManager::Instance().CachingInstance(RT_MEMORY_HBM).Free(reinterpret_cast<uint8_t *>(data), device_id_);
     }
diff --git a/ge/hybrid/common/tensor_value.cc b/ge/hybrid/common/tensor_value.cc
index a1a57f63..16ecfaa4 100644
--- a/ge/hybrid/common/tensor_value.cc
+++ b/ge/hybrid/common/tensor_value.cc
@@ -56,7 +56,7 @@ std::unique_ptr<TensorBuffer> TensorBuffer::Create(void *buffer, size_t size) {
 }
 
 TensorBuffer::~TensorBuffer() {
-  if (allocator_ != nullptr && buffer_ != nullptr) {
+  if (allocator_ != nullptr) {
     allocator_->Deallocate(buffer_, mem_type_);
     buffer_ = nullptr;
   }
diff --git a/ge/hybrid/executor/hybrid_execution_context.cc b/ge/hybrid/executor/hybrid_execution_context.cc
index 491220be..87207e94 100644
--- a/ge/hybrid/executor/hybrid_execution_context.cc
+++ b/ge/hybrid/executor/hybrid_execution_context.cc
@@ -18,6 +18,12 @@
 
 namespace ge {
 namespace hybrid {
+namespace {
+const uint32_t kEndOfSequence = 0x0704000a;
+const uint32_t kEndOfSequenceNew = 507005;
+const int32_t kModelAbortNormal = 0x0704000e;
+const int32_t kModelAbortNormalNew = 507024;
+}  // namespace
 void GraphExecutionContext::SetErrorCode(Status error_code) {
   std::lock_guard<std::mutex> lk(mu);
   this->status = error_code;
@@ -27,5 +33,26 @@ Status GraphExecutionContext::GetStatus() const {
   std::lock_guard<std::mutex> lk(mu);
   return this->status;
 }
+
+Status GraphExecutionContext::Synchronize(rtStream_t rt_stream) {
+  auto rt_ret = rtStreamSynchronize(rt_stream);
+  if (rt_ret == RT_ERROR_NONE) {
+    return SUCCESS;
+  }
+
+  if (rt_ret == kEndOfSequence || rt_ret == kEndOfSequenceNew) {
+    GELOGI("Got end of sequence");
+    is_eos_ = true;
+    return END_OF_SEQUENCE;
+  }
+
+  if (rt_ret == kModelAbortNormal || rt_ret == kModelAbortNormalNew) {
+    GELOGI("The model with multiple datasets aborts normally");
+    return SUCCESS;
+  }
+
+  GELOGE(RT_FAILED, "Failed to invoke rtStreamSynchronize, ret = %d", rt_ret);
+  return RT_FAILED;
+}
 }  // namespace hybrid
 }  // namespace ge
\ No newline at end of file
diff --git a/ge/hybrid/executor/hybrid_execution_context.h b/ge/hybrid/executor/hybrid_execution_context.h
index 0910d2c7..c398e83d 100644
--- a/ge/hybrid/executor/hybrid_execution_context.h
+++ b/ge/hybrid/executor/hybrid_execution_context.h
@@ -22,6 +22,7 @@
 #include "common/blocking_queue.h"
 #include "common/properties_manager.h"
 #include "framework/common/debug/ge_log.h"
+#include "graph/ge_local_context.h"
 #include "hybrid/common/npu_memory_allocator.h"
 #include "hybrid/common/tensor_value.h"
 #include "hybrid/executor/hybrid_profiler.h"
@@ -30,14 +31,30 @@
 #include "hybrid/executor/rt_callback_manager.h"
 #include "hybrid/model/hybrid_model.h"
 
+// If expr is not SUCCESS, print the log and return the same value
+#define HYBRID_CHK_STATUS_RET(expr, ...)        \
+  do {                                          \
+    const ge::Status _status = (expr);          \
+    if (_status != ge::SUCCESS) {               \
+      if (_status == ge::END_OF_SEQUENCE) {     \
+        GELOGD("Got end of sequence");          \
+      } else {                                  \
+        GELOGE(_status, __VA_ARGS__);           \
+      }                                         \
+      return _status;                           \
+    }                                           \
+  } while (0)
+
 namespace ge {
 namespace hybrid {
 struct GraphExecutionContext {
   void SetErrorCode(Status error_code);
   Status GetStatus() const;
+  Status Synchronize(rtStream_t rt_stream);
 
   uint64_t session_id = 0;
   const HybridModel *model = nullptr;
+  const GEThreadLocalContext *ge_context = nullptr;
   rtStream_t stream = nullptr;
   rtContext_t rt_context = nullptr;
   rtContext_t rt_gen_context = nullptr;
@@ -47,6 +64,7 @@ struct GraphExecutionContext {
   DumpProperties dump_properties;
   bool trace_enabled = false;
   bool dump_enabled = false;
+  std::atomic_bool is_eos_;
   long profiling_level = 0;
   long iteration = 0;
   Status status = SUCCESS;
@@ -57,7 +75,8 @@ struct GraphExecutionContext {
 do { \
   if ((context != nullptr) && (context)->profiler != nullptr) { \
     if (node_name != nullptr) { \
-      context->profiler->RecordEvent(evt_type, "tid:%lu [%s] [%s] " fmt, GeLog::GetTid(), node_name, category, ##__VA_ARGS__);\
+      context->profiler->RecordEvent(evt_type, "tid:%lu [%s] [%s] " fmt, GeLog::GetTid(), node_name, category, \
+                                     ##__VA_ARGS__); \
     } else { \
       context->profiler->RecordEvent(evt_type, "tid:%lu [%s] " fmt, GeLog::GetTid(), category, ##__VA_ARGS__); \
     }\
@@ -77,7 +96,7 @@ do { \
   RECORD_PROFILING_EVENT((context), HybridProfiler::EXECUTION, fmt, "Execution", name,  ##__VA_ARGS__)
 
 #define RECORD_CALLBACK_EVENT(context, name, fmt, ...) \
-  RECORD_PROFILING_EVENT((context), HybridProfiler::CALLBACK, fmt, "Callback", name,  ##__VA_ARGS__)
+  RECORD_PROFILING_EVENT((context), HybridProfiler::CALLBACKS, fmt, "Callback", name,  ##__VA_ARGS__)
 }  // namespace hybrid
 }  // namespace ge
 #endif // GE_HYBRID_EXECUTOR_HYBRID_EXECUTION_CONTEXT_H_
diff --git a/ge/hybrid/executor/hybrid_model_async_executor.cc b/ge/hybrid/executor/hybrid_model_async_executor.cc
index 91996ab3..e9881224 100644
--- a/ge/hybrid/executor/hybrid_model_async_executor.cc
+++ b/ge/hybrid/executor/hybrid_model_async_executor.cc
@@ -24,7 +24,7 @@
 namespace ge {
 namespace hybrid {
 namespace {
-int kDataOutputIndex = 0;
+const int kDataOutputIndex = 0;
 }
 HybridModelAsyncExecutor::HybridModelAsyncExecutor(HybridModel *model)
     : model_(model), run_flag_(false) {
@@ -94,14 +94,14 @@ Status HybridModelAsyncExecutor::Init() {
   executor_ = std::unique_ptr<HybridModelExecutor>(new(std::nothrow) HybridModelExecutor(model_, device_id_, stream_));
   GE_CHECK_NOTNULL(executor_);
   GE_CHK_STATUS_RET(executor_->Init(), "Failed to init hybrid engine");
-  GE_CHK_STATUS_RET(InitInputTensors(), "Failed to init input tensors");
+  GE_CHK_STATUS_RET(InitInputDesc(), "Failed to init input tensors");
   return SUCCESS;
 }
 
-Status HybridModelAsyncExecutor::PreRun(InputData &current_data) {
+Status HybridModelAsyncExecutor::PreRun(InputData &current_data, HybridModelExecutor::ExecuteArgs &args) {
   GE_CHK_STATUS_RET(SyncVarData(), "Failed to sync var data");
   RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[SyncVarData] End");
-  GE_CHK_STATUS_RET(CopyInputData(current_data), "Failed to copy input data to model");
+  GE_CHK_STATUS_RET(PrepareInputs(current_data, args), "Failed to copy input data to model");
   RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[CopyInputData] End");
   return SUCCESS;
 }
@@ -126,14 +126,9 @@ Status HybridModelAsyncExecutor::RunInternal() {
     InputData current_data = data_wrapper->GetInput();
     GELOGI("Model thread Run begin, model id:%u, data index:%u.", model_id_, current_data.index);
 
-    HybridModelExecutor::ExecuteArgs args;
-    args.inputs.resize(input_tensors_.size());
-    for (auto &it : input_tensors_) {
-      args.inputs[it.first] = it.second;
-    }
-
     RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[RunInternal] [iteration = %d] Start", iterator_count_);
-    ret = PreRun(current_data);
+    HybridModelExecutor::ExecuteArgs args;
+    ret = PreRun(current_data, args);
     GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(
         ret != SUCCESS, (void) HandleResult(ret, current_data.index, args, data_wrapper->GetOutput());
         CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
@@ -162,9 +157,10 @@ Status HybridModelAsyncExecutor::HandleResult(Status exec_ret,
                                               OutputData *output_data) {
   GELOGD("Start to handle result. model id = %u, data index = %u, execution ret = %u", model_id_, data_id, exec_ret);
   std::vector<ge::OutputTensorInfo> output_tensor_info_list;
-  if (exec_ret == END_OF_SEQUENCE) {
-    GELOGW("End of sequence, model id = %u", model_id_);
-    return OnComputeDone(data_id, END_OF_SEQUENCE, output_tensor_info_list);
+  if (args.is_eos) {
+    GELOGI("End of sequence, model id = %u", model_id_);
+    GE_CHK_STATUS_RET_NOLOG(OnComputeDone(data_id, END_OF_SEQUENCE, output_tensor_info_list));
+    return SUCCESS;
   }
 
   if (exec_ret != SUCCESS) {
@@ -202,30 +198,57 @@ Status HybridModelAsyncExecutor::SyncVarData() {
   return SUCCESS;
 }
 
-Status HybridModelAsyncExecutor::CopyInputData(const InputData &current_data) {
+Status HybridModelAsyncExecutor::PrepareInputs(const InputData &current_data, HybridModelExecutor::ExecuteArgs &args) {
+  if (current_data.blobs.size() < input_tensor_desc_.size()) {
+    GELOGE(PARAM_INVALID, "Blob size mismatches, expect at least %zu, but got %zu",
+           input_tensor_desc_.size(), current_data.blobs.size());
+    return PARAM_INVALID;
+  }
+
+  auto allocator = NpuMemoryAllocator::GetAllocator(device_id_);
+  GE_CHECK_NOTNULL(allocator);
+  args.input_desc.resize(input_tensor_desc_.size());
   const std::vector<DataBuffer> &blobs = current_data.blobs;
-  for (const auto &it : input_tensors_) {
-    auto input_index = it.first;
-    auto input_tensor = it.second;
-    auto data_size = input_tensor.GetSize();
-    GELOGD("To copy input data for input[%u]", input_index);
-    if (input_index >= blobs.size()) {
-      GELOGE(FAILED, "Blobs not match: blobs=%zu, tensor=%zu, index=%u, size=%ld",
-             blobs.size(), model_->input_nodes_.size(), input_index, data_size);
-      return FAILED;
+  for (size_t input_index = 0; input_index < input_tensor_desc_.size(); ++input_index) {
+    auto tensor_size = input_sizes_[input_index];
+    if (is_input_dynamic_[input_index]) {
+      if (input_index >= current_data.shapes.size()) {
+        GELOGE(PARAM_INVALID, "Shape index out of range, index = %zu, shape size = %zu",
+               input_index, current_data.shapes.size());
+        return PARAM_INVALID;
+      }
+      auto &tensor_desc = input_tensor_desc_[input_index];
+      tensor_desc->SetShape(GeShape(current_data.shapes[input_index]));
+      args.input_desc[input_index] = tensor_desc;
+      GELOGD("Update shape of input[%u] to [%s]", input_index, tensor_desc->MutableShape().ToString().c_str());
+      GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetTensorMemorySizeInBytes(*tensor_desc, tensor_size),
+                              "Failed to calc tensor size, index = %zu, shape = [%s]",
+                              input_index,
+                              tensor_desc->GetShape().ToString().c_str());
+      GELOGD("Input tensor[%zu] size = %zu", input_index, tensor_size);
     }
 
+    GE_CHECK_GE(tensor_size, 0);
+    auto tensor_buffer = TensorBuffer::Create(allocator, tensor_size);
+    GE_CHECK_NOTNULL(tensor_buffer);
+    args.inputs.emplace_back(std::shared_ptr<TensorBuffer>(tensor_buffer.release()));
+
+    GELOGD("To copy input data for input[%u]", input_index);
     const DataBuffer &data_buf = blobs[input_index];
-    auto mem_size = static_cast<uint32_t>(data_size);
+    auto mem_size = static_cast<uint64_t>(tensor_size);
     GE_CHK_BOOL_RET_STATUS(mem_size >= data_buf.length,
                            PARAM_INVALID,
-                           "input data size(%lu) does not match model required size(%u), ret failed.",
+                           "input data size(%lu) does not match model required size(%lu), ret failed.",
                            data_buf.length,
                            mem_size);
 
     GELOGI("[IMAS]CopyPlainData memcpy graph_%u type[F] output[%u] memaddr[%p] mem_size[%u] datasize[%lu]",
-           model_->root_runtime_param_.graph_id, input_index, input_tensor.GetData(), mem_size, data_buf.length);
-    GE_CHK_RT_RET(rtMemcpy(input_tensor.MutableData(),
+           model_->root_runtime_param_.graph_id,
+           input_index,
+           args.inputs[input_index].GetData(),
+           mem_size,
+           data_buf.length);
+    GE_CHK_RT_RET(rtMemcpy(args.inputs[input_index].MutableData(),
                            mem_size,
                            data_buf.data,
                            data_buf.length,
@@ -235,29 +258,32 @@ Status HybridModelAsyncExecutor::CopyInputData(const InputData &current_data) {
   return SUCCESS;
 }
 
-Status HybridModelAsyncExecutor::InitInputTensors() {
-  auto allocator = NpuMemoryAllocator::GetAllocator(device_id_);
-  GE_CHECK_NOTNULL(allocator);
+Status HybridModelAsyncExecutor::InitInputDesc() {
   int input_index = 0;
   for (const auto &input_node : model_->GetRootGraphItem()->GetInputNodes()) {
-    GELOGD("Init input[%u], node = %s", input_index, input_node->NodeName().c_str());
+    GELOGD("Init input[%u], node = %s, is_dynamic = %d",
+           input_index,
+           input_node->NodeName().c_str(),
+           input_node->is_dynamic);
     auto output_desc = input_node->MutableOutputDesc(kDataOutputIndex);
     GE_CHECK_NOTNULL(output_desc);
-    int64_t tensor_size = 0;
-    GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetSize(*output_desc, tensor_size),
-                            "Failed to get size from %s",
-                            input_node->NodeName().c_str());
-    if (tensor_size == 0) {
-      GELOGW("[%s] Tensor size == 0", input_node->NodeName().c_str());
-      GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetTensorMemorySizeInBytes(*output_desc, tensor_size),
-                              "Failed to calc tensor size");
-      GELOGD("[%s] Tensor size updated to %ld", input_node->NodeName().c_str(), tensor_size);
+    int64_t tensor_size = -1;
+    if (!input_node->is_dynamic) {
+      GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetSize(*output_desc, tensor_size),
+                              "Failed to get size from %s",
+                              input_node->NodeName().c_str());
+
+      if (tensor_size == 0) {
+        GELOGW("[%s] Tensor size == 0", input_node->NodeName().c_str());
+        GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetTensorMemorySizeInBytes(*output_desc, tensor_size),
+                                "Failed to calc tensor size");
+        GELOGD("[%s] Tensor size updated to %ld", input_node->NodeName().c_str(), tensor_size);
+      }
     }
-    auto buffer = TensorBuffer::Create(allocator, tensor_size);
-    GE_CHECK_NOTNULL(buffer);
-    TensorValue tensor(shared_ptr<TensorBuffer>(buffer.release()));
-    tensor.SetName("Input_" + input_node->NodeName());
-    input_tensors_.emplace(input_index, tensor);
+
+    input_sizes_.emplace(input_index, tensor_size);
+    input_tensor_desc_.emplace(input_index, output_desc);
+    is_input_dynamic_.push_back(input_node->is_dynamic);
     input_index += 1;
   }
 
@@ -379,11 +405,13 @@ Status HybridModelAsyncExecutor::Execute(const std::vector<DataBuffer> &inputs,
     }
     if (output_real_size > 0) {
       if (outputs[i].length < static_cast<uint64_t>(output_real_size)) {
-        GELOGE(FAILED, "output idx[%zu], the memory size of output[%lu] given by user should be greater than or equal to the real size of output[%ld]",
+        GELOGE(FAILED, "output idx[%zu], the memory size of output[%lu] given by "
+                       "user should be greater than or equal to the real size of output[%ld]",
                i, outputs[i].length, output_real_size);
         return FAILED;
       }
-      GE_CHK_RT_RET(rtMemcpy(outputs[i].data, outputs[i].length, args.outputs[i].GetData(), output_real_size, RT_MEMCPY_DEVICE_TO_DEVICE));
+      GE_CHK_RT_RET(rtMemcpy(outputs[i].data, outputs[i].length, args.outputs[i].GetData(), output_real_size, 
+                    RT_MEMCPY_DEVICE_TO_DEVICE));
     }
     outputs[i].length = output_real_size;
   }
@@ -400,18 +428,12 @@ Status HybridModelAsyncExecutor::Execute(const vector<GeTensor> &inputs, vector<
     buffer.data = const_cast<uint8_t *>(tensor.GetData().GetData());
     buffer.length = tensor.GetData().size();
     input_data.blobs.emplace_back(buffer);
+    input_data.shapes.emplace_back(tensor.GetTensorDesc().GetShape().GetDims());
   }
-  GE_CHK_STATUS_RET(CopyInputData(input_data), "Failed to copy input data to model");
-  GELOGD("Done copying input data successfully.");
 
   HybridModelExecutor::ExecuteArgs args;
-  args.inputs.resize(input_tensors_.size());
-  args.input_desc.resize(input_tensors_.size());
-  for (auto &it : input_tensors_) {
-    args.inputs[it.first] = it.second;
-    args.input_desc[it.first] = MakeShared<GeTensorDesc>(inputs[it.first].GetTensorDesc());
-  }
-
+  GE_CHK_STATUS_RET(PrepareInputs(input_data, args), "Failed to copy input data to model");
+  GELOGD("Done copying input data successfully.");
   GE_CHK_STATUS_RET(executor_->Execute(args), "Failed to execute model.");
 
   std::vector<ge::OutputTensorInfo> output_tensor_info_list;
diff --git a/ge/hybrid/executor/hybrid_model_async_executor.h b/ge/hybrid/executor/hybrid_model_async_executor.h
index 21833b0b..21d2d033 100644
--- a/ge/hybrid/executor/hybrid_model_async_executor.h
+++ b/ge/hybrid/executor/hybrid_model_async_executor.h
@@ -53,7 +53,7 @@ class HybridModelAsyncExecutor {
   Status EnqueueData(const std::shared_ptr<InputDataWrapper> &data);
 
  private:
-  Status InitInputTensors();
+  Status InitInputDesc();
 
   Status RunInternal();
 
@@ -70,9 +70,9 @@ class HybridModelAsyncExecutor {
 
   Status OnComputeDone(uint32_t data_index, uint32_t result_code, std::vector<ge::OutputTensorInfo> &outputs);
 
-  Status PreRun(InputData &current_data);
+  Status PreRun(InputData &current_data, HybridModelExecutor::ExecuteArgs &args);
 
-  Status CopyInputData(const InputData &current_data);
+  Status PrepareInputs(const InputData &current_data, HybridModelExecutor::ExecuteArgs &args);
 
   std::mutex mu_;
   HybridModel *model_;
@@ -85,7 +85,9 @@ class HybridModelAsyncExecutor {
   uint64_t iterator_count_ = 0;
 
   rtStream_t stream_ = nullptr;
-  std::map<uint32_t, TensorValue> input_tensors_;
+  std::map<uint32_t, int64_t> input_sizes_;
+  std::map<uint32_t, GeTensorDescPtr> input_tensor_desc_;
+  std::vector<bool> is_input_dynamic_;
   std::shared_ptr<ModelListener> listener_;
 };
 }  // namespace hybrid
diff --git a/ge/hybrid/executor/hybrid_model_executor.cc b/ge/hybrid/executor/hybrid_model_executor.cc
index 4af34451..c47dafc1 100755
--- a/ge/hybrid/executor/hybrid_model_executor.cc
+++ b/ge/hybrid/executor/hybrid_model_executor.cc
@@ -50,15 +50,18 @@ Status HybridModelExecutor::Execute(HybridModelExecutor::ExecuteArgs &args) {
   auto ret = ExecuteGraphInternal(executor, args);
   Cleanup();
   RECORD_MODEL_EXECUTION_EVENT(&context_, "[Cleanup] End");
-  GE_CHK_STATUS_RET(ret, "Failed to execute model");
   GELOGD("Model executed successfully.");
-
   if (context_.profiler != nullptr) {
     context_.profiler->Dump(std::cout);
     context_.profiler->Reset();
   }
 
   context_.iteration += 1;
+  if (ret == END_OF_SEQUENCE) {
+    args.is_eos = true;
+  } else {
+    GE_CHK_STATUS_RET(ret, "Failed to execute model");
+  }
   return SUCCESS;
 }
 
@@ -68,13 +71,13 @@ Status HybridModelExecutor::ExecuteGraphInternal(SubgraphExecutor &executor,
   GE_CHK_STATUS_RET_NOLOG(ResetExecutionContext(context_));
   RECORD_MODEL_EXECUTION_EVENT(&context_, "[InitContext] End");
 
-  GE_CHK_STATUS_RET(executor.ExecuteAsync(args.inputs, args.input_desc), "Failed to execute partitioned call.");
+  HYBRID_CHK_STATUS_RET(executor.ExecuteAsync(args.inputs, args.input_desc), "Failed to execute partitioned call.");
   RECORD_MODEL_EXECUTION_EVENT(&context_, "[ExecuteAsync] End");
 
-  GE_CHK_STATUS_RET(executor.Synchronize(), "Failed to sync root graph.");
+  HYBRID_CHK_STATUS_RET(executor.Synchronize(), "Failed to sync root graph.");
   RECORD_MODEL_EXECUTION_EVENT(&context_, "[Synchronize] End");
 
-  GE_CHK_STATUS_RET(executor.GetOutputs(args.outputs, args.output_desc), "Failed to get outputs");
+  HYBRID_CHK_STATUS_RET(executor.GetOutputs(args.outputs, args.output_desc), "Failed to get outputs");
   RECORD_MODEL_EXECUTION_EVENT(&context_, "[GetOutput] End");
   return SUCCESS;
 }
@@ -82,7 +85,7 @@ Status HybridModelExecutor::ExecuteGraphInternal(SubgraphExecutor &executor,
 Status HybridModelExecutor::Cleanup() {
   GELOGD("Start to cleanup.");
   context_.callback_manager->Destroy();
-  RuntimeInferenceContext::DestroyContext(to_string(context_.session_id));
+  RuntimeInferenceContext::DestroyContext(std::to_string(context_.session_id));
   GELOGD("Cleanup successfully.");
   return SUCCESS;
 }
@@ -94,7 +97,9 @@ Status HybridModelExecutor::InitExecutionContext() {
 
   context_.stream = stream_;
   context_.model = model_;
+  context_.is_eos_ = false;
   context_.session_id = ::ge::GetContext().SessionId();
+  context_.ge_context = &GetThreadLocalContext();
   GELOGD("session id from model = %lu, from context = %lu", model_->GetSessionId(), context_.session_id);
   context_.allocator = NpuMemoryAllocator::GetAllocator(device_id_);
   GE_CHECK_NOTNULL(context_.allocator);
diff --git a/ge/hybrid/executor/hybrid_model_executor.h b/ge/hybrid/executor/hybrid_model_executor.h
index 04aef6a5..6299d4ff 100644
--- a/ge/hybrid/executor/hybrid_model_executor.h
+++ b/ge/hybrid/executor/hybrid_model_executor.h
@@ -31,6 +31,7 @@ class HybridModelExecutor {
     std::vector<ConstGeTensorDescPtr> input_desc;
     std::vector<TensorValue> outputs;
     std::vector<ConstGeTensorDescPtr> output_desc;
+    bool is_eos = false;
   };
 
   HybridModelExecutor(HybridModel *model, uint32_t device_id, rtStream_t stream);
diff --git a/ge/hybrid/executor/hybrid_profiler.cc b/ge/hybrid/executor/hybrid_profiler.cc
index 7228197f..336a633f 100644
--- a/ge/hybrid/executor/hybrid_profiler.cc
+++ b/ge/hybrid/executor/hybrid_profiler.cc
@@ -25,7 +25,7 @@ namespace ge {
 namespace hybrid {
 namespace {
 const int kMaxEvents = 10000;
-const int kEventDescMax = 256;
+const int kEventDescMax = 512;
 const int kMaxEventTypes = 8;
 const int kIndent = 8;
 }
diff --git a/ge/hybrid/executor/hybrid_profiler.h b/ge/hybrid/executor/hybrid_profiler.h
index 62ef9c73..94a042e4 100644
--- a/ge/hybrid/executor/hybrid_profiler.h
+++ b/ge/hybrid/executor/hybrid_profiler.h
@@ -33,7 +33,7 @@ class HybridProfiler {
     SHAPE_INFERENCE,
     COMPILE,
     EXECUTION,
-    CALLBACK,
+    CALLBACKS
   };
 
   struct Event {
diff --git a/ge/hybrid/executor/node_done_manager.cc b/ge/hybrid/executor/node_done_manager.cc
index c0b0b17b..f0d4324a 100644
--- a/ge/hybrid/executor/node_done_manager.cc
+++ b/ge/hybrid/executor/node_done_manager.cc
@@ -21,7 +21,7 @@
 namespace ge {
 namespace hybrid {
 namespace {
-constexpr int kDefaultWaitTimeoutInSec = 60 * 10;
+constexpr int kDefaultWaitTimeoutInSec = 600;
 }
 bool NodeDoneManager::Cond::Await() {
   std::unique_lock<std::mutex> lk(cond_mu_);
diff --git a/ge/hybrid/executor/node_state.cc b/ge/hybrid/executor/node_state.cc
index 033c5304..171ddaf3 100644
--- a/ge/hybrid/executor/node_state.cc
+++ b/ge/hybrid/executor/node_state.cc
@@ -18,6 +18,7 @@
 #include <chrono>
 #include "framework/common/debug/log.h"
 #include "graph/compute_graph.h"
+#include "graph/utils/tensor_utils.h"
 #include "hybrid_execution_context.h"
 #include "subgraph_context.h"
 
@@ -35,30 +36,32 @@ ShapeInferenceState::ShapeInferenceState(const NodeItem &node_item) : node_item(
          this->num_pending_shapes_);
 }
 
-Status ShapeInferenceState::UpdateInputShape(int idx,
-                                             const GeShape &ori_shape,
-                                             const GeShape &shape) {
+Status ShapeInferenceState::UpdateInputShape(int idx, const GeTensorDesc &target) {
   if (node_item.IsInputShapeStatic(idx)) {
     GELOGD("[%s] Trying to update static shape, idx = %d. old shape = [%s], new shape = [%s]",
            node_item.NodeName().c_str(),
            idx,
            node_item.MutableInputDesc(idx)->GetShape().ToString().c_str(),
-           shape.ToString().c_str());
+           target.GetShape().ToString().c_str());
     return SUCCESS;
   }
 
-  GELOGD("[%s] Update input shape [%d] with Shape: [%s] and OriginalShape: [%s]",
+  int64_t tensor_size = -1;
+  (void) TensorUtils::GetSize(target, tensor_size);
+  GELOGD("[%s] Update input shape [%d] with Shape: [%s] and OriginalShape: [%s], size = %ld",
          node_item.NodeName().c_str(),
          idx,
-         shape.ToString().c_str(),
-         ori_shape.ToString().c_str());
+         target.GetShape().ToString().c_str(),
+         target.GetOriginShape().ToString().c_str(),
+         tensor_size);
 
   std::lock_guard<std::mutex> lk(mu_);
   auto tensor_desc = node_item.MutableInputDesc(idx);
   GE_CHECK_NOTNULL(tensor_desc);
-  tensor_desc->SetShape(shape);
-  tensor_desc->SetOriginShape(ori_shape);
-  if (--num_pending_shapes_ == 0) {
+  tensor_desc->SetShape(target.GetShape());
+  tensor_desc->SetOriginShape(target.GetOriginShape());
+  (void) TensorUtils::SetSize(*tensor_desc, tensor_size);
+  if (--num_pending_shapes_ <= 0) {
     ready_cv_.notify_all();
   }
 
@@ -95,6 +98,11 @@ Status ShapeInferenceState::AwaitShapesReady(const GraphExecutionContext &contex
         break;
       }
 
+      if (context.is_eos_) {
+        GELOGD("[%s] Await pending shape cancelled due to end of sequence", node_item.NodeName().c_str());
+        return END_OF_SEQUENCE;
+      }
+
       if (context.GetStatus() != SUCCESS) {
         GELOGE(FAILED, "[%s] Await pending shape cancelled", node_item.NodeName().c_str());
         break;
@@ -110,24 +118,25 @@ Status ShapeInferenceState::AwaitShapesReady(const GraphExecutionContext &contex
   for (auto &p : shape_futures) {
     auto idx = p.first;
     auto &future = p.second;
-    GeShape shape;
-    GeShape ori_shape;
     RECORD_SHAPE_INFERENCE_EVENT(&context, node_item.NodeName().c_str(), "[AwaitShape] [idx = %u] Start", idx);
-    GE_CHK_STATUS_RET(future.Get(ori_shape, shape),
-                      "[%s] Get shape failed. index = %u",
-                      node_item.NodeName().c_str(),
-                      idx);
+    GeTensorDescPtr src_tensor_desc;
+    GE_CHK_STATUS_RET_NOLOG(future.GetTensorDesc(src_tensor_desc));
+    GE_CHECK_NOTNULL(src_tensor_desc);
     RECORD_SHAPE_INFERENCE_EVENT(&context, node_item.NodeName().c_str(), "[AwaitShape] [idx = %u] End", idx);
 
-    GELOGD("[%s] Update input shape [%u] with shape: [%s] and ori_shape: [%s]",
-           node_item.NodeName().c_str(),
-           idx,
-           shape.ToString().c_str(),
-           ori_shape.ToString().c_str());
     auto input_desc = node_item.MutableInputDesc(idx);
     GE_CHECK_NOTNULL(input_desc);
-    input_desc->SetShape(std::move(shape));
-    input_desc->SetOriginShape(ori_shape);
+    int64_t tensor_size = -1;
+    (void) TensorUtils::GetSize(*src_tensor_desc, tensor_size);
+    GELOGD("[%s] Update input shape [%u] with shape: [%s] and ori_shape: [%s], index = %zu",
+           node_item.NodeName().c_str(),
+           idx,
+           src_tensor_desc->GetShape().ToString().c_str(),
+           src_tensor_desc->GetOriginShape().ToString().c_str(),
+           tensor_size);
+    input_desc->SetShape(src_tensor_desc->GetShape());
+    input_desc->SetOriginShape(src_tensor_desc->GetOriginShape());
+    (void) TensorUtils::SetSize(*input_desc, tensor_size);
   }
 
   return SUCCESS;
@@ -153,10 +162,11 @@ Status NodeState::AwaitInputTensors(GraphExecutionContext &context) const {
                            node_item_->NodeName().c_str(),
                            "[AwaitNodeDone] [%s] Start",
                            src_node->GetName().c_str());
-    if (!subgraph_context_->Await(src_node)) {
-      GELOGE(INTERNAL_ERROR, "[%s] Await node [%s] failed.", GetName().c_str(), src_node->GetName().c_str());
-      return INTERNAL_ERROR;
-    }
+
+    HYBRID_CHK_STATUS_RET(subgraph_context_->Await(src_node),
+                          "[%s] Await node [%s] failed.",
+                          GetName().c_str(),
+                          src_node->GetName().c_str());
 
     RECORD_EXECUTION_EVENT(&context,
                            node_item_->NodeName().c_str(),
@@ -180,15 +190,18 @@ Status NodeState::WaitForPrepareDone() {
 
 Status ShapeFuture::Get(GeShape &ori_shape, GeShape &shape) {
   GELOGD("Start to wait node: %s for getting shape", src_node_->GetName().c_str());
-  if (!subgraph_context_->Await(src_node_)) {
-    GELOGE(INTERNAL_ERROR, "cancelled");
-    return INTERNAL_ERROR;
-  }
-
+  HYBRID_CHK_STATUS_RET(subgraph_context_->Await(src_node_), "cancelled");
   shape = src_node_->GetOpDesc()->MutableOutputDesc(src_index_)->MutableShape();
   ori_shape = src_node_->GetOpDesc()->MutableOutputDesc(src_index_)->GetOriginShape();
   GELOGD("Get shape from %s:%u. shape = [%s]", src_node_->GetName().c_str(), src_index_, shape.ToString().c_str());
   return SUCCESS;
 }
+
+Status ShapeFuture::GetTensorDesc(GeTensorDescPtr &tensor_desc) {
+  GELOGD("Start to wait node: %s for getting shape", src_node_->GetName().c_str());
+  HYBRID_CHK_STATUS_RET(subgraph_context_->Await(src_node_), "cancelled");
+  tensor_desc = src_node_->GetOpDesc()->MutableOutputDesc(src_index_);
+  return SUCCESS;
+}
 }  // namespace hybrid
 }  // namespace ge
diff --git a/ge/hybrid/executor/node_state.h b/ge/hybrid/executor/node_state.h
index 48b2ed72..02a362b4 100644
--- a/ge/hybrid/executor/node_state.h
+++ b/ge/hybrid/executor/node_state.h
@@ -27,7 +27,7 @@
 namespace ge {
 namespace hybrid {
 class NodeTask;
-class GraphExecutionContext;
+struct GraphExecutionContext;
 class SubgraphContext;
 
 class ShapeFuture {
@@ -35,6 +35,7 @@ class ShapeFuture {
   ShapeFuture(NodePtr src_node, uint32_t src_index, SubgraphContext *subgraph_context);
   ~ShapeFuture() = default;
   Status Get(GeShape &ori_shape, GeShape &shape);
+  Status GetTensorDesc(GeTensorDescPtr &tensor_desc);
 
  private:
   NodePtr src_node_;
@@ -45,7 +46,7 @@ class ShapeFuture {
 struct ShapeInferenceState {
   explicit ShapeInferenceState(const NodeItem &node_item);
 
-  Status UpdateInputShape(int idx, const GeShape &ori_shape, const GeShape &shape);
+  Status UpdateInputShape(int idx, const GeTensorDesc &tensor_desc);
 
   void UpdateInputShapeFuture(int idx, ShapeFuture &&future);
 
diff --git a/ge/hybrid/executor/subgraph_context.cc b/ge/hybrid/executor/subgraph_context.cc
index 923c2aa3..0fa112a4 100644
--- a/ge/hybrid/executor/subgraph_context.cc
+++ b/ge/hybrid/executor/subgraph_context.cc
@@ -17,11 +17,12 @@
 #include "subgraph_context.h"
 
 #include "common/debug/log.h"
+#include "hybrid/executor/hybrid_model_executor.h"
 
 namespace ge {
 namespace hybrid {
-SubgraphContext::SubgraphContext(const GraphItem *graph_item) : graph_item_(graph_item) {
-
+SubgraphContext::SubgraphContext(const GraphItem *graph_item, const GraphExecutionContext *execution_context)
+    : graph_item_(graph_item), execution_context_(execution_context) {
 }
 
 Status SubgraphContext::Init() {
@@ -111,12 +112,22 @@ Status SubgraphContext::GetOutputs(std::vector<TensorValue> &outputs) {
   return SUCCESS;
 }
 
-bool SubgraphContext::Await(const NodePtr &node) {
-  return node_done_manager_.Await(node);
+Status SubgraphContext::Await(const NodePtr &node) {
+  if (node_done_manager_.Await(node)) {
+    return SUCCESS;
+  }
+
+  if (execution_context_->is_eos_) {
+    return END_OF_SEQUENCE;
+  }
+
+  return FAILED;
 }
 
 void SubgraphContext::OnError(Status error) {
-  GELOGE(error, "[%s] Error occurred while executing graph.", graph_item_->GetName().c_str());
+  if (error != END_OF_SEQUENCE) {
+    GELOGE(error, "[%s] Error occurred while executing graph.", graph_item_->GetName().c_str());
+  }
   node_done_manager_.Destroy();
 }
 
diff --git a/ge/hybrid/executor/subgraph_context.h b/ge/hybrid/executor/subgraph_context.h
index b86765f7..8ce33f23 100644
--- a/ge/hybrid/executor/subgraph_context.h
+++ b/ge/hybrid/executor/subgraph_context.h
@@ -20,6 +20,7 @@
 #include <vector>
 
 #include "hybrid/common/tensor_value.h"
+#include "hybrid/executor/hybrid_execution_context.h"
 #include "hybrid/executor/node_state.h"
 #include "hybrid/executor/node_done_manager.h"
 #include "hybrid/model/graph_item.h"
@@ -29,7 +30,7 @@ namespace ge {
 namespace hybrid {
 class SubgraphContext {
  public:
-  explicit SubgraphContext(const GraphItem *graph_item);
+  explicit SubgraphContext(const GraphItem *graph_item, const GraphExecutionContext *execution_context);
   ~SubgraphContext() = default;
 
   Status Init();
@@ -43,12 +44,13 @@ class SubgraphContext {
   Status GetInput(int index, TensorValue &tensor);
   Status GetOutputs(std::vector<TensorValue> &outputs);
 
-  bool Await(const NodePtr &node);
+  Status Await(const NodePtr &node);
   void NodeDone(const NodePtr &node);
 
  private:
   friend class TaskContext;
   const GraphItem *graph_item_;
+  const GraphExecutionContext *execution_context_;
   std::mutex mu_;
   std::vector<TensorValue> all_inputs_;
   std::vector<TensorValue> all_outputs_;
diff --git a/ge/hybrid/executor/subgraph_executor.cc b/ge/hybrid/executor/subgraph_executor.cc
index 76a6cc37..f7b063c7 100644
--- a/ge/hybrid/executor/subgraph_executor.cc
+++ b/ge/hybrid/executor/subgraph_executor.cc
@@ -40,7 +40,7 @@ SubgraphExecutor::~SubgraphExecutor() {
 
 Status SubgraphExecutor::Init(const std::vector<TensorValue> &inputs,
                               const std::vector<ConstGeTensorDescPtr> &input_desc) {
-  subgraph_context_.reset(new(std::nothrow)SubgraphContext(graph_item_));
+  subgraph_context_.reset(new(std::nothrow)SubgraphContext(graph_item_, context_));
   GE_CHECK_NOTNULL(subgraph_context_);
   GE_CHK_STATUS_RET(subgraph_context_->Init(), "[%s] Failed to init subgraph context.", graph_item_->GetName().c_str());
 
@@ -93,9 +93,10 @@ Status SubgraphExecutor::InitInputsForUnknownShape(const std::vector<TensorValue
       GELOGD("[%s] Start to update input[%zu] for subgraph data node.", graph_item_->GetName().c_str(), i);
       GE_CHECK_LE(i + 1, input_desc.size());
       const auto &tensor_desc = input_desc[i];
+      GE_CHECK_NOTNULL(tensor_desc);
       auto node_state = subgraph_context_->GetOrCreateNodeState(input_node);
       GE_CHECK_NOTNULL(node_state);
-      node_state->GetShapeInferenceState().UpdateInputShape(0, tensor_desc->GetOriginShape(), tensor_desc->GetShape());
+      node_state->GetShapeInferenceState().UpdateInputShape(0, *tensor_desc);
     }
   }
 
@@ -138,7 +139,7 @@ Status SubgraphExecutor::ExecuteAsync(const std::vector<TensorValue> &inputs,
     return ExecuteAsyncForKnownShape(inputs);
   }
 
-  GE_CHK_STATUS_RET(ScheduleTasks(), "[%s] Failed to execute tasks.", graph_item_->GetName().c_str());
+  HYBRID_CHK_STATUS_RET(ScheduleTasks(), "[%s] Failed to execute tasks.", graph_item_->GetName().c_str());
   GELOGD("[%s] Done executing subgraph successfully.", graph_item_->GetName().c_str());
   return SUCCESS;
 }
@@ -162,10 +163,10 @@ Status SubgraphExecutor::ExecuteAsyncForKnownShape(const std::vector<TensorValue
   known_shape_task_context_ = TaskContext::Create(*node_item, context_, subgraph_context_.get());
   GE_CHECK_NOTNULL(known_shape_task_context_);
 
-  GE_CHK_STATUS_RET(ExecutionEngine::ExecuteAsync(*node_state, known_shape_task_context_, *context_),
-                    "[%s] Failed to execute node [%s] for known subgraph.",
-                    graph_item_->GetName().c_str(),
-                    known_shape_task_context_->GetNodeName());
+  HYBRID_CHK_STATUS_RET(ExecutionEngine::ExecuteAsync(*node_state, known_shape_task_context_, *context_),
+                        "[%s] Failed to execute node [%s] for known subgraph.",
+                        graph_item_->GetName().c_str(),
+                        known_shape_task_context_->GetNodeName());
 
   GELOGD("[%s] Done execute non-dynamic subgraph successfully.", graph_item_->GetName().c_str());
   return SUCCESS;
@@ -210,35 +211,34 @@ Status SubgraphExecutor::PrepareNodes() {
     GE_CHECK_NOTNULL(node_state);
     auto p_node_state = node_state.get();
 
-    if (node_item.node_type == NETOUTPUT) {
-      // Wait for all inputs become valid
-      // after PrepareNodes returned. all output tensors and shapes are valid
-      GE_CHK_STATUS_RET_NOLOG(p_node_state->GetShapeInferenceState().AwaitShapesReady(*context_));
-      GE_CHK_STATUS_RET_NOLOG(p_node_state->AwaitInputTensors(*context_));
-      continue;
-    }
+    if (node_item.node_type != NETOUTPUT) {
+      // only do shape inference and compilation for nodes with dynamic shapes.
+      if (node_item.is_dynamic) {
+        auto prepare_future = pre_run_pool_.commit([this, p_node_state]() -> Status {
+          GetContext().SetSessionId(context_->session_id);
+          GE_CHK_STATUS_RET_NOLOG(InferShape(shape_inference_engine_.get(), *p_node_state));
+          return PrepareForExecution(context_, *p_node_state);
+        });
 
-    // only do shape inference and compilation for nodes with dynamic shapes.
-    if (node_item.is_dynamic) {
-      auto prepare_future = pre_run_pool_.commit([this, p_node_state]() -> Status {
-        GetContext().SetSessionId(context_->session_id);
-        GE_CHK_STATUS_RET_NOLOG(InferShape(shape_inference_engine_.get(), *p_node_state));
-        return PrepareForExecution(context_, *p_node_state);
-      });
-
-      p_node_state->SetPrepareFuture(std::move(prepare_future));
-    } else {
-      GELOGD("[%s] Skipping shape inference and compilation for node with static shape.", node_item.NodeName().c_str());
-      if (node_item.kernel_task == nullptr) {
-        GELOGW("[%s] Node of static shape got no task.", node_item.NodeName().c_str());
-        GE_CHK_STATUS_RET(TaskCompileEngine::Compile(*p_node_state, context_),
-                          "[%s] Failed to create task.", p_node_state->GetName().c_str());
+        p_node_state->SetPrepareFuture(std::move(prepare_future));
       } else {
-        node_state->SetKernelTask(node_item.kernel_task);
+        GELOGD("[%s] Skipping shape inference and compilation for node with static shape.",
+               node_item.NodeName().c_str());
+        if (node_item.kernel_task == nullptr) {
+          GELOGW("[%s] Node of static shape got no task.", node_item.NodeName().c_str());
+          GE_CHK_STATUS_RET(TaskCompileEngine::Compile(*p_node_state, context_),
+                            "[%s] Failed to create task.", p_node_state->GetName().c_str());
+        } else {
+          node_state->SetKernelTask(node_item.kernel_task);
+        }
       }
     }
 
     if (!ready_queue_.Push(p_node_state)) {
+      if (context_->is_eos_) {
+        GELOGD("Got end of sequence");
+        return SUCCESS;
+      }
       GELOGE(INTERNAL_ERROR, "[%s] Error occurs while launching tasks. quit from preparing nodes.",
              graph_item_->GetName().c_str());
       return INTERNAL_ERROR;
@@ -252,10 +252,10 @@ Status SubgraphExecutor::PrepareNodes() {
 
 Status SubgraphExecutor::InferShape(ShapeInferenceEngine *shape_inference_engine, NodeState &node_state) {
   const auto &node_item = *node_state.GetNodeItem();
-  GE_CHK_STATUS_RET(shape_inference_engine->InferShape(node_state),
-                    "[%s] Failed to InferShape.", node_state.GetName().c_str());
-  GE_CHK_STATUS_RET(shape_inference_engine->PropagateOutputShapes(node_item),
-                    "[%s] Failed to PropagateOutputShapes.", node_state.GetName().c_str());
+  HYBRID_CHK_STATUS_RET(shape_inference_engine->InferShape(node_state),
+                        "[%s] Failed to InferShape.", node_state.GetName().c_str());
+  HYBRID_CHK_STATUS_RET(shape_inference_engine->PropagateOutputShapes(node_item),
+                        "[%s] Failed to PropagateOutputShapes.", node_state.GetName().c_str());
   return SUCCESS;
 }
 
@@ -267,13 +267,6 @@ Status SubgraphExecutor::PrepareForExecution(GraphExecutionContext *ctx, NodeSta
   } else {
     node_state.SetKernelTask(node_item.kernel_task);
   }
-
-  GELOGD("[%s] Start to invoke CalcOpRunningParam.", node_item.NodeName().c_str());
-  RECORD_COMPILE_EVENT(ctx, node_item.NodeName().c_str(), "[CalcOpRunningParam] Start");
-  GE_CHK_STATUS_RET(NodeExecutorManager::GetInstance().CalcOpRunningParam(*node_item.node),
-                    "[%s] Failed to invoke CalcOpRunningParam.", node_item.NodeName().c_str());
-  RECORD_COMPILE_EVENT(ctx, node_item.NodeName().c_str(), "[CalcOpRunningParam] End");
-  GELOGD("[%s] Done invoking CalcOpRunningParam successfully.", node_item.NodeName().c_str());
   return SUCCESS;
 }
 
@@ -290,6 +283,15 @@ Status SubgraphExecutor::LaunchTasks() {
       return SUCCESS;
     }
 
+    if (node_state->GetType() == NETOUTPUT) {
+      // Wait for all inputs become valid
+      // after PrepareNodes returned. all output tensors and shapes are valid
+      GE_CHK_STATUS_RET_NOLOG(node_state->GetShapeInferenceState().AwaitShapesReady(*context_));
+      GE_CHK_STATUS_RET_NOLOG(node_state->AwaitInputTensors(*context_));
+      GELOGD("[%s] Done executing node successfully.", node_state->GetName().c_str());
+      continue;
+    }
+
     GE_CHK_STATUS_RET_NOLOG(node_state->WaitForPrepareDone());
 
     GELOGD("[%s] Start to execute.", node_state->GetName().c_str());
@@ -297,10 +299,9 @@ Status SubgraphExecutor::LaunchTasks() {
     GE_CHECK_NOTNULL(task_context);
     task_context->SetForceInferShape(force_infer_shape_);
     auto shared_task_context = std::shared_ptr<TaskContext>(task_context.release());
-    GE_CHK_STATUS_RET(ExecutionEngine::ExecuteAsync(*node_state, shared_task_context, *context_),
-                      "[%s] Execute node failed.",
-                      node_state->GetName().c_str());
-
+    HYBRID_CHK_STATUS_RET(ExecutionEngine::ExecuteAsync(*node_state, shared_task_context, *context_),
+                          "[%s] Execute node failed.",
+                          node_state->GetName().c_str());
     GELOGD("[%s] Done executing node successfully.", node_state->GetName().c_str());
   }
 }
@@ -317,7 +318,6 @@ Status SubgraphExecutor::ScheduleTasks() {
   GELOGD("[%s] Start to execute subgraph.", graph_item_->GetName().c_str());
   auto ret = LaunchTasks();
   if (ret != SUCCESS) {
-    GELOGE(ret, "[%s] Failed to execute subgraph.", graph_item_->GetName().c_str());
     subgraph_context_->OnError(ret);
     context_->SetErrorCode(ret);
     ready_queue_.Stop();
@@ -356,7 +356,7 @@ Status SubgraphExecutor::GetOutputs(vector<TensorValue> &outputs, std::vector<Co
 
 Status SubgraphExecutor::Synchronize() {
   GELOGD("[%s] Synchronize start.", graph_item_->GetName().c_str());
-  GE_CHK_RT_RET(rtStreamSynchronize(context_->stream));
+  GE_CHK_STATUS_RET_NOLOG(context_->Synchronize(context_->stream));
   GELOGD("[%s] Done synchronizing successfully.", graph_item_->GetName().c_str());
   return SUCCESS;
 }
diff --git a/ge/hybrid/executor/worker/execution_engine.cc b/ge/hybrid/executor/worker/execution_engine.cc
index e6729352..b5de2a70 100755
--- a/ge/hybrid/executor/worker/execution_engine.cc
+++ b/ge/hybrid/executor/worker/execution_engine.cc
@@ -20,12 +20,9 @@
 #include "graph/utils/tensor_adapter.h"
 #include "graph/debug/ge_attr_define.h"
 #include "hybrid/node_executor/node_executor.h"
-#include "common/dump/dump_manager.h"
+#include "hybrid/executor//worker//shape_inference_engine.h"
 #include "common/dump/dump_op.h"
-#include "common/types.h"
-#include "common/ge_types.h"
 #include "common/profiling/profiling_manager.h"
-#include "runtime/base.h"
 
 namespace ge {
 namespace hybrid {
@@ -154,18 +151,19 @@ Status NodeDoneCallback::GetTaskDescInfo(const NodePtr node, const HybridModel *
   GE_CHECK_NOTNULL(node);
   GE_CHECK_NOTNULL(model);
 
+  // only report aicpu and aicore node
+  bool is_profiling_report = context_->GetNodeItem().is_profiling_report;
+  if (!is_profiling_report) {
+    GELOGD("Node[%s] is not aicore or aicpu, and no need to report data.", node->GetName().c_str());
+    return SUCCESS;
+  }
+
   GELOGD("GetTaskDescInfo of node [%s] start.", node->GetName().c_str());
   auto op_desc = node->GetOpDesc();
   std::string op_name = op_desc->GetName();
   std::string dynamic_model_name = model->GetModelName();
-
-  uint32_t task_id = 0;
-  uint32_t stream_id = 0;
-  if (rtGetTaskIdAndStreamID(&task_id, &stream_id) != RT_ERROR_NONE) {
-    GELOGE(PARAM_INVALID, "Get task_id and stream_id failed.");
-    return PARAM_INVALID;
-  }
-
+  uint32_t task_id = context_->GetTaskId();
+  uint32_t stream_id = context_->GetStreamId();
   TaskDescInfo tmp_task_desc_info;
   tmp_task_desc_info.model_name = dynamic_model_name;
   tmp_task_desc_info.op_name = op_name;
@@ -177,6 +175,8 @@ Status NodeDoneCallback::GetTaskDescInfo(const NodePtr node, const HybridModel *
   }
   tmp_task_desc_info.task_id = task_id;
   tmp_task_desc_info.stream_id = stream_id;
+  tmp_task_desc_info.shape_type = "dynamic";
+  tmp_task_desc_info.cur_iter_num = graph_context_->iteration;
   GELOGD("GetTaskDescInfo of node [%s] end, task_id[%u], stream_id[%u]",
          node->GetName().c_str(), task_id, stream_id);
   task_desc_info.emplace_back(tmp_task_desc_info);
@@ -221,6 +221,8 @@ Status NodeDoneCallback::GetGraphDescInfo(const NodePtr node, const HybridModel
       tmp_compute_graph_info.output_shape.emplace_back(output_desc.GetShape().GetDims());
       tmp_compute_graph_info.output_data_type.emplace_back(output_desc.GetDataType());
     }
+    tmp_compute_graph_info.task_id = context_->GetTaskId();
+    tmp_compute_graph_info.stream_id = context_->GetStreamId();
     compute_graph_info.emplace_back(tmp_compute_graph_info);
     GELOGD("GetComputeGraphInfo of node [%s] end.", node->GetName().c_str());
   }
@@ -260,8 +262,7 @@ Status NodeDoneCallback::ProfilingReport() {
   }
 
   auto &profiling_manager = ProfilingManager::Instance();
-  profiling_manager.ReportProfilingData(model->GetModelId(), task_desc_info, compute_graph_info,
-                                        !profiling_manager.IsAclApiMode());
+  profiling_manager.ReportProfilingData(model->GetModelId(), task_desc_info, compute_graph_info);
   return SUCCESS;
 }
 
@@ -349,6 +350,10 @@ Status NodeDoneCallback::OnNodeDone() {
   }
 
   GE_CHK_STATUS_RET_NOLOG(PrepareConstInputs(node_item));
+  if (node_item.shape_inference_type == DEPEND_SHAPE_RANGE || node_item.shape_inference_type == DEPEND_COMPUTE) {
+    // update output tensor sizes
+    GE_CHK_STATUS_RET_NOLOG(ShapeInferenceEngine::CalcOutputTensorSizes(node_item));
+  }
   // PropagateOutputs for type == DEPEND_COMPUTE
   if (node_item.shape_inference_type == DEPEND_COMPUTE) {
     if (graph_context_->trace_enabled) {
@@ -403,9 +408,9 @@ Status ExecutionEngine::DoExecuteAsync(NodeState &node_state,
 
   // Wait for dependent nodes(DEPEND_COMPUTE), so that the input tensors are valid.
   RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[AwaitDependents] Start");
-  GE_CHK_STATUS_RET(node_state.AwaitInputTensors(context),
-                    "[%s] Failed to wait for dependent nodes.",
-                    node_state.GetName().c_str());
+  HYBRID_CHK_STATUS_RET(node_state.AwaitInputTensors(context),
+                        "[%s] Failed to wait for dependent nodes.",
+                        node_state.GetName().c_str());
 
   const auto &node_item = *node_state.GetNodeItem();
   auto executor = node_item.node_executor;
@@ -435,9 +440,9 @@ Status ExecutionEngine::DoExecuteAsync(NodeState &node_state,
     });
   }
   RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[ExecuteTask] Start");
-  GE_CHK_STATUS_RET(node_item.node_executor->ExecuteTask(*task, task_context, callback),
-                    "[%s] Failed to execute task",
-                    node_state.GetName().c_str());
+  HYBRID_CHK_STATUS_RET(node_item.node_executor->ExecuteTask(*task, task_context, callback),
+                        "[%s] Failed to execute task",
+                        node_state.GetName().c_str());
   RECORD_EXECUTION_EVENT(&context, task_context.GetNodeName(), "[ExecuteTask] End");
 
   GELOGD("[%s] Done task launch successfully.", node_state.GetName().c_str());
diff --git a/ge/hybrid/executor/worker/shape_inference_engine.cc b/ge/hybrid/executor/worker/shape_inference_engine.cc
index bd429b21..56ae3ea3 100755
--- a/ge/hybrid/executor/worker/shape_inference_engine.cc
+++ b/ge/hybrid/executor/worker/shape_inference_engine.cc
@@ -17,9 +17,15 @@
 #include "hybrid/executor/worker/shape_inference_engine.h"
 #include "graph/shape_refiner.h"
 #include "graph/utils/node_utils.h"
+#include "graph/utils/tensor_utils.h"
+#include "graph/utils/type_utils.h"
+#include "common/math/math_util.h"
 #include "hybrid/node_executor/node_executor.h"
 
 namespace ge {
+namespace {
+const int kAlignment = 32;
+}
 namespace hybrid {
 ShapeInferenceEngine::ShapeInferenceEngine(GraphExecutionContext *execution_context, SubgraphContext *subgraph_context)
     : execution_context_(execution_context),
@@ -40,7 +46,9 @@ Status ShapeInferenceEngine::InferShape(NodeState &node_state) {
   }
 
   if (node_item.fused_subgraph != nullptr) {
-    return InferShapeForSubgraph(node_item, *node_item.fused_subgraph);
+    GE_CHK_STATUS_RET_NOLOG(InferShapeForSubgraph(node_item, *node_item.fused_subgraph));
+    GE_CHK_STATUS_RET_NOLOG(CalcOutputTensorSizes(node_item));
+    return SUCCESS;
   }
 
   // Skip shape inference for node of type DEPEND_COMPUTE
@@ -62,21 +70,16 @@ Status ShapeInferenceEngine::InferShape(NodeState &node_state) {
   {
     std::lock_guard<std::mutex> lk(mu_);
     RECORD_SHAPE_INFERENCE_EVENT(execution_context_, node_item.NodeName().c_str(), "[InferShapeAndType] Start");
-    GE_CHK_STATUS_RET(ShapeRefiner::InferShapeAndTypeForRunning(node_item.node, true), "Invoke InferShapeAndType failed.");
+    GE_CHK_STATUS_RET(ShapeRefiner::InferShapeAndTypeForRunning(node_item.node, true),
+                      "Invoke InferShapeAndType failed.");
     RECORD_SHAPE_INFERENCE_EVENT(execution_context_, node_item.NodeName().c_str(), "[InferShapeAndType] End");
   }
-  // Check again to make sure shape is valid after shape inference
-  if (node_item.shape_inference_type != DEPEND_SHAPE_RANGE) {
-    bool is_unknown_shape = false;
-    GE_CHK_STATUS_RET(NodeUtils::GetNodeUnknownShapeStatus(*node_item.node, is_unknown_shape),
-                      "Failed to get shape status. node = %s",
-                      node_item.NodeName().c_str());
 
-    GE_CHK_BOOL_RET_STATUS(!is_unknown_shape,
-                           INTERNAL_ERROR,
-                           "[%s] Shape is still unknown after shape inference.",
-                           node_item.NodeName().c_str());
-  }
+  // update output tensor sizes after shape inference
+  // error if shape is still unknown and not of type DEPEND_SHAPE_RANGE
+  RECORD_COMPILE_EVENT(execution_context_, node_item.NodeName().c_str(), "[CalcOpRunningParam] Start");
+  GE_CHK_STATUS_RET_NOLOG(CalcOutputTensorSizes(node_item, node_item.shape_inference_type == DEPEND_SHAPE_RANGE));
+  RECORD_COMPILE_EVENT(execution_context_, node_item.NodeName().c_str(), "[CalcOpRunningParam] End");
 
   GELOGD("[%s] [HybridTrace] After shape inference. Node = %s",
          node_item.NodeName().c_str(),
@@ -96,11 +99,7 @@ Status ShapeInferenceEngine::AwaitDependentNodes(NodeState &node_state) {
                                  node_item.NodeName().c_str(),
                                  "[AwaitNodeDone] [%s] Start",
                                  src_node->GetName().c_str());
-    if (!subgraph_context_->Await(src_node)) {
-      GELOGE(INTERNAL_ERROR, "[%s] Await node failed.", src_node->GetName().c_str());
-      return INTERNAL_ERROR;
-    }
-
+    HYBRID_CHK_STATUS_RET(subgraph_context_->Await(src_node), "[%s] Await node failed.", src_node->GetName().c_str());
     RECORD_SHAPE_INFERENCE_EVENT(execution_context_,
                                  node_item.NodeName().c_str(),
                                  "[AwaitNodeDone] [%s] End",
@@ -126,8 +125,6 @@ Status ShapeInferenceEngine::PropagateOutputShapes(const NodeItem &node_item) {
   // propagate each output
   for (int i = 0; i < node_item.num_outputs; ++i) {
     auto output_desc = node_item.op_desc->MutableOutputDesc(i);
-    const auto &shape = output_desc->MutableShape();
-    const auto &ori_shape = output_desc->GetOriginShape();
     auto &output_nodes = node_item.outputs[i];
 
     // propagate output to all sub-inputs
@@ -148,9 +145,7 @@ Status ShapeInferenceEngine::PropagateOutputShapes(const NodeItem &node_item) {
         infer_state.UpdateInputShapeFuture(dst_input_index_and_node.first,
                                            std::move(future));
       } else {
-        GE_CHK_STATUS_RET_NOLOG(infer_state.UpdateInputShape(dst_input_index_and_node.first,
-                                                             ori_shape,
-                                                             shape));
+        GE_CHK_STATUS_RET_NOLOG(infer_state.UpdateInputShape(dst_input_index_and_node.first, *output_desc));
       }
     }
   }
@@ -164,7 +159,7 @@ Status ShapeInferenceEngine::InferShapeForSubgraph(const NodeItem &node_item, co
   for (auto &it : fused_subgraph.input_mapping) {
     auto parent_tensor_desc = node_item.MutableInputDesc(it.first);
     GE_CHECK_NOTNULL(parent_tensor_desc);
-    GELOGD("Start to update shape by input[%u]", it.first);
+    GELOGD("Start to update shape by input[%d]", it.first);
     GELOGD("Update shape to [%s]", parent_tensor_desc->GetShape().ToString().c_str());
     GELOGD("Update original shape to [%s]", parent_tensor_desc->GetOriginShape().ToString().c_str());
     for (auto &tensor_desc : it.second) {
@@ -183,12 +178,12 @@ Status ShapeInferenceEngine::InferShapeForSubgraph(const NodeItem &node_item, co
   }
 
   for (auto &it : fused_subgraph.output_mapping) {
-    uint32_t parent_output_idx = it.first;
+    int parent_output_idx = it.first;
     const auto &op_desc = it.second;
     GELOGD("Update parent output[%d] by [%s]", parent_output_idx, op_desc->GetName().c_str());
     auto input_desc = op_desc->MutableInputDesc(0);
     GE_CHECK_NOTNULL(input_desc);
-    auto parent_output_tensor_desc = node_item.op_desc->MutableOutputDesc(parent_output_idx);
+    auto parent_output_tensor_desc = node_item.MutableOutputDesc(parent_output_idx);
     GE_CHECK_NOTNULL(parent_output_tensor_desc);
     GELOGD("Update shape to [%s]", input_desc->GetShape().ToString().c_str());
     GELOGD("Update original shape to [%s]", input_desc->GetOriginShape().ToString().c_str());
@@ -229,5 +224,92 @@ Status ShapeInferenceEngine::UpdatePeerNodeShape(const Node &node) {
   }
   return SUCCESS;
 }
+
+Status ShapeInferenceEngine::CanonicalizeShape(GeTensorDesc &tensor_desc,
+                                               std::vector<int64_t> &shape,
+                                               bool fallback_with_range) {
+  const auto &tensor_shape = tensor_desc.MutableShape();
+  if (tensor_shape.IsUnknownShape()) {
+    if (!fallback_with_range) {
+      GELOGE(INTERNAL_ERROR, "Output shape is still unknown after shape inference. shape = [%s]",
+             tensor_shape.ToString().c_str());
+      return INTERNAL_ERROR;
+    }
+
+    GELOGD("Calc output size by range");
+    std::vector<std::pair<int64_t, int64_t>> shape_range;
+    GE_CHK_GRAPH_STATUS_RET(tensor_desc.GetShapeRange(shape_range), "Failed to get shape range");
+    if (shape_range.size() != shape.size()) {
+      GELOGE(INTERNAL_ERROR, "Number of shape ranges (%zu) mismatches that of dims (%zu)",
+             shape_range.size(),
+             shape.size());
+      return INTERNAL_ERROR;
+    }
+
+    for (size_t dim_index = 0; dim_index < shape.size(); ++dim_index) {
+      if (shape[dim_index] == ge::UNKNOWN_DIM) {
+        shape[dim_index] = shape_range[dim_index].second;
+      }
+    }
+
+    GELOGD("After canonicalization, shape = [%s], before = [%s]",
+           GeShape(shape).ToString().c_str(),
+           tensor_shape.ToString().c_str());
+  }
+
+  return SUCCESS;
+}
+
+Status ShapeInferenceEngine::CalcTensorSize(DataType data_type,
+                                            const std::vector<int64_t> &shape,
+                                            int64_t &tensor_size) {
+  GELOGD("To calc tensor size by shape = [%s]", GeShape(shape).ToString().c_str());
+  uint32_t type_size;
+  if (!TypeUtils::GetDataTypeLength(data_type, type_size)) {
+    GELOGE(INTERNAL_ERROR, "Failed to get data type size");
+    return INTERNAL_ERROR;
+  }
+
+  tensor_size = type_size;
+  for (const auto &dim : shape) {
+    GE_CHECK_GE(dim, 0);
+    GE_CHK_STATUS_RET(Int64MulCheckOverflow(tensor_size, dim),
+                      "Shape size overflow, shape = [%s]",
+                      GeShape(shape).ToString().c_str());
+    tensor_size *= dim;
+  }
+
+  GE_CHK_STATUS_RET(CheckInt64AddOverflow(tensor_size, kAlignment - 1),
+                    "Tensor size is too large: %ld, shape = [%s]",
+                    tensor_size,
+                    GeShape(shape).ToString().c_str());
+  tensor_size = (tensor_size + kAlignment - 1) / kAlignment * kAlignment;
+  return SUCCESS;
+}
+
+Status ShapeInferenceEngine::CalcOutputTensorSizes(const NodeItem &node_item, bool fallback_with_range) {
+  auto op_desc = node_item.GetOpDesc();
+  for (size_t output_index = 0; output_index < op_desc->GetOutputsSize(); ++output_index) {
+    auto tensor_desc = op_desc->MutableOutputDesc(output_index);
+    GE_CHECK_NOTNULL(tensor_desc);
+    const auto &shape = tensor_desc->MutableShape();
+    // modify on copy
+    auto dims = shape.GetDims();
+    GE_CHK_STATUS_RET(CanonicalizeShape(*tensor_desc, dims, fallback_with_range),
+                      "[%s] Failed to canonicalize shape for output %zu",
+                      node_item.NodeName().c_str(),
+                      output_index);
+
+    int64_t tensor_size;
+    GE_CHK_STATUS_RET(CalcTensorSize(tensor_desc->GetDataType(), dims, tensor_size),
+                      "[%s] Failed to calc tensor size for output %zu",
+                      node_item.NodeName().c_str(),
+                      output_index);
+    GELOGD("[%s] Tensor size of output %zu = %ld", node_item.NodeName().c_str(), output_index, tensor_size);
+    (void) TensorUtils::SetSize(*tensor_desc, tensor_size);
+  }
+
+  return SUCCESS;
+}
 }  // namespace hybrid
 }  // namespace ge
diff --git a/ge/hybrid/executor/worker/shape_inference_engine.h b/ge/hybrid/executor/worker/shape_inference_engine.h
index 7bb9269c..b946577f 100644
--- a/ge/hybrid/executor/worker/shape_inference_engine.h
+++ b/ge/hybrid/executor/worker/shape_inference_engine.h
@@ -34,7 +34,11 @@ class ShapeInferenceEngine {
 
   Status PropagateOutputShapes(const NodeItem &node_item);
 
+  static Status CalcOutputTensorSizes(const NodeItem &node_item, bool fallback_with_range = false);
+
  private:
+  static Status CanonicalizeShape(GeTensorDesc &tensor_desc, std::vector<int64_t> &shape, bool fallback_with_range);
+  static Status CalcTensorSize(DataType data_type, const std::vector<int64_t> &shape, int64_t &tensor_size);
   static Status UpdatePeerNodeShape(const Node &node);
   Status AwaitDependentNodes(NodeState &node_state);
 
diff --git a/ge/hybrid/executor/worker/task_compile_engine.cc b/ge/hybrid/executor/worker/task_compile_engine.cc
index e2e94f66..f80374c6 100755
--- a/ge/hybrid/executor/worker/task_compile_engine.cc
+++ b/ge/hybrid/executor/worker/task_compile_engine.cc
@@ -26,6 +26,9 @@ Status TaskCompileEngine::Compile(NodeState &node_state, GraphExecutionContext *
   RECORD_COMPILE_EVENT(context, node_item.NodeName().c_str(), "[Compile] Start");
   GE_CHK_RT_RET(rtCtxSetCurrent(context->rt_gen_context));
 
+  if (context->ge_context != nullptr) {
+    GetThreadLocalContext() = *context->ge_context;
+  }
   shared_ptr<NodeTask> kernel_task;
   auto ret = node_item.node_executor->CompileTask(*context->model, node_item.node, kernel_task);
   RECORD_COMPILE_EVENT(context, node_state.GetName().c_str(), "[Compile] End");
diff --git a/ge/hybrid/hybrid_davinci_model.cc b/ge/hybrid/hybrid_davinci_model.cc
index a491c9a5..7009331c 100755
--- a/ge/hybrid/hybrid_davinci_model.cc
+++ b/ge/hybrid/hybrid_davinci_model.cc
@@ -113,8 +113,8 @@ HybridDavinciModel::~HybridDavinciModel() {
   delete impl_;
 }
 
-unique_ptr<HybridDavinciModel> HybridDavinciModel::Create(const GeRootModelPtr &ge_root_model) {
-  auto instance = unique_ptr<HybridDavinciModel>(new (std::nothrow)HybridDavinciModel());
+std::unique_ptr<HybridDavinciModel> HybridDavinciModel::Create(const GeRootModelPtr &ge_root_model) {
+  auto instance = std::unique_ptr<HybridDavinciModel>(new (std::nothrow)HybridDavinciModel());
   if (instance != nullptr) {
     instance->impl_ = new (std::nothrow) HybridDavinciModel::Impl(ge_root_model);
     if (instance->impl_ != nullptr) {
diff --git a/ge/hybrid/model/graph_item.cc b/ge/hybrid/model/graph_item.cc
index 067070c5..4e3faf70 100644
--- a/ge/hybrid/model/graph_item.cc
+++ b/ge/hybrid/model/graph_item.cc
@@ -71,5 +71,8 @@ int GraphItem::GetParentOutputIndex(size_t index) const {
 const NodeItem *GraphItem::GetOutputNode() const {
   return output_node_;
 }
+const vector<std::pair<const NodeItem *, int>> &GraphItem::GetOutputEdges() const {
+  return output_edges_;
+}
 }  // namespace hybrid
 }  // namespace ge
diff --git a/ge/hybrid/model/graph_item.h b/ge/hybrid/model/graph_item.h
index 64d809ee..6fab9b44 100644
--- a/ge/hybrid/model/graph_item.h
+++ b/ge/hybrid/model/graph_item.h
@@ -29,7 +29,7 @@ class GraphItem {
   const vector<NodeItem *> &GetAllNodes() const;
   const vector<const NodeItem *> &GetInputNodes() const;
   Status GetOutputDescList(std::vector<ConstGeTensorDescPtr> &output_desc_list) const;
-
+  const vector<std::pair<const NodeItem *, int>> &GetOutputEdges() const;
   int TotalInputs() const {
     return total_inputs_;
   }
diff --git a/ge/hybrid/model/hybrid_model.cc b/ge/hybrid/model/hybrid_model.cc
index feb6757b..91b6a549 100644
--- a/ge/hybrid/model/hybrid_model.cc
+++ b/ge/hybrid/model/hybrid_model.cc
@@ -176,20 +176,9 @@ Status HybridModel::GetInputOutputDescInfo(vector<InputOutputDescInfo> &input_de
   return SUCCESS;
 }
 
-void HybridModel::SetInputDimsAndShapeRangesInfo(const vector<int64_t> &model_input_dims, std::vector<std::pair<int64_t,int64_t>> &shape_ranges,
-                                                 Format &format, InputOutputDescInfo &input) {
-  uint32_t n, c, h, w;
-  n = format == FORMAT_NHWC ? NHWC_DIM_N : NCHW_DIM_N;
-  c = format == FORMAT_NHWC ? NHWC_DIM_C : NCHW_DIM_C;
-  h = format == FORMAT_NHWC ? NHWC_DIM_H : NCHW_DIM_H;
-  w = format == FORMAT_NHWC ? NHWC_DIM_W : NCHW_DIM_W;
-
-  if (model_input_dims.size() == static_cast<size_t>(NORMAL_TENSOR_SIZE)) {
-    input.shape_info.num = model_input_dims[n];
-    input.shape_info.height = model_input_dims[h];
-    input.shape_info.width = model_input_dims[w];
-    input.shape_info.channel = model_input_dims[c];
-  }
+void HybridModel::SetInputDimsAndShapeRangesInfo(const vector<int64_t> &model_input_dims,
+                                                 std::vector<std::pair<int64_t, int64_t>> &shape_ranges,
+                                                 InputOutputDescInfo &input) {
   for (auto model_input_dim : model_input_dims) {
     input.shape_info.dims.push_back(model_input_dim);
   }
@@ -197,25 +186,25 @@ void HybridModel::SetInputDimsAndShapeRangesInfo(const vector<int64_t> &model_in
   return;
 }
 
-void HybridModel::CreateInputDimsInfo(const OpDescPtr &op_desc, Format format, InputOutputDescInfo &input) {
+void HybridModel::CreateInputDimsInfo(const OpDescPtr &op_desc, InputOutputDescInfo &input) {
   std::vector<std::pair<int64_t,int64_t>> shape_ranges;
   if (is_new_model_desc_ && op_desc->HasAttr(ATTR_NAME_INPUT_DIMS)) {
     // When static aipp is set, need to get the model input dims which processed by aipp
     vector<int64_t> model_input_dims;
     (void)AttrUtils::GetListInt(op_desc, ATTR_NAME_INPUT_DIMS, model_input_dims);
-    SetInputDimsAndShapeRangesInfo(model_input_dims, shape_ranges, format, input);
+    SetInputDimsAndShapeRangesInfo(model_input_dims, shape_ranges, input);
     return;
   }
   // judge if this data is linked dynamic aipp first, multiply batch has been considered
   if (op_desc->HasAttr("_dynamic_aipp_input_dims")) {
     vector<int64_t> dynamic_aipp_input_dims;
     (void)AttrUtils::GetListInt(op_desc, "_dynamic_aipp_input_dims", dynamic_aipp_input_dims);
-    SetInputDimsAndShapeRangesInfo(dynamic_aipp_input_dims, shape_ranges, format, input);
+    SetInputDimsAndShapeRangesInfo(dynamic_aipp_input_dims, shape_ranges, input);
     return;
   } else {
     vector<int64_t> input_dims = op_desc->GetInputDescPtr(0)->GetShape().GetDims();
     op_desc->GetInputDescPtr(0)->GetShapeRange(shape_ranges);
-    SetInputDimsAndShapeRangesInfo(input_dims, shape_ranges, format, input);
+    SetInputDimsAndShapeRangesInfo(input_dims, shape_ranges, input);
     return;
   }
 }
@@ -248,7 +237,7 @@ Status HybridModel::GetInputDescInfo(vector<InputOutputDescInfo> &input_desc, st
     // not support dynamic shape input for now, so input_size here will be not less than zero.
     input.size = input_size;
 
-    CreateInputDimsInfo(op_desc, format, input);
+    CreateInputDimsInfo(op_desc, input);
 
     formats.push_back(format);
     input_desc.push_back(input);
@@ -257,29 +246,15 @@ Status HybridModel::GetInputDescInfo(vector<InputOutputDescInfo> &input_desc, st
   return SUCCESS;
 }
 
-void HybridModel::CreateOutput(ConstGeTensorDescPtr &output_desc, InputOutputDescInfo &output_desc_info, uint32_t &format_result) {
+void HybridModel::CreateOutput(ConstGeTensorDescPtr &output_desc,
+                               InputOutputDescInfo &output_desc_info, uint32_t &format_result) {
   GE_IF_BOOL_EXEC(output_desc == nullptr, GELOGE(FAILED, "output desc ptr is nullptr"); return );
   Format format = output_desc->GetFormat();
   GeShape shape = output_desc->GetShape();
   std::vector<std::pair<int64_t,int64_t>> shape_ranges;
   output_desc->GetShapeRange(shape_ranges);
   DataType data_type = output_desc->GetDataType();
-  int64_t dims[] = {1, 1, 1, 1};
   format_result = format;
-  if (format == FORMAT_ND) {  // for ND tensor
-    for (size_t i = 0; i < shape.GetDimNum() && i < (sizeof(dims) / sizeof(dims[0])); i++) {
-      dims[i] = shape.GetDim(i);
-    }
-  } else {                                                                    // FOR FORMAT_NHWC or FORMAT_NCHW
-    dims[0] = shape.GetDim(format == FORMAT_NHWC ? NHWC_DIM_N : NCHW_DIM_N);  // 0: first dim
-    dims[1] = shape.GetDim(format == FORMAT_NHWC ? NHWC_DIM_C : NCHW_DIM_C);  // 1: second dim
-    dims[2] = shape.GetDim(format == FORMAT_NHWC ? NHWC_DIM_H : NCHW_DIM_H);  // 2: third dim
-    dims[3] = shape.GetDim(format == FORMAT_NHWC ? NHWC_DIM_W : NCHW_DIM_W);  // 3: forth dim
-  }
-  output_desc_info.shape_info.num = dims[0];      // 0: first dim
-  output_desc_info.shape_info.channel = dims[1];  // 1: second dim
-  output_desc_info.shape_info.height = dims[2];   // 2: third dim
-  output_desc_info.shape_info.width = dims[3];    // 3: forth dim
   if (format == FORMAT_FRACTAL_Z) {  // FraczToHWCK
     int64_t k = shape.GetDim(0);                                           // 0: first dim
     int64_t c = shape.GetDim(1);                                           // 1: second dim
@@ -310,7 +285,8 @@ void HybridModel::CreateOutput(ConstGeTensorDescPtr &output_desc, InputOutputDes
 
 Status HybridModel::GetOutputDescInfo(vector<InputOutputDescInfo> &output_desc, std::vector<uint32_t> &formats) {
   std::vector<ConstGeTensorDescPtr> output_desc_list;
-  GE_CHK_STATUS_RET(root_graph_item_->GetOutputDescList(output_desc_list), "get output desc info failed");  // output_desc_list contains vaild input desc
+  // output_desc_list contains vaild input desc
+  GE_CHK_STATUS_RET(root_graph_item_->GetOutputDescList(output_desc_list), "get output desc info failed");
 
   vector<std::string> out_node_names;
   (void)ge::AttrUtils::GetListStr(ge_root_model_->GetRootGraph(), ATTR_MODEL_OUT_NODES_NAME, out_node_names);
@@ -320,7 +296,8 @@ Status HybridModel::GetOutputDescInfo(vector<InputOutputDescInfo> &output_desc,
   GE_CHECK_NOTNULL(op_desc);
 
   auto out_size = static_cast<uint32_t>(op_desc->GetInputsSize());
-  GE_CHK_BOOL_RET_STATUS(out_size == output_desc_list.size(), FAILED, "output size[%u] not match output_desc_list size[%zu]", out_size, output_desc_list.size());
+  GE_CHK_BOOL_RET_STATUS(out_size == output_desc_list.size(),
+      FAILED, "output size[%u] not match output_desc_list size[%zu]", out_size, output_desc_list.size());
 
   for (uint32_t index = 0; index < out_size; ++index) {
     string output_name;
@@ -328,9 +305,11 @@ Status HybridModel::GetOutputDescInfo(vector<InputOutputDescInfo> &output_desc,
     std::vector<int64_t> src_index = op_desc->GetSrcIndex();
     if (out_size == out_node_names.size()) {
       bool contains_colon = out_node_names[index].find(":") != std::string::npos;
-      output_name = contains_colon ? out_node_names[index] : out_node_names[index] + ":" + std::to_string(src_index[index]);
+      output_name = contains_colon ? out_node_names[index] : out_node_names[index] +
+          ":" + std::to_string(src_index[index]);
     } else {
-      output_name = std::string("output_") + std::to_string(index) + "_" + src_name[index] + "_" + std::to_string(src_index[index]);
+      output_name = std::string("output_") + std::to_string(index) + "_" + src_name[index] +
+          "_" + std::to_string(src_index[index]);
     }
 
     InputOutputDescInfo output_desc_info;
@@ -343,5 +322,36 @@ Status HybridModel::GetOutputDescInfo(vector<InputOutputDescInfo> &output_desc,
   }
   return SUCCESS;
 }
+
+TensorValue *HybridModel::GetConstant(const NodePtr &node) const {
+  if (node == nullptr) {
+    GELOGE(PARAM_INVALID, "Param is null");
+    return nullptr;
+  }
+
+  auto it = constant_tensors_.find(node);
+  if (it == constant_tensors_.end()) {
+    GELOGD("constant not found, node name = [%s]", node->GetName().c_str());
+    return nullptr;
+  }
+
+  GELOGD("Got constant tensor, node name = [%s], tensor = %s",
+         node->GetName().c_str(),
+         it->second->DebugString().c_str());
+  return it->second.get();
+}
+
+TensorValue *HybridModel::GetTensor(const NodePtr &node) const {
+  if (node == nullptr) {
+    GELOGE(PARAM_INVALID, "Param is null");
+    return nullptr;
+  }
+
+  if (node->GetType() == CONSTANT) {
+    return GetConstant(node);
+  }
+
+  return GetVariable(node->GetName());
+}
 }  // namespace hybrid
 }  // namespace ge
diff --git a/ge/hybrid/model/hybrid_model.h b/ge/hybrid/model/hybrid_model.h
index 1ec2f8a8..e521b776 100644
--- a/ge/hybrid/model/hybrid_model.h
+++ b/ge/hybrid/model/hybrid_model.h
@@ -73,6 +73,8 @@ class HybridModel {
 
   NodePtr GetVariableNode(const string &name) const;
 
+  TensorValue* GetTensor(const NodePtr &node) const;
+
   const std::vector<domi::TaskDef>* GetTaskDefs(const NodePtr &node) const;
 
   const GraphItem *GetRootGraphItem() const;
@@ -100,24 +102,27 @@ class HybridModel {
 
   Status GetOutputDescInfo(vector<InputOutputDescInfo> &output_desc, std::vector<uint32_t> &formats);
 
-  void CreateInputDimsInfo(const OpDescPtr &op_desc, Format format, InputOutputDescInfo &input);
+  void CreateInputDimsInfo(const OpDescPtr &op_desc, InputOutputDescInfo &input);
 
   void SetModelDescVersion(bool is_new_model_desc) { is_new_model_desc_ = is_new_model_desc; }
 
-  void SetInputDimsAndShapeRangesInfo(const vector<int64_t> &model_input_dims, std::vector<std::pair<int64_t, int64_t>> &shape_ranges,
-                                      Format &format, InputOutputDescInfo &input);
+  void SetInputDimsAndShapeRangesInfo(const vector<int64_t> &model_input_dims,
+                                      std::vector<std::pair<int64_t, int64_t>> &shape_ranges,
+                                      InputOutputDescInfo &input);
 
  private:
   friend class HybridModelBuilder;
   friend class HybridModelAsyncExecutor;
 
+  TensorValue* GetConstant(const NodePtr &node) const;
+
   std::string model_name_;
   GeRootModelPtr ge_root_model_;
   std::map<uint32_t, NodeItem *> input_nodes_;
-  std::map<std::string, NodePtr> constant_op_nodes_;
   std::map<std::string, NodePtr> device_variable_nodes_; //lint !e148
   std::map<std::string, NodePtr> host_variable_nodes_; //lint !e148
   std::map<std::string, std::unique_ptr<TensorValue>> variable_tensors_;
+  std::map<NodePtr, std::unique_ptr<TensorValue>> constant_tensors_;
   std::map<NodePtr, std::vector<domi::TaskDef>> task_defs_;
   std::map<NodePtr, GeModelPtr> known_shape_sub_models_;
 
diff --git a/ge/hybrid/model/hybrid_model_builder.cc b/ge/hybrid/model/hybrid_model_builder.cc
index f4da3dcf..d1f61985 100755
--- a/ge/hybrid/model/hybrid_model_builder.cc
+++ b/ge/hybrid/model/hybrid_model_builder.cc
@@ -21,23 +21,34 @@
 #include "graph/build/memory/var_mem_assign_util.h"
 #include "graph/debug/ge_attr_define.h"
 #include "graph/load/new_model_manager/model_utils.h"
+#include "graph/load/new_model_manager/model_manager.h"
 #include "graph/manager/graph_var_manager.h"
 #include "graph/manager/host_mem_manager.h"
 #include "graph/manager/trans_var_data_utils.h"
+#include "graph/manager/graph_mem_allocator.h"
+#include "graph/manager/host_mem_allocator.h"
 #include "graph/utils/graph_utils.h"
 #include "hybrid/common/npu_memory_allocator.h"
 #include "hybrid/node_executor/node_executor.h"
-#include "framework/common/debug/ge_log.h"
-#include "graph/utils/attr_utils.h"
 
 namespace ge {
 namespace hybrid {
+using domi::LogTimeStampDef;
+using domi::TaskDef;
 namespace {
 const uint32_t kSubgraphIndex = 0U;
 const uint32_t kVarOutputIndex = 0U;
-const uint32_t kAlignment = 32;
+const uint64_t kProfilingFpStartLogid = 1U;
+const uint64_t kProfilingBpEndLogid = 2U;
+const uint64_t kProfilingIterEndLogid = 65535U;
 const int kBytes = 8;
 const char *const kOwnerGraphIsUnknown = "OwnerGraphIsUnknown";
+const char *const kProfilingGraph = "ProfilingGraph";
+const char *const kProfilingFpNode = "ProfilingFpNode";
+const char *const kProfilingBpNode = "ProfilingBpNode";
+const char *const kProfilingEndNode = "ProfilingEndNode";
+const char *const kProfilingArNode = "ProfilingAllReduceNode";
+const char *const kEngineNameRts = "DNN_VM_RTS_OP_STORE";
 
 Status SetOutputNameAttr(ComputeGraph &graph) {
   vector<string> output_names;
@@ -227,7 +238,10 @@ Status HybridModelBuilder::GetOrCreateNodeItem(const NodePtr &node, NodeItem **n
   new_node->node_id = node_index;
   new_node->op_desc->SetId(node_index);
   node_index += 1;
-
+  NodeExecutorManager::ExecutorType executor_type = NodeExecutorManager::GetInstance().ResolveExecutorType(*node);
+  new_node->is_profiling_report = (executor_type == NodeExecutorManager::ExecutorType::AICORE) ||
+                                  (executor_type == NodeExecutorManager::ExecutorType::AICPU_TF) ||
+                                  (executor_type == NodeExecutorManager::ExecutorType::AICPU_CUSTOM);
   *node_item = new_node.get();
   node_items[node] = std::move(new_node);
   return SUCCESS;
@@ -339,9 +353,9 @@ Status HybridModelBuilder::ParseDependentForFusedSubgraph(NodeItem &node_item) {
     uint32_t parent_index = 0;
     if (!AttrUtils::GetInt(*op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) {
       GELOGE(INTERNAL_ERROR,
-            "[%s] Failed to get attr [%s]",
-            op_desc->GetName().c_str(),
-            ATTR_NAME_PARENT_NODE_INDEX.c_str());
+             "[%s] Failed to get attr [%s]",
+             op_desc->GetName().c_str(),
+             ATTR_NAME_PARENT_NODE_INDEX.c_str());
       return INTERNAL_ERROR;
     }
 
@@ -793,7 +807,7 @@ Status HybridModelBuilder::HandleDtString(const GeTensor &tensor, void *var_addr
                            "Shape size is invalid");
     auto offset = static_cast<uint64_t>(elem_num * kBytes);
     auto hbm_raw_data_base_addr =
-        reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(var_addr) + offset);
+        static_cast<uint64_t>(reinterpret_cast<uintptr_t>(var_addr) + offset);
     for (int64_t i = elem_num - 1; i >= 0; --i) {
       buff[i] = hbm_raw_data_base_addr + (buff[i] - buff[0]);
     }
@@ -807,7 +821,7 @@ Status HybridModelBuilder::AssignUninitializedConstantOps() {
     GELOGI("no need to assign when exec on host.");
     return SUCCESS;
   }
-  for (auto &it : hybrid_model_.constant_op_nodes_) {
+  for (auto &it : constant_op_nodes_) {
     const string &var_name = it.first;
     const NodePtr &var_node = it.second;
     auto tensor_desc = var_node->GetOpDesc()->MutableOutputDesc(0);
@@ -836,7 +850,7 @@ Status HybridModelBuilder::AssignUninitializedConstantOps() {
 }
 
 Status HybridModelBuilder::InitConstantOps() {
-  for (auto &it : hybrid_model_.constant_op_nodes_) {
+  for (auto &it : constant_op_nodes_) {
     const string &var_name = it.first;
     const NodePtr &var_node = it.second;
     auto op_desc = var_node->GetOpDesc();
@@ -849,9 +863,18 @@ Status HybridModelBuilder::InitConstantOps() {
 
     std::unique_ptr<TensorValue> var_tensor;
     if (GetContext().GetHostExecFlag()) {
-      auto buffer = ge_tensor->MutableData();
-      GELOGD("Init tensor with host constant. size = %zu", buffer.GetSize());
-      var_tensor.reset(new(std::nothrow)TensorValue(buffer.GetData(), buffer.GetSize()));
+      GE_CHECK_NOTNULL(ge_tensor);
+      // Address for eigen kernel should be aligned with 16 bytes
+      // Tensors return by api GetWeights share data with proto, whose addr is not confirmed to be aligned
+      GeTensor aligned_tensor = ge_tensor->Clone();
+      GELOGD("Init tensor with host constant %s size = %zu", var_name.c_str(), aligned_tensor.MutableData().GetSize());
+      if (MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Malloc(aligned_tensor.GetAlignedPtr(),
+                                                                       aligned_tensor.GetData().size()) == nullptr) {
+        GELOGE(MEMALLOC_FAILED, "Malloc host memory for an existed GeTensor failed.");
+        return MEMALLOC_FAILED;
+      }
+      var_tensor.reset(new(std::nothrow)TensorValue(aligned_tensor.MutableData().data(),
+                                                    aligned_tensor.GetData().size()));
     } else {
       GE_CHK_STATUS_RET_NOLOG(VarNodeToTensor(var_node, var_tensor));
       GELOGD("Init const op tensor. name = %s, size = %ld", var_name.c_str(), var_tensor->GetSize());
@@ -906,9 +929,15 @@ Status HybridModelBuilder::InitVariableTensors() {
       GELOGE(GE_GRAPH_MALLOC_FAILED, "Host variable [%s] malloc failed.", it.first.c_str());
       return GE_GRAPH_MALLOC_FAILED;
     }
-    GELOGD("Host variable [%s] malloc success.", it.first.c_str());
+    if (MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Malloc(mem_info.host_aligned_ptr,
+                                                                     tensor_size) == nullptr) {
+      GELOGE(MEMALLOC_FAILED, "Malloc host memory for an existed GeTensor failed.");
+      return MEMALLOC_FAILED;
+    }
+    GELOGD("Host variable [%s] malloc success, size=%lld.", it.first.c_str(), tensor_size);
 
-    std::unique_ptr<TensorValue> tensor(new (std::nothrow) TensorValue(mem_info.host_address, tensor_size));
+    std::unique_ptr<TensorValue> tensor(new (std::nothrow) TensorValue(mem_info.host_aligned_ptr->MutableGet(),
+                                                                       tensor_size));
     GE_CHECK_NOTNULL(tensor);
     hybrid_model_.variable_tensors_.emplace(it.first, std::move(tensor));
   }
@@ -917,11 +946,52 @@ Status HybridModelBuilder::InitVariableTensors() {
 }
 
 Status HybridModelBuilder::InitWeights() {
-  // Train do not have weight. (only got ConstOp)
+  auto allocator = NpuMemoryAllocator::GetAllocator();
+  GE_CHECK_NOTNULL(allocator);
+
+  for (auto &it : hybrid_model_.node_items_) {
+    auto &node_item = it.second;
+    if (node_item->node_type != CONSTANT) {
+      continue;
+    }
+
+    const auto &constant_node = node_item->node;
+    auto op_desc = constant_node->GetOpDesc();
+    auto v_weights = ModelUtils::GetWeights(op_desc);
+    if (v_weights.empty()) {
+      GELOGE(INTERNAL_ERROR, "[%s] Constant has no value", constant_node->GetName().c_str());
+      return INTERNAL_ERROR;
+    }
+    auto *ge_tensor = const_cast<GeTensor *>(v_weights[0].get());
+    auto output_desc = op_desc->MutableOutputDesc(0);
+    GE_CHECK_NOTNULL(output_desc);
+    auto tensor_size = ge_tensor->GetData().GetSize();
+    GELOGD("[%s] Start to init Constant node [%s], size = %ld",
+           GetGraphName(),
+           constant_node->GetName().c_str(),
+           tensor_size);
+
+    auto tensor_buffer = TensorBuffer::Create(allocator, tensor_size);
+    GE_CHECK_NOTNULL(tensor_buffer);
+    std::unique_ptr<TensorValue> constant_tensor(new (std::nothrow)TensorValue(std::move(tensor_buffer)));
+    GE_CHECK_NOTNULL(constant_tensor);
+    constant_tensor->SetName("Constant_" + op_desc->GetName());
+    if (tensor_size > 0) {
+      GE_CHK_RT_RET(rtMemcpy(constant_tensor->MutableData(),
+                             constant_tensor->GetSize(),
+                             ge_tensor->GetData().data(),
+                             ge_tensor->GetData().size(),
+                             RT_MEMCPY_HOST_TO_DEVICE));
+    }
+
+    hybrid_model_.constant_tensors_.emplace(constant_node, std::move(constant_tensor));
+    GELOGD("[%s] Constant node [%s] added, size = %ld", GetGraphName(), constant_node->GetName().c_str(), tensor_size);
+  }
   return SUCCESS;
 }
 
 Status HybridModelBuilder::LoadTasks() {
+  GE_CHK_STATUS_RET(CheckAicpuOpList(), "Check Aicpu op failed.");
   for (auto &it : hybrid_model_.node_items_) {
     auto &node_item = it.second;
     auto &node_ptr = node_item->node;
@@ -987,7 +1057,7 @@ Status HybridModelBuilder::IndexTaskDefs() {
 
     // index task defs
     GELOGD("To index tasks for subgraph: %s", name.c_str());
-    unordered_map<int64_t, NodePtr> node_map;
+    std::unordered_map<int64_t, NodePtr> node_map;
     for (const auto &node : sub_graph->GetDirectNode()) {
       GE_CHECK_NOTNULL(node);
       GE_CHECK_NOTNULL(node->GetOpDesc());
@@ -1049,7 +1119,7 @@ Status HybridModelBuilder::IndexSpecialNodes() {
         hybrid_model_.device_variable_nodes_.emplace(node->GetName(), node);
       }
     } else if (op_type == CONSTANTOP) {
-      hybrid_model_.constant_op_nodes_.emplace(node->GetName(), node);
+      constant_op_nodes_.emplace(node->GetName(), node);
     } else if (op_type == DATA && node->GetOwnerComputeGraph() != root_graph) {
       NodePtr src_node;
       int peer_out_index = -1;
@@ -1322,7 +1392,7 @@ Status HybridModelBuilder::GetParentNodeOutputIndex(const OpDesc &op_desc, int i
 Status HybridModelBuilder::InitModelMem() {
   hybrid_model_.var_mem_base_ = var_manager_->GetVarMemoryBase(RT_MEMORY_HBM);
   auto total_var_size = hybrid_model_.TotalVarMemSize();
-  if (total_var_size == 0 && !hybrid_model_.constant_op_nodes_.empty()) {
+  if (total_var_size == 0 && !constant_op_nodes_.empty()) {
     total_var_size = var_manager_->GetVarMemSize(RT_MEMORY_HBM) > 0 ? var_manager_->GetVarMemMaxSize() : 0;
     GELOGD("Model var size = 0. but got uninitialized constant. set var size to %zu.", total_var_size);
   }
@@ -1458,6 +1528,188 @@ Status HybridModelBuilder::RecoverGraphUnknownFlag() {
   return SUCCESS;
 }
 
+Status HybridModelBuilder::GenerateFpProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list) {
+  uint64_t jobid_log_id = ge::GetContext().TraceId();
+  GELOGD("The first FP operator is %s,, job_id %lu", op_desc->GetName().c_str(), jobid_log_id);
+
+  TaskDef job_task_def;
+  job_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE);
+  job_task_def.set_stream_id(op_desc->GetStreamId());
+  LogTimeStampDef *job_log_def = job_task_def.mutable_log_timestamp();
+  if (job_log_def != nullptr) {
+    job_log_def->set_logid(jobid_log_id);
+    job_log_def->set_notify(false);
+  }
+  task_def_list.emplace_back(job_task_def);
+  TaskDef fp_task_def;
+  fp_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE);
+  fp_task_def.set_stream_id(op_desc->GetStreamId());
+  LogTimeStampDef *fp_log_def = fp_task_def.mutable_log_timestamp();
+  if (fp_log_def != nullptr) {
+    fp_log_def->set_logid(kProfilingFpStartLogid);
+    fp_log_def->set_notify(false);
+  }
+  task_def_list.emplace_back(fp_task_def);
+
+  return SUCCESS;
+}
+
+Status HybridModelBuilder::GenerateArProfilingTask(const OpDescPtr &op_desc, int64_t log_id,
+                                                   vector<domi::TaskDef> &task_def_list) {
+  TaskDef ar_task_def;
+  ar_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE);
+  ar_task_def.set_stream_id(op_desc->GetStreamId());
+  LogTimeStampDef *ar_log_def = ar_task_def.mutable_log_timestamp();
+  if (ar_log_def != nullptr) {
+    ar_log_def->set_logid(log_id);
+    ar_log_def->set_notify(false);
+  }
+  task_def_list.emplace_back(ar_task_def);
+
+  return SUCCESS;
+}
+
+Status HybridModelBuilder::GenerateBpProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list) {
+    TaskDef bp_task_def;
+    bp_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE);
+    bp_task_def.set_stream_id(op_desc->GetStreamId());
+    LogTimeStampDef *bp_log_def = bp_task_def.mutable_log_timestamp();
+    GE_CHECK_NOTNULL(bp_log_def);
+    bp_log_def->set_logid(kProfilingBpEndLogid);
+    bp_log_def->set_notify(false);
+    task_def_list.emplace_back(bp_task_def);
+
+  return SUCCESS;
+}
+
+Status HybridModelBuilder::GenerateEndProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list) {
+  TaskDef end_task_def;
+  end_task_def.set_type(RT_MODEL_TASK_PROFILER_TRACE);
+  end_task_def.set_stream_id(op_desc->GetStreamId());
+  LogTimeStampDef *end_log_def = end_task_def.mutable_log_timestamp();
+  GE_CHECK_NOTNULL(end_log_def);
+  end_log_def->set_logid(kProfilingIterEndLogid);
+  end_log_def->set_notify(true);
+  task_def_list.emplace_back(end_task_def);
+
+  return SUCCESS;
+}
+
+Status HybridModelBuilder::CreateProfilingNodeBefore(GraphItem &graph_item, const NodePtr &node) {
+  GE_CHECK_NOTNULL(node);
+  const OpDescPtr &op_desc = node->GetOpDesc();
+  GE_CHECK_NOTNULL(op_desc);
+  const auto &compute_graph = MakeShared<ComputeGraph>(kProfilingGraph);
+  GE_CHECK_NOTNULL(compute_graph);
+
+  NodePtr node_ptr = nullptr;
+  vector<domi::TaskDef> task_def_list;
+  // create fp node
+  bool is_insert_fp_profiling_task = false;
+  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_FP_PROFILILNG_TASK, is_insert_fp_profiling_task);
+  if (is_insert_fp_profiling_task) {
+    (void)GenerateFpProfilingTask(op_desc, task_def_list);
+    auto fp_desc = MakeShared<OpDesc>(kProfilingFpNode, PROFILINGTRAININGTRACE);
+    GE_CHECK_NOTNULL(fp_desc);
+    fp_desc->SetOpKernelLibName(kEngineNameRts);
+    node_ptr = compute_graph->AddNode(fp_desc);
+    GELOGD("Create fp profiling node success before.");
+  }
+  // creat all reduce start node
+  bool is_insert_bp_profiling_task = false;
+  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, is_insert_bp_profiling_task);
+  bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE);
+  if (is_all_reduce && is_insert_bp_profiling_task) {
+    int64_t log_id = 0;
+    (void)ge::AttrUtils::GetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, log_id);
+    GELOGD("All reduce node profiling task log id: %ld before", log_id);
+    (void) GenerateArProfilingTask(op_desc, log_id, task_def_list);
+    string op_name = string(kProfilingArNode) + std::to_string(log_id);
+    auto ar_desc_start = MakeShared<OpDesc>(op_name, PROFILINGTRAININGTRACE);
+    GE_CHECK_NOTNULL(ar_desc_start);
+    ar_desc_start->SetOpKernelLibName(kEngineNameRts);
+    node_ptr = compute_graph->AddNode(ar_desc_start);
+    GELOGD("Create all reduce start profiling node success before.");
+  }
+
+  if (node_ptr != nullptr) {
+    for (const auto &task_def : task_def_list) {
+      hybrid_model_.task_defs_[node_ptr].emplace_back(task_def);
+    }
+    NodeItem *node_item = nullptr;
+    GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(node_ptr, &node_item));
+    node_item->input_start = 0;
+    node_item->output_start = 0;
+    graph_item.node_items_.emplace_back(node_item);
+  } else {
+    GELOGD("No need to create profiling node before.");
+  }
+
+  return SUCCESS;
+}
+
+Status HybridModelBuilder::CreateProfilingNodeAfter(GraphItem &graph_item, const NodePtr &node) {
+  GE_CHECK_NOTNULL(node);
+  const OpDescPtr &op_desc = node->GetOpDesc();
+  GE_CHECK_NOTNULL(op_desc);
+  const auto &compute_graph = MakeShared<ComputeGraph>(kProfilingGraph);
+  GE_CHECK_NOTNULL(compute_graph);
+
+  NodePtr node_ptr = nullptr;
+  vector<domi::TaskDef> task_def_list;
+  // Create all reduce end node
+  bool is_insert_bp_profiling_task = false;
+  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, is_insert_bp_profiling_task);
+  bool is_all_reduce = (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE);
+  if (is_all_reduce && is_insert_bp_profiling_task) {
+    int64_t log_id = 0;
+    (void)ge::AttrUtils::GetInt(op_desc, ATTR_NAME_INSERT_PROFILILNG_TASK_LOG_ID, log_id);
+    GELOGD("All reduce node profiling task log id: %ld after", log_id);
+    (void) GenerateArProfilingTask(op_desc, log_id + 1, task_def_list);
+    string op_name = string(kProfilingArNode) + std::to_string(log_id + 1);
+    auto ar_desc_end = MakeShared<OpDesc>(op_name, PROFILINGTRAININGTRACE);
+    GE_CHECK_NOTNULL(ar_desc_end);
+    ar_desc_end->SetOpKernelLibName(kEngineNameRts);
+    node_ptr = compute_graph->AddNode(ar_desc_end);
+    GELOGD("Create all reduce end profiling node success after.");
+  }
+  // create bp node
+  if (!is_all_reduce && is_insert_bp_profiling_task) {
+    (void) GenerateBpProfilingTask(op_desc, task_def_list);
+    auto bp_op_desc = MakeShared<OpDesc>(kProfilingBpNode, PROFILINGTRAININGTRACE);
+    GE_CHECK_NOTNULL(bp_op_desc);
+    bp_op_desc->SetOpKernelLibName(kEngineNameRts);
+    node_ptr = compute_graph->AddNode(bp_op_desc);
+    GELOGD("Create bp profiling node success after.");
+  }
+  // create end node
+  bool is_insert_end_profiling_task = false;
+  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_END_PROFILILNG_TASK, is_insert_end_profiling_task);
+  if (is_insert_end_profiling_task) {
+    (void)GenerateEndProfilingTask(op_desc, task_def_list);
+    auto end_desc = MakeShared<OpDesc>(kProfilingEndNode, PROFILINGTRAININGTRACE);
+    GE_CHECK_NOTNULL(end_desc);
+    end_desc->SetOpKernelLibName(kEngineNameRts);
+    node_ptr = compute_graph->AddNode(end_desc);
+    GELOGD("Create end profiling node success after.");
+  }
+
+  if (node_ptr != nullptr) {
+    for (const auto &task_def : task_def_list) {
+      hybrid_model_.task_defs_[node_ptr].emplace_back(task_def);
+    }
+    NodeItem *node_item = nullptr;
+    GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(node_ptr, &node_item));
+    node_item->input_start = 0;
+    node_item->output_start = 0;
+    graph_item.node_items_.emplace_back(node_item);
+  } else {
+    GELOGD("No need to create profiling node after.");
+  }
+
+  return SUCCESS;
+}
+
 Status HybridModelBuilder::LoadDynamicSubgraph(ComputeGraph &graph, bool is_root_graph) {
   GELOGD("Start to load subgraph [%s]", graph.GetName().c_str());
   // for known partitioned call, load all nodes
@@ -1473,6 +1725,10 @@ Status HybridModelBuilder::LoadDynamicSubgraph(ComputeGraph &graph, bool is_root
     GE_CHECK_NOTNULL(node);
     GE_CHECK_NOTNULL(node->GetOpDesc());
     const auto &op_type = node->GetType();
+    if (op_type == NOOP) {
+      GELOGD("[%s] Skip NoOp", node->GetName().c_str());
+      continue;
+    }
 
     NodeItem *node_item = nullptr;
     GE_CHK_STATUS_RET_NOLOG(GetOrCreateNodeItem(node, &node_item));
@@ -1490,8 +1746,9 @@ Status HybridModelBuilder::LoadDynamicSubgraph(ComputeGraph &graph, bool is_root
       graph_item->output_node_ = node_item;
       GE_CHK_STATUS_RET_NOLOG(BuildOutputMapping(*graph_item, *node_item, is_root_graph));
     }
-
+    GE_CHK_STATUS_RET_NOLOG(CreateProfilingNodeBefore(*graph_item, node));
     graph_item->node_items_.emplace_back(node_item);
+    GE_CHK_STATUS_RET_NOLOG(CreateProfilingNodeAfter(*graph_item, node));
     // parse var outputs
     GE_CHK_STATUS_RET_NOLOG(ParseVarOutputs(*node_item));
     GELOGD("NodeItem created: %s", node_item->DebugString().c_str());
@@ -1558,5 +1815,29 @@ Status HybridModelBuilder::BuildInputMapping(GraphItem &graph_item,
 
   return SUCCESS;
 }
+
+Status HybridModelBuilder::CheckAicpuOpList() {
+  std::vector<std::string> aicpu_optype_list;
+  std::vector<std::string> aicpu_tf_optype_list;
+  std::set<std::string> aicpu_optype_set;
+  std::set<std::string> aicpu_tf_optype_set;
+  for (auto &it : ge_root_model_->GetSubgraphInstanceNameToModel()) {
+    auto &ge_model = it.second;
+    GE_CHECK_NOTNULL(ge_model);
+    if (ge::AttrUtils::GetListStr(*ge_model, "needCheckCpu", aicpu_optype_list)) {
+      aicpu_optype_set.insert(aicpu_optype_list.begin(), aicpu_optype_list.end());
+    }
+
+    if (ge::AttrUtils::GetListStr(*ge_model, "needCheckTf", aicpu_tf_optype_list)) {
+      aicpu_tf_optype_set.insert(aicpu_tf_optype_list.begin(), aicpu_tf_optype_list.end());
+    }
+  }
+  // reset list with set
+  aicpu_optype_list.assign(aicpu_optype_set.begin(), aicpu_optype_set.end());
+  aicpu_tf_optype_list.assign(aicpu_tf_optype_set.begin(), aicpu_tf_optype_set.end());
+  GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchKernelCheckAicpuOp(aicpu_optype_list, aicpu_tf_optype_list),
+                    "Launch check aicpu op type failed.");
+  return SUCCESS;
+}
 }  // namespace hybrid
 }  // namespace ge
diff --git a/ge/hybrid/model/hybrid_model_builder.h b/ge/hybrid/model/hybrid_model_builder.h
index b90ec982..55a19b6c 100644
--- a/ge/hybrid/model/hybrid_model_builder.h
+++ b/ge/hybrid/model/hybrid_model_builder.h
@@ -48,7 +48,6 @@ class HybridModelBuilder {
   static Status MergeNetOutputNode(ComputeGraph &compute_graph);
   static Status UnfoldSubgraphs(ComputeGraph &root_graph, ComputeGraphPtr &merged_graph);
   static Status UnfoldSubgraph(ComputeGraph &root_graph, ComputeGraph &parent_graph, ComputeGraph &sub_graph);
-  static Status InitWeights();
   static Status BuildInputMapping(GraphItem &graph_item,
                                   std::vector<NodeItem *> &data_nodes,
                                   bool is_root_graph);
@@ -68,6 +67,7 @@ class HybridModelBuilder {
   Status IndexSpecialNodes();
   Status InitRuntimeParams();
   Status InitModelMem();
+  Status InitWeights();
   Status TransAllVarData();
   Status CopyVarData();
   Status VarNodeToTensor(const NodePtr &var_node, std::unique_ptr<TensorValue> &tensor);
@@ -78,6 +78,13 @@ class HybridModelBuilder {
   Status ParseVarOutputs(NodeItem &node_item);
   Status LoadKnownShapedSubgraph(ComputeGraph &graph, NodeItem *parent_node_item);
   Status RecoverGraphUnknownFlag();
+  Status CheckAicpuOpList();
+  Status CreateProfilingNodeBefore(GraphItem &graph_item, const NodePtr &node);
+  Status CreateProfilingNodeAfter(GraphItem &graph_item, const NodePtr &node);
+  Status GenerateFpProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list);
+  Status GenerateBpProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list);
+  Status GenerateEndProfilingTask(const OpDescPtr &op_desc, vector<domi::TaskDef> &task_def_list);
+  Status GenerateArProfilingTask(const OpDescPtr &op_desc, int64_t log_id, vector<domi::TaskDef> &task_def_list);
 
   const char* GetGraphName() const {
     return hybrid_model_.model_name_.c_str();
@@ -87,8 +94,9 @@ class HybridModelBuilder {
   NodeItem *MutableNodeItem(const NodePtr &node);
 
   GeRootModelPtr ge_root_model_;
-  std::map<int, std::unique_ptr<TensorValue>> weights_;
   std::map<std::string, GeModelPtr> subgraph_models_;
+  std::map<std::string, NodePtr> constant_op_nodes_;
+
   HybridModel &hybrid_model_;
   std::map<NodePtr, std::vector<std::pair<int, NodePtr>>> node_ref_inputs_;
   int node_index = 0;
diff --git a/ge/hybrid/model/node_item.cc b/ge/hybrid/model/node_item.cc
index 69cf334d..acc250ef 100644
--- a/ge/hybrid/model/node_item.cc
+++ b/ge/hybrid/model/node_item.cc
@@ -22,6 +22,7 @@
 #include "graph/debug/ge_attr_define.h"
 #include "graph/utils/node_utils.h"
 #include "hybrid/node_executor/node_executor.h"
+#include "hybrid/executor/worker/shape_inference_engine.h"
 
 namespace ge {
 namespace hybrid {
@@ -47,7 +48,7 @@ Status ParseInputMapping(Node &node, OpDesc &op_desc, FusedSubgraph &fused_subgr
     GE_CHECK_NOTNULL(dst_op_desc);
     auto in_idx = node_and_anchor.second->GetIdx();
     auto tensor_desc = dst_op_desc->MutableInputDesc(in_idx);
-    fused_subgraph.input_mapping[parent_index].emplace_back(tensor_desc);
+    fused_subgraph.input_mapping[static_cast<int>(parent_index)].emplace_back(tensor_desc);
     GELOGD("Input[%u] mapped to [%s:%u]", parent_index, dst_op_desc->GetName().c_str(), in_idx);
   }
 
@@ -64,7 +65,7 @@ Status ParseOutputMapping(const OpDescPtr &op_desc, FusedSubgraph &fused_subgrap
     return FAILED;
   }
 
-  fused_subgraph.output_mapping.emplace(parent_index, op_desc);
+  fused_subgraph.output_mapping.emplace(static_cast<int>(parent_index), op_desc);
   return SUCCESS;
 }
 
@@ -126,12 +127,7 @@ Status NodeItem::Create(const NodePtr &node, std::unique_ptr<NodeItem> &node_ite
   return SUCCESS;
 }
 
-Status NodeItem::Init() {
-  GE_CHECK_LE(op_desc->GetInputsSize(), INT32_MAX);
-  GE_CHECK_LE(op_desc->GetOutputsSize(), INT32_MAX);
-  num_inputs = static_cast<int>(op_desc->GetInputsSize());
-  num_outputs = static_cast<int>(op_desc->GetOutputsSize());
-
+void NodeItem::ResolveOptionalInputs() {
   if (op_desc->GetAllInputsSize() != op_desc->GetInputsSize()) {
     has_optional_inputs = true;
     for (size_t i = 0; i < op_desc->GetAllInputsSize(); ++i) {
@@ -143,7 +139,18 @@ Status NodeItem::Init() {
       }
     }
   }
+}
+
+Status NodeItem::InitInputsAndOutputs() {
+  GE_CHECK_LE(op_desc->GetInputsSize(), INT32_MAX);
+  GE_CHECK_LE(op_desc->GetOutputsSize(), INT32_MAX);
+  num_inputs = static_cast<int>(op_desc->GetInputsSize());
+  num_outputs = static_cast<int>(op_desc->GetOutputsSize());
+  ResolveOptionalInputs();
+  return SUCCESS;
+}
 
+Status NodeItem::ResolveDynamicState() {
   (void) AttrUtils::GetBool(op_desc, ATTR_NAME_FORCE_UNKNOWN_SHAPE, is_dynamic);
   GELOGD("node name = %s, is_dynamic = %d.", this->node_name.c_str(), is_dynamic);
   if (!is_dynamic) {
@@ -151,38 +158,73 @@ Status NodeItem::Init() {
                       "[%s] Failed to get shape status.",
                       node->GetName().c_str());
   }
+  return SUCCESS;
+}
 
-  if (is_dynamic) {
-    for (int i = 0; i < num_inputs; ++i) {
-      const auto &input_desc = MutableInputDesc(i);
-      GE_CHECK_NOTNULL(input_desc);
-      if (input_desc->MutableShape().IsUnknownShape()) {
-        is_input_shape_static_.push_back(false);
-      } else {
+Status NodeItem::ResolveStaticInputsAndOutputs() {
+  for (int i = 0; i < num_inputs; ++i) {
+    // Data has unconnected input but set by framework
+    if (node_type != DATA) {
+      int origin_index = i;
+      if (has_optional_inputs) {
+        origin_index = input_desc_indices_[i];
+      }
+      auto in_data_anchor = node->GetInDataAnchor(origin_index);
+      GE_CHECK_NOTNULL(in_data_anchor);
+
+      // If no node was connected to the current input anchor
+      // increase num_static_input_shapes in case dead wait in ShapeInferenceState::AwaitShapesReady
+      if (in_data_anchor->GetPeerOutAnchor() == nullptr ||
+          in_data_anchor->GetPeerOutAnchor()->GetOwnerNode() == nullptr) {
         num_static_input_shapes++;
         is_input_shape_static_.push_back(true);
-        GELOGD("[%s] The shape of input[%d] is static. shape = [%s]",
-               NodeName().c_str(), i, input_desc->MutableShape().ToString().c_str());
+        GELOGW("[%s] Peer node of input[%d] is empty", NodeName().c_str(), i);
+        continue;
       }
     }
-
-    for (int i = 0; i < num_outputs; ++i) {
-      const auto &output_desc = op_desc->MutableOutputDesc(i);
-      GE_CHECK_NOTNULL(output_desc);
-      if (output_desc->MutableShape().IsUnknownShape()) {
-        is_output_shape_static = false;
-        break;
-      }
+    const auto &input_desc = MutableInputDesc(i);
+    GE_CHECK_NOTNULL(input_desc);
+    if (input_desc->MutableShape().IsUnknownShape()) {
+      is_input_shape_static_.push_back(false);
+    } else {
+      num_static_input_shapes++;
+      is_input_shape_static_.push_back(true);
+      GELOGD("[%s] The shape of input[%d] is static. shape = [%s]",
+             NodeName().c_str(), i, input_desc->MutableShape().ToString().c_str());
     }
+  }
 
-    if (IsControlOp() || node_type == PARTITIONEDCALL) {
-      shape_inference_type = DEPEND_COMPUTE;
-    } else {
-      int32_t unknown_shape_type_val = 0;
-      (void) AttrUtils::GetInt(op_desc, ::ge::ATTR_NAME_UNKNOWN_SHAPE_TYPE, unknown_shape_type_val);
-      shape_inference_type = static_cast<UnknowShapeOpType>(unknown_shape_type_val);
+  for (int i = 0; i < num_outputs; ++i) {
+    const auto &output_desc = op_desc->MutableOutputDesc(i);
+    GE_CHECK_NOTNULL(output_desc);
+    if (output_desc->MutableShape().IsUnknownShape()) {
+      is_output_shape_static = false;
+      break;
     }
+  }
 
+  if (is_output_shape_static) {
+    GE_CHK_STATUS_RET_NOLOG(ShapeInferenceEngine::CalcOutputTensorSizes(*this));
+  }
+  return SUCCESS;
+}
+
+void NodeItem::ResolveUnknownShapeType() {
+  if (IsControlOp() || node_type == PARTITIONEDCALL) {
+    shape_inference_type = DEPEND_COMPUTE;
+  } else {
+    int32_t unknown_shape_type_val = 0;
+    (void) AttrUtils::GetInt(op_desc, ::ge::ATTR_NAME_UNKNOWN_SHAPE_TYPE, unknown_shape_type_val);
+    shape_inference_type = static_cast<UnknowShapeOpType>(unknown_shape_type_val);
+  }
+}
+
+Status NodeItem::Init() {
+  GE_CHK_STATUS_RET_NOLOG(InitInputsAndOutputs());
+  GE_CHK_STATUS_RET_NOLOG(ResolveDynamicState());
+  if (is_dynamic) {
+    ResolveUnknownShapeType();
+    GE_CHK_STATUS_RET_NOLOG(ResolveStaticInputsAndOutputs());
     GE_CHK_STATUS_RET(ParseFusedSubgraph(*this), "[%s] Failed to parse fused subgraph", node_name.c_str());
   }
 
diff --git a/ge/hybrid/model/node_item.h b/ge/hybrid/model/node_item.h
index 8fac4a73..a34227c5 100644
--- a/ge/hybrid/model/node_item.h
+++ b/ge/hybrid/model/node_item.h
@@ -30,8 +30,8 @@ class NodeTask;
 class NodeExecutor;
 
 struct FusedSubgraph {
-  std::map<uint32_t, std::vector<GeTensorDescPtr>> input_mapping;
-  std::map<uint32_t, OpDescPtr> output_mapping;
+  std::map<int, std::vector<GeTensorDescPtr>> input_mapping;
+  std::map<int, OpDescPtr> output_mapping;
   std::vector<NodePtr> nodes;
   ComputeGraphPtr graph;
 };
@@ -99,10 +99,16 @@ struct NodeItem {
   std::map<int, int> reuse_inputs;
   std::map<int, int> reuse_outputs;
   int num_static_input_shapes = 0;
+  bool is_profiling_report = false;
 
  private:
   explicit NodeItem(NodePtr node);
   Status Init();
+  Status InitInputsAndOutputs();
+  void ResolveOptionalInputs();
+  Status ResolveDynamicState();
+  Status ResolveStaticInputsAndOutputs();
+  void ResolveUnknownShapeType();
 
   std::vector<bool> is_input_shape_static_;
   std::vector<uint32_t> input_desc_indices_;
diff --git a/ge/hybrid/node_executor/aicore/aicore_node_executor.cc b/ge/hybrid/node_executor/aicore/aicore_node_executor.cc
index 3b87c8b8..2abc5b03 100755
--- a/ge/hybrid/node_executor/aicore/aicore_node_executor.cc
+++ b/ge/hybrid/node_executor/aicore/aicore_node_executor.cc
@@ -15,13 +15,26 @@
  */
 
 #include "aicore_node_executor.h"
-#include "cce/taskdown_common.hpp"
+#include "framework/common/taskdown_common.h"
 #include "hybrid/executor/hybrid_execution_context.h"
 
 namespace ge {
 namespace hybrid {
 REGISTER_NODE_EXECUTOR_BUILDER(NodeExecutorManager::ExecutorType::AICORE, AiCoreNodeExecutor);
+namespace {
+bool IsNoOp(const NodeItem &node_item) {
+  for (int i = 0; i < node_item.num_outputs; ++i) {
+    const auto &tensor_desc = node_item.MutableOutputDesc(i);
+    GE_CHECK_NOTNULL(tensor_desc);
+    const auto &shape = tensor_desc->MutableShape();
+    if (shape.IsScalar() || shape.GetShapeSize() > 0) {
+      return false;
+    }
+  }
 
+  return true;
+}
+}  // namespace
 AiCoreNodeTask::AiCoreNodeTask(std::vector<std::unique_ptr<AiCoreOpTask>> &&tasks) : tasks_(std::move(tasks)) {
 }
 
@@ -104,9 +117,13 @@ std::shared_ptr<NodeTask> AiCoreNodeTaskRegistry::GetTask(const std::string &nod
 
 Status AiCoreNodeExecutor::CompileTask(const HybridModel &model,
                                        const NodePtr &node, shared_ptr<NodeTask> &task) const {
-  GE_CHECK_NOTNULL(node);
-  auto op_desc = node->GetOpDesc();
-  GE_CHECK_NOTNULL(op_desc);
+  auto node_item = model.GetNodeItem(node);
+  GE_CHECK_NOTNULL(node_item);
+  if (IsNoOp(*node_item)) {
+    task = MakeShared<NoOpTask>();
+    return SUCCESS;
+  }
+  auto op_desc = node_item->op_desc;
   GELOGI("AiCoreNodeExecutor(%s) CompileTask Start.", node->GetName().c_str());
 
   auto ori_node_name = node->GetName();
@@ -150,7 +167,7 @@ Status AiCoreNodeExecutor::CompileTask(const HybridModel &model,
 
 Status AiCoreNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeTaskExecuteAsync] Start");
-  if (IsNoOp(context)) {
+  if (IsNoOp(context.GetNodeItem())) {
     GELOGD("[%s] Skipping execution for op with empty outputs", context.GetNodeName());
     auto ret = context.TryExecuteCallback(done_callback);
     RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeTaskExecuteAsync] End");
@@ -165,6 +182,16 @@ Status AiCoreNodeTask::ExecuteAsync(TaskContext &context, std::function<void()>
     }
     RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] Start");
     GE_CHK_STATUS_RET_NOLOG((*it)->LaunchKernel(context.GetStream()));
+    uint32_t task_id = 0;
+    uint32_t stream_id = 0;
+    rtError_t rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id);
+    if (rt_ret != RT_ERROR_NONE) {
+      GELOGE(rt_ret, "Get task_id and stream_id failed.");
+      return rt_ret;
+    }
+    context.SetTaskId(task_id);
+    context.SetStreamId(stream_id);
+    GELOGD("AiCore node[%s] task_id: %u, stream_id: %u.", context.GetNodeName(), task_id, stream_id);
     RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] End");
     RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] End");
   }
@@ -217,19 +244,6 @@ bool AiCoreNodeTask::IsSupportDynamicShape() {
   return true;
 }
 
-bool AiCoreNodeTask::IsNoOp(TaskContext &task_context) {
-  for (int i = 0; i < task_context.NumOutputs(); ++i) {
-    const auto &tensor_desc = task_context.MutableOutputDesc(i);
-    GE_CHECK_NOTNULL(tensor_desc);
-    const auto &shape = tensor_desc->MutableShape();
-    if (shape.IsScalar() || shape.GetShapeSize() > 0) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
 TaskCompilerFactory &TaskCompilerFactory::GetInstance() {
   static TaskCompilerFactory instance;
   return instance;
diff --git a/ge/hybrid/node_executor/aicore/aicore_node_executor.h b/ge/hybrid/node_executor/aicore/aicore_node_executor.h
index 989090e9..f036ce85 100755
--- a/ge/hybrid/node_executor/aicore/aicore_node_executor.h
+++ b/ge/hybrid/node_executor/aicore/aicore_node_executor.h
@@ -60,7 +60,6 @@ class AiCoreNodeTask : public NodeTask {
   Status UpdateArgs(TaskContext &context) override;
   Status ExecuteAsync(TaskContext &context, std::function<void()> done_callback) override;
  private:
-  static bool IsNoOp(TaskContext &task_context);
   std::vector<std::unique_ptr<AiCoreOpTask>> tasks_;
 };
 
@@ -89,7 +88,7 @@ class TaskCompilerFactory {
 
 class CompilerFunctionRegistrar {
  public:
-  CompilerFunctionRegistrar(CreateFn fn);
+  explicit CompilerFunctionRegistrar(CreateFn fn);
   ~CompilerFunctionRegistrar() = default;
 };
 }  // namespace hybrid
diff --git a/ge/hybrid/node_executor/aicore/aicore_op_task.cc b/ge/hybrid/node_executor/aicore/aicore_op_task.cc
index 7ed14309..80ea579b 100644
--- a/ge/hybrid/node_executor/aicore/aicore_op_task.cc
+++ b/ge/hybrid/node_executor/aicore/aicore_op_task.cc
@@ -15,7 +15,7 @@
  */
 
 #include "hybrid/node_executor/aicore/aicore_op_task.h"
-#include "cce/taskdown_common.hpp"
+#include "framework/common/taskdown_common.h"
 #include "framework/common/debug/log.h"
 #include "hybrid/executor/hybrid_execution_context.h"
 #include "hybrid/node_executor/aicore/aicore_task_builder.h"
@@ -38,7 +38,7 @@ Status AiCoreOpTask::Init(const OpDesc &op_desc, const domi::TaskDef &task_def)
 }
 
 Status AiCoreOpTask::RegisterTbeHandle(const OpDesc &op_desc) {
-  auto op_desc_ptr = make_shared<OpDesc>(op_desc);
+  auto op_desc_ptr = std::make_shared<OpDesc>(op_desc);
   GE_CHECK_NOTNULL(op_desc_ptr);
   auto tbe_kernel = op_desc_ptr->TryGetExtAttr(OP_EXTATTR_NAME_TBE_KERNEL, TBEKernelPtr());
   if (tbe_kernel == nullptr) {
@@ -151,8 +151,8 @@ Status AiCoreOpTask::ValidateTaskDef(const domi::TaskDef &task_def) {
 
   const domi::KernelDef &kernel_def = task_def.kernel();
   const domi::KernelContext &context = kernel_def.context();
-  auto kernel_type = static_cast<cce::ccKernelType>(context.kernel_type());
-  if (kernel_type != cce::ccKernelType::TE) {
+  auto kernel_type = static_cast<ccKernelType>(context.kernel_type());
+  if (kernel_type != ccKernelType::TE) {
     GELOGE(INTERNAL_ERROR, "Invalid kernel type(%d) in AiCore TaskDef.", static_cast<int>(kernel_type));
     return INTERNAL_ERROR;
   }
diff --git a/ge/hybrid/node_executor/aicore/aicore_task_compiler.h b/ge/hybrid/node_executor/aicore/aicore_task_compiler.h
index bf948349..b6dfd82b 100755
--- a/ge/hybrid/node_executor/aicore/aicore_task_compiler.h
+++ b/ge/hybrid/node_executor/aicore/aicore_task_compiler.h
@@ -26,7 +26,7 @@ namespace hybrid {
 class AiCoreTaskCompiler : public TaskCompiler {
  public:
   AiCoreTaskCompiler() = default;
-  ~AiCoreTaskCompiler() = default;
+  ~AiCoreTaskCompiler() override = default;
 
   Status CompileOp(const NodePtr &node, std::vector<domi::TaskDef> &tasks) override;
   Status Initialize() override;
diff --git a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc
index 1a47e525..63ce65e9 100755
--- a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc
+++ b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc
@@ -15,10 +15,11 @@
  */
 
 #include "hybrid/node_executor/aicpu/aicpu_node_executor.h"
-#include "cce/taskdown_common.hpp"
+#include "framework/common/taskdown_common.h"
 #include "common/formats/formats.h"
 #include "aicpu/common/aicpu_task_struct.h"
 #include "graph/load/new_model_manager/model_manager.h"
+#include "graph/utils/node_utils.h"
 #include "hybrid/executor/hybrid_execution_context.h"
 #include "hybrid/model/hybrid_model.h"
 #include "opskernel_manager/ops_kernel_builder_manager.h"
@@ -187,7 +188,18 @@ Status AicpuNodeTaskBase::ExecuteAsync(TaskContext &context, std::function<void(
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AicpuNodeTaskBaseExecuteAsync] Start");
   GELOGD("Node[%s] execute async start. unknown_type=%d.", node_name_.c_str(), unknown_type_);
 
-  GE_CHK_STATUS_RET(LaunchTask(context));
+  HYBRID_CHK_STATUS_RET(LaunchTask(context), "[%s] Failed to launch task", node_name_.c_str());
+
+  uint32_t task_id = 0;
+  uint32_t stream_id = 0;
+  rtError_t rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id);
+  if (rt_ret != RT_ERROR_NONE) {
+    GELOGE(rt_ret, "Get task_id and stream_id failed.");
+    return rt_ret;
+  }
+  context.SetTaskId(task_id);
+  context.SetStreamId(stream_id);
+  GELOGD("AiCpu node[%s] task_id: %u, stream_id: %u.", context.GetNodeName(), task_id, stream_id);
 
   auto callback = [=, &context]() {
     GELOGD("Node[%s] callback start.", node_name_.c_str());
@@ -335,7 +347,11 @@ Status AicpuTfNodeTask::Init(const HybridModel &model) {
   GE_CHK_RT_RET(rtMemcpy(kernel_buf_->GetData(), sizeof(STR_FWK_OP_KERNEL),
                          &fwk_op_kernel, sizeof(STR_FWK_OP_KERNEL),
                          RT_MEMCPY_HOST_TO_DEVICE));
-
+  auto node_type = NodeUtils::GetNodeType(node_item_->node);
+  if (node_type.find(GETNEXT) != string::npos) {
+    GELOGD("[%s] Is GetNext, set need sync to true, node type = %s", node_name_.c_str(), node_type.c_str());
+    need_sync_ = true;
+  }
   GELOGI("Node[%s] init end.", node_name_.c_str());
   return SUCCESS;
 }
@@ -605,6 +621,10 @@ Status AicpuTfNodeTask::LaunchTask(TaskContext &context) {
   GE_CHK_RT_RET(rtKernelLaunchEx(kernel_buf_->GetData(), kernel_buf_->GetSize(), flag, context.GetStream()));
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), node_name_.c_str(), "[AicpuTfNodertKernelLaunchEx] End");
   GELOGD("Node[%s] launch end.", node_name_.c_str());
+  if (need_sync_) {
+    GELOGD("[%s] Task needs sync", node_name_.c_str());
+    GE_CHK_STATUS_RET_NOLOG(context.Synchronize());
+  }
   return SUCCESS;
 }
 
@@ -642,10 +662,14 @@ Status AicpuNodeTask::Init(const HybridModel &model) {
   const std::string &so_name = kernel_def.so_name();
   const OpDescPtr op_desc = node_item_->GetOpDesc();
   const auto &context = kernel_def.context();
-  auto kernel_type = static_cast<cce::ccKernelType>(context.kernel_type());
-  if (kernel_type == cce::ccKernelType::CUST_AI_CPU) {
-    GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc, so_name), "load cust aicpu so failed.");
-    GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "Launch cust aicpu so failed.");
+  auto kernel_type = static_cast<ccKernelType>(context.kernel_type());
+  if (kernel_type == ccKernelType::CUST_AI_CPU) {
+    bool loaded = false;
+    GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc, so_name, loaded), 
+                      "load cust aicpu so failed.");
+    if (!loaded) {
+      GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "Launch cust aicpu so failed.");
+    }
   }
 
   GE_CHK_BOOL_RET_STATUS(args.size() == args_size_, FAILED,
@@ -723,9 +747,9 @@ Status AicpuNodeTask::UpdateIoAddr(TaskContext &context) {
 
   auto io_addr = args_.get() + sizeof(aicpu::AicpuParamHead);
   // if has input and output, need copy to ioaddr
-  error_t cpy_ret = memcpy_s(io_addr, args_size_ - sizeof(aicpu::AicpuParamHead),
-                             &io_addrs[0], sizeof(uint64_t) * io_addrs.size());
-  GE_CHK_BOOL_RET_STATUS(cpy_ret == EOK, INTERNAL_ERROR,
+  int cpy_ret = memcpy_s(io_addr, args_size_ - sizeof(aicpu::AicpuParamHead),
+                         &io_addrs[0], sizeof(uint64_t) * io_addrs.size());
+  GE_CHK_BOOL_RET_STATUS(cpy_ret == 0, INTERNAL_ERROR,
                          "Node[%s] memcpy io addr to AicpuParamHead failed, ret=%d, args_size=%u, io nums=%zu.",
                          node_name_.c_str(), cpy_ret, args_size_, io_addrs.size());
   return SUCCESS;
@@ -736,9 +760,9 @@ Status AicpuNodeTask::LaunchTask(TaskContext &context) {
   const auto &so_name = task_def_.kernel().so_name();
   const auto &kernel_name = task_def_.kernel().kernel_name();
   const auto &kcontext = task_def_.kernel().context();
-  auto kernel_type = static_cast<cce::ccKernelType>(kcontext.kernel_type());
+  auto kernel_type = static_cast<ccKernelType>(kcontext.kernel_type());
   uint32_t flag = RT_KERNEL_DEFAULT;
-  if (kernel_type == cce::ccKernelType::CUST_AI_CPU) {
+  if (kernel_type == ccKernelType::CUST_AI_CPU) {
     flag |= static_cast<uint32_t>(RT_KERNEL_CUSTOM_AICPU);
   }
   auto rt_ret = rtCpuKernelLaunchWithFlag(reinterpret_cast<const void *>(so_name.c_str()),
diff --git a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.h b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.h
index b984cc86..8f0b1d0a 100644
--- a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.h
+++ b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.h
@@ -37,6 +37,8 @@ class AicpuNodeTaskBase : public NodeTask {
 
   ~AicpuNodeTaskBase() override = default;
 
+  using NodeTask::Init;
+
   virtual Status Init(const HybridModel &model) = 0;
 
   Status UpdateArgs(TaskContext &context) override;
@@ -142,6 +144,7 @@ class AicpuTfNodeTask : public AicpuNodeTaskBase {
   std::unique_ptr<TensorBuffer> copy_input_data_size_dev_;
   std::unique_ptr<TensorBuffer> copy_input_src_dev_;
   std::unique_ptr<TensorBuffer> copy_input_dst_dev_;
+  bool need_sync_ = false;
 };
 
 class AicpuNodeTask : public AicpuNodeTaskBase {
diff --git a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc
index f985a3d0..7f2c6288 100755
--- a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc
+++ b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc
@@ -123,11 +123,22 @@ Status KnownNodeTask::Init(TaskContext &context) {
            davinci_model_->GetRuntimeParam().mem_base, davinci_model_->GetRuntimeParam().mem_size);
   }
   if (!load_flag_) {
+    auto dump_properties = context.GetDumpProperties();
+    if (dump_properties.IsDumpOpen()) {
+      davinci_model_->SetDumpProperties(dump_properties);
+    }
+    int32_t device_id = 0;
+    rtError_t rt_ret = rtGetDevice(&device_id);
+    if (rt_ret != RT_ERROR_NONE || device_id < 0) {
+      GELOGE(rt_ret, "Call rtGetDevice failed, ret = 0x%X, device_id = %d.", rt_ret, device_id);
+      return RT_ERROR_TO_GE_STATUS(rt_ret);
+    }
+    davinci_model_->SetDeviceId(device_id);
     GE_CHK_STATUS_RET(davinci_model_->Init(), "KnownNodeExecutor::InitDavinciModel failed.");
     load_flag_ = true;
   } else {
     GE_CHK_STATUS_RET(ModelManager::GetInstance()->DestroyAicpuKernel(davinci_model_->GetSessionId(),
-            davinci_model_->Id()), "KnownNodeTask::Init destroy aicpu kernel failed.");
+            davinci_model_->Id(), davinci_model_->SubModelId()), "KnownNodeTask::Init destroy aicpu kernel failed.");
   }
   GELOGI("[%s] KnownNodeExecutor::Init success.", context.GetNodeName());
   return SUCCESS;
@@ -161,8 +172,9 @@ Status KnownNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node
 
   // set known node flag as true
   davinci_model->SetKnownNode(true);
+  davinci_model->SetId(model.GetModelId());
   // set model id as root node's node id
-  davinci_model->SetId(node->GetOpDesc()->GetId());
+  davinci_model->SetSubModelId(node->GetOpDesc()->GetId());
   GELOGD("KnownNodeExecutor::LoadTask node id %ld.", node->GetOpDesc()->GetId());
 
   GE_CHK_STATUS_RET(davinci_model->Assign(ge_model), "KnownNodeExecutor::LoadTask davincimodel assign failed.");
diff --git a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h
index fb1966b4..2dde993b 100644
--- a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h
+++ b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h
@@ -27,7 +27,7 @@ class HybridModel;
 
 class KnownNodeTask : public NodeTask {
  public:
-  KnownNodeTask(std::shared_ptr<DavinciModel> davinci_model)
+  explicit KnownNodeTask(std::shared_ptr<DavinciModel> davinci_model)
       : davinci_model_(davinci_model)
     {}
 
diff --git a/ge/hybrid/node_executor/controlop/control_op_executor.cc b/ge/hybrid/node_executor/controlop/control_op_executor.cc
index 83fc09ee..74920b22 100644
--- a/ge/hybrid/node_executor/controlop/control_op_executor.cc
+++ b/ge/hybrid/node_executor/controlop/control_op_executor.cc
@@ -405,7 +405,7 @@ Status ControlOpNodeExecutor::LoadTask(const HybridModel &model,
   auto node_item = model.GetNodeItem(node);
   GE_CHECK_NOTNULL(node_item);
 
-  unique_ptr<ControlOpNodeTask> node_task;
+  std::unique_ptr<ControlOpNodeTask> node_task;
   auto node_type = node->GetType();
   if (node_type == IF || node_type == STATELESSIF) {
     node_task.reset(new(std::nothrow) IfOpNodeTask());
diff --git a/ge/hybrid/node_executor/controlop/control_op_executor.h b/ge/hybrid/node_executor/controlop/control_op_executor.h
index 7520afd1..3becfaaa 100644
--- a/ge/hybrid/node_executor/controlop/control_op_executor.h
+++ b/ge/hybrid/node_executor/controlop/control_op_executor.h
@@ -25,6 +25,7 @@ namespace ge {
 namespace hybrid {
 class ControlOpNodeTask : public NodeTask {
  public:
+  using NodeTask::Init;
   virtual Status Init(const NodePtr &node, const HybridModel &model) = 0;
   Status UpdateArgs(TaskContext &context) override;
 
diff --git a/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc b/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc
index ee45964c..50890d6a 100755
--- a/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc
+++ b/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc
@@ -61,18 +61,18 @@ Status RefInputTask::Execute(TaskContext &context) {
 
 Status RefInputTask::RefOneByOne(TaskContext &context) {
   GELOGI("node %s type %s ref input one by one begin.", node_name_.c_str(), node_type_.c_str());
-  uint32_t input_num = context.NumInputs();
-  uint32_t output_num = context.NumOutputs();
+  int input_num = context.NumInputs();
+  int output_num = context.NumOutputs();
   if (output_num > input_num) {
-    GELOGE(INTERNAL_ERROR, "node %s type %s has %u outputs but only %u inputs, can't ref one by one.",
+    GELOGE(INTERNAL_ERROR, "node %s type %s has %d outputs but only %d inputs, can't ref one by one.",
            node_name_.c_str(), node_type_.c_str(), output_num, input_num);
     return INTERNAL_ERROR;
   }
-  for (uint32_t out_index = 0; out_index < output_num; ++out_index) {
+  for (uint32_t out_index = 0; out_index < static_cast<uint32_t>(output_num); ++out_index) {
     auto input = context.GetInput(out_index);
     GE_CHECK_NOTNULL(input);
     GE_CHK_STATUS_RET(context.SetOutput(out_index, *input));
-    GELOGD("node %s type %s output[%u] ref input[%u] addr=%p.",
+    GELOGD("node %s type %s output[%d] ref input[%d] addr=%p.",
            node_name_.c_str(), node_type_.c_str(), out_index, out_index, input->GetData());
   }
   GELOGI("node %s type %s ref input one by one end.", node_name_.c_str(), node_type_.c_str());
@@ -224,9 +224,9 @@ Status GeLocalNodeExecutor::LoadTask(const HybridModel &model,
              node->GetName().c_str(), node_type.c_str());
       return MEMALLOC_FAILED;
     }
-  } else if (node_type == CONSTANTOP || node_type == VARIABLE) {
+  } else if (node_type == CONSTANT || node_type == CONSTANTOP || node_type == VARIABLE) {
     GELOGI("node %s type %s, use ConstantNodeTask.", node->GetName().c_str(), node_type.c_str());
-    auto tensor = model.GetVariable(node->GetName());
+    auto tensor = model.GetTensor(node);
     if (tensor == nullptr) {
       GELOGE(INTERNAL_ERROR, "Failed to get tensor by name: %s", node->GetName().c_str());
       return INTERNAL_ERROR;
diff --git a/ge/hybrid/node_executor/hccl/hccl_node_executor.cc b/ge/hybrid/node_executor/hccl/hccl_node_executor.cc
index 704cab77..94c734ca 100644
--- a/ge/hybrid/node_executor/hccl/hccl_node_executor.cc
+++ b/ge/hybrid/node_executor/hccl/hccl_node_executor.cc
@@ -42,10 +42,10 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do
     GELOGE(FAILED, "hccl handle is nullptr! ");
     return FAILED;
   }
-  auto EnqueueHcomOpertion = (HcclResult(*)(HcomOpertion, std::function<void(HcclResult status)>))dlsym(
-      context.handle_, "EnqueueHcomOpertion");
-  if (EnqueueHcomOpertion == nullptr) {
-    GELOGE(FAILED, "Failed to invoke EnqueueHcomOpertion hcom unknown node function.");
+  auto HcomExecEnqueueOperation = (HcclResult(*)(HcomOperation, std::function<void(HcclResult status)>))dlsym(
+      context.handle_, "HcomExecEnqueueOperation");
+  if (HcomExecEnqueueOperation == nullptr) {
+    GELOGE(FAILED, "Failed to invoke HcomExecEnqueueOperation hcom unknown node function.");
     if (dlclose(context.handle_) != 0) {
       GELOGW("Failed to close handle %s", dlerror());
     }
@@ -70,7 +70,7 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do
   const OpDescPtr op_desc = node_item.GetOpDesc();
   GE_CHECK_NOTNULL(op_desc);
 
-  HcomOpertion op_info;
+  HcomOperation op_info;
   op_info.hcclType = op_desc->GetType();
   op_info.inputPtr = inputs.empty() ? nullptr : inputs[0];
   op_info.outputPtr = outputs.empty() ? nullptr : outputs[0];
@@ -96,7 +96,8 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do
   op_info.root = root_id;
   auto callback = [this, op_desc](HcclResult status) {
     if (status != HCCL_SUCCESS) {
-      GELOGE(HCCL_E_INTERNAL, "node %s call EnqueueHcomOpertion failed, ret: 0x%X", op_desc->GetName().c_str(), status);
+      GELOGE(HCCL_E_INTERNAL, "node %s call HcomExecEnqueueOperation failed, ret: 0x%X",
+             op_desc->GetName().c_str(), status);
     }
     std::lock_guard<std::mutex> lock(this->hccl_mutex_);
     this->cond_.notify_all();
@@ -110,9 +111,9 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do
          context.GetNodeName(), op_info.hcclType.c_str(), count, op_info.dataType, op_info.opType, op_info.root);
   op_info.count = count;
 
-  HcclResult hccl_ret = EnqueueHcomOpertion(op_info, callback);
+  HcclResult hccl_ret = HcomExecEnqueueOperation(op_info, callback);
   if (hccl_ret != HCCL_SUCCESS) {
-    GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", hccl_ret);
+    GELOGE(HCCL_E_INTERNAL, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret);
     return HCCL_E_INTERNAL;
   }
 
@@ -213,11 +214,11 @@ Status RdmaNodeTask::ExtractTensor(TaskContext &context, vector<HcomRemoteAccess
 
 Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
   GELOGI("[%s] RdmaNodeTask::ExecuteAsync in.", context.GetNodeName());
-  auto EnqueueRemoteAccess =
+  auto HcomExecEnqueueRemoteAccess =
       (HcclResult(*)(const string &, const vector<HcomRemoteAccessAddrInfo> &,
-                     std::function<void(HcclResult status)>))dlsym(context.handle_, "EnqueueRemoteAccess");
-  if (EnqueueRemoteAccess == nullptr) {
-    GELOGE(FAILED, "Failed to invoke EnqueueRemoteAccess hcom unknown node function.");
+                     std::function<void(HcclResult status)>))dlsym(context.handle_, "HcomExecEnqueueRemoteAccess");
+  if (HcomExecEnqueueRemoteAccess == nullptr) {
+    GELOGE(FAILED, "Failed to invoke HcomExecEnqueueRemoteAccess hcom unknown node function.");
     if (dlclose(context.handle_) != 0) {
       GELOGW("Failed to close handle %s", dlerror());
     }
@@ -228,15 +229,15 @@ Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do
 
   auto callback = [this](HcclResult status) {
     if (status != HCCL_SUCCESS) {
-      GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", status);
+      GELOGE(HCCL_E_INTERNAL, "Call HcomExecInitialize failed, ret: 0x%X", status);
     }
     std::lock_guard<std::mutex> lock(this->hccl_mutex_);
     this->cond_.notify_all();
     GELOGI("rdma callback success.");
   };
-  HcclResult hccl_ret = EnqueueRemoteAccess(context.GetNodeItem().NodeType(), addr_infos, callback);
+  HcclResult hccl_ret = HcomExecEnqueueRemoteAccess(context.GetNodeItem().NodeType(), addr_infos, callback);
   if (hccl_ret != HCCL_SUCCESS) {
-    GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", hccl_ret);
+    GELOGE(HCCL_E_INTERNAL, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret);
     return HCCL_E_INTERNAL;
   }
 
@@ -307,32 +308,32 @@ Status HcclNodeExecutor::Initialize() {
     GELOGE(GE_PLGMGR_SO_NOT_EXIST, "Failed in dlopen %s! ", dlerror());
     return FAILED;
   }
-  auto HcomExcutorInitialize = (HcclResult(*)())dlsym(handle_, "HcomExcutorInitialize");
-  if (HcomExcutorInitialize == nullptr) {
-    GELOGE(FAILED, "Failed to invoke HcomExcutorInitialize hcom unknown node function.");
+  auto HcomExecInitialize = (HcclResult(*)())dlsym(handle_, "HcomExecInitialize");
+  if (HcomExecInitialize == nullptr) {
+    GELOGE(FAILED, "Failed to invoke HcomExecInitialize hcom unknown node function.");
     return FAILED;
   }
-  HcclResult hccl_ret = HcomExcutorInitialize();
+  HcclResult hccl_ret = HcomExecInitialize();
   if (hccl_ret == HCCL_E_PTR) {
     GELOGI("Hccl comm is null, hcom executor initialize is not required.");
   } else if (hccl_ret == HCCL_SUCCESS) {
     GELOGI("Hcom executor initialize success.");
   } else {
-    GELOGE(FAILED, "Call HcomExcutorInitialize failed, ret: 0x%X", hccl_ret);
+    GELOGE(FAILED, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret);
     return FAILED;
   }
   return SUCCESS;
 }
 
 Status HcclNodeExecutor::Finalize() {
-  auto HcomExcutorFinalize = (HcclResult(*)())dlsym(handle_, "HcomExcutorFinalize");
-  if (HcomExcutorFinalize == nullptr) {
-    GELOGE(FAILED, "Failed to invoke HcomExcutorFinalize hcom unknown node function.");
+  auto HcomExecFinalize = (HcclResult(*)())dlsym(handle_, "HcomExecFinalize");
+  if (HcomExecFinalize == nullptr) {
+    GELOGE(FAILED, "Failed to invoke HcomExecFinalize hcom unknown node function.");
     return FAILED;
   }
-  HcclResult hccl_ret = HcomExcutorFinalize();
+  HcclResult hccl_ret = HcomExecFinalize();
   if (hccl_ret != HCCL_SUCCESS) {
-    GELOGE(FAILED, "Call HcomExcutorFinalize failed, ret: 0x%X", hccl_ret);
+    GELOGE(FAILED, "Call HcomExecFinalize failed, ret: 0x%X", hccl_ret);
     return FAILED;
   }
   // dlclose file handle
diff --git a/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc b/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc
index a61195b0..0cc635e4 100755
--- a/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc
+++ b/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc
@@ -18,6 +18,8 @@
 #include "hybrid/node_executor/host_cpu/kernel_factory.h"
 #include "graph/passes/folding_pass.h"
 #include "hybrid/model/hybrid_model.h"
+#include "graph/manager/graph_mem_allocator.h"
+#include "graph/manager/host_mem_allocator.h"
 #include "ge_local_engine/engine/host_cpu_engine.h"
 
 namespace ge {
@@ -50,15 +52,16 @@ Status CpuKernelNodeTask::Execute(TaskContext &context) {
     auto input_desc_ptr = context.GetInputDesc(i);
     GE_CHECK_NOTNULL(input_desc_ptr);
     const auto &input_desc = *input_desc_ptr;
-    GE_CHECK_NOTNULL(context.GetInput(i));
-    auto in_tensor = MakeShared<GeTensor>(input_desc,
-                                          reinterpret_cast<const uint8_t *>(context.GetInput(i)->GetData()),
-                                          context.GetInput(i)->GetSize());
+    auto tensor = context.GetInput(i);
+    GE_CHECK_NOTNULL(tensor);
+    auto item = MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).GetAlignedPtr(tensor->GetData());
+    GE_CHECK_NOTNULL(item.second);
+    auto in_tensor = MakeShared<GeTensor>(input_desc, item.second, item.first);
     GE_CHECK_NOTNULL(in_tensor);
     in_tensor->MutableTensorDesc().SetDataType(input_desc.GetDataType());
     in_tensor->MutableTensorDesc().SetShape(input_desc.GetShape());
     inputs.emplace_back(in_tensor);
-    GELOGI("node:%s allocate input %d, size=%zu", op_desc->GetName().c_str(), i, in_tensor->GetData().size());
+    GELOGD("node:%s allocate input %d, size=%zu", op_desc->GetName().c_str(), i, in_tensor->GetData().size());
   }
 
   std::vector<GeTensorPtr> outputs;
@@ -72,14 +75,14 @@ Status CpuKernelNodeTask::Execute(TaskContext &context) {
     }
     auto tensor = context.GetOutput(i);
     GE_CHECK_NOTNULL(tensor);
-    auto out_tensor = MakeShared<GeTensor>(output_desc,
-                                           reinterpret_cast<const uint8_t *>(tensor->GetData()),
-                                           tensor->GetSize());
+    auto item = MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).GetAlignedPtr(tensor->GetData());
+    GE_CHECK_NOTNULL(item.second);
+    auto out_tensor = MakeShared<GeTensor>(output_desc, item.second, item.first);
     GE_CHECK_NOTNULL(out_tensor);
     out_tensor->MutableTensorDesc().SetDataType(output_desc.GetDataType());
     out_tensor->MutableTensorDesc().SetShape(output_desc.GetShape());
     outputs.emplace_back(out_tensor);
-    GELOGI("node:%s allocate output %d, size=%zu", op_desc->GetName().c_str(), i, out_tensor->GetData().size());
+    GELOGD("node:%s allocate output %d, size=%zu", op_desc->GetName().c_str(), i, out_tensor->GetData().size());
   }
 
   return HostCpuEngine::GetInstance().Run(node_, inputs, outputs);
diff --git a/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc b/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc
index 3bf71013..01fd391d 100644
--- a/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc
+++ b/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc
@@ -20,7 +20,6 @@
 #include "hybrid/node_executor/host_cpu/kernel_factory.h"
 
 namespace {
-const size_t kAssignInputNum = 2;
 const size_t kAssignRefInputIndex = 0;
 const size_t kAssignValueInputIndex = 1;
 const size_t kAssignRefOutputIndex = 0;
diff --git a/ge/hybrid/node_executor/node_executor.cc b/ge/hybrid/node_executor/node_executor.cc
index e577f09b..02427b91 100755
--- a/ge/hybrid/node_executor/node_executor.cc
+++ b/ge/hybrid/node_executor/node_executor.cc
@@ -20,6 +20,7 @@
 #include "graph/utils/node_utils.h"
 #include "init/gelib.h"
 #include "graph/utils/tensor_utils.h"
+#include "hybrid/executor/hybrid_execution_context.h"
 #include "hybrid/model/hybrid_model.h"
 #include "graph/debug/ge_attr_define.h"
 #include "opskernel_manager/ops_kernel_builder_manager.h"
@@ -34,7 +35,6 @@ const char *const kEngineNameAiCpuTf = "aicpu_tf_kernel";
 const char *const kEngineNameHccl = "ops_kernel_info_hccl";
 const char *const kEngineNameRts = "DNN_VM_RTS_OP_STORE";
 const char *const kEngineNameHostCpu = "DNN_VM_HOST_CPU_OP_STORE";
-const char *const kOwnerGraphIsUnknown = "OwnerGraphIsUnknown";
 }
 Status NodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const {
   GE_CHK_STATUS_RET_NOLOG(context.AllocateOutputs());
@@ -45,9 +45,9 @@ Status NodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const {
 }
 
 Status NodeExecutor::ExecuteTask(NodeTask &task, TaskContext &context, const std::function<void()> &callback) const {
-  GE_CHK_STATUS_RET(task.ExecuteAsync(context, callback),
-                    "Failed to execute task. node = %s",
-                    context.GetNodeItem().NodeName().c_str());
+  HYBRID_CHK_STATUS_RET(task.ExecuteAsync(context, callback),
+                        "Failed to execute task. node = %s",
+                        context.GetNodeItem().NodeName().c_str());
   return SUCCESS;
 }
 
@@ -239,5 +239,13 @@ NodeExecutorRegistrar::NodeExecutorRegistrar(NodeExecutorManager::ExecutorType e
                                              NodeExecutor *(*builder)()) {
   NodeExecutorManager::GetInstance().RegisterExecutorBuilder(executor_type, builder);
 }
+Status NoOpTask::UpdateArgs(TaskContext &context) {
+  GELOGD("[%s] Skipping UpdateArgs for op with empty outputs", context.GetNodeName());
+  return SUCCESS;
+}
+Status NoOpTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
+  GELOGD("[%s] Skipping execution for op with empty outputs", context.GetNodeName());
+  return context.TryExecuteCallback(done_callback);
+}
 }  // namespace hybrid
 }  // namespace ge
diff --git a/ge/hybrid/node_executor/node_executor.h b/ge/hybrid/node_executor/node_executor.h
index c2d32250..17ccc012 100644
--- a/ge/hybrid/node_executor/node_executor.h
+++ b/ge/hybrid/node_executor/node_executor.h
@@ -75,6 +75,12 @@ class NodeTask {
   virtual Status ExecuteAsync(TaskContext &context, std::function<void()> done_callback) = 0;
 };
 
+class NoOpTask : public NodeTask {
+ public:
+  Status UpdateArgs(TaskContext &context) override;
+  Status ExecuteAsync(TaskContext &context, std::function<void()> done_callback) override;
+};
+
 // Node executor
 class NodeExecutor {
  public:
diff --git a/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.h b/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.h
index 9ea544a1..73873002 100644
--- a/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.h
+++ b/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.h
@@ -41,7 +41,6 @@ class PartitionedCallNodeTask : public NodeTask {
 
   const GraphItem *graph_item_;
   std::unique_ptr<SubgraphExecutor> subgraph_executor_;
-  GraphExecutionContext *context_ = nullptr;
 };
 
 class PartitionedCallNodeExecutor : public NodeExecutor {
diff --git a/ge/hybrid/node_executor/rts/rts_node_executor.cc b/ge/hybrid/node_executor/rts/rts_node_executor.cc
index 18b875fd..90b623e0 100644
--- a/ge/hybrid/node_executor/rts/rts_node_executor.cc
+++ b/ge/hybrid/node_executor/rts/rts_node_executor.cc
@@ -18,6 +18,7 @@
 #include "common/debug/log.h"
 #include "common/ge/ge_util.h"
 #include "graph/utils/tensor_utils.h"
+#include "hybrid/model/hybrid_model.h"
 #include "runtime/rt.h"
 
 namespace ge {
@@ -79,12 +80,44 @@ Status IdentityNNodeTask::ExecuteAsync(TaskContext &context, std::function<void(
   return SUCCESS;
 }
 
+Status ProfilingTraceNodeTask::UpdateArgs(TaskContext &context) {
+  return SUCCESS;
+}
+
+Status ProfilingTraceNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
+  for (const auto &task_def : task_defs_) {
+    auto log_time_stamp_def = task_def.log_timestamp();
+    uint64_t log_id = log_time_stamp_def.logid();
+    bool notify = log_time_stamp_def.notify();
+    uint32_t flat = log_time_stamp_def.flat();
+
+    GELOGD("ProfilingTraceTask execute async start. logid = %lu, notify = %d.", log_id, notify);
+    rtError_t rt_ret = rtProfilerTrace(log_id, notify, flat, context.GetStream());
+    if (rt_ret != RT_ERROR_NONE) {
+      GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
+      return RT_ERROR_TO_GE_STATUS(rt_ret);
+    }
+    GELOGD("[%s] ProfilingTraceTask[%lu] execute success.", context.GetNodeName(), log_id);
+  }
+
+  return SUCCESS;
+};
+
 Status RtsNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node, shared_ptr<NodeTask> &task) const {
+  GE_CHECK_NOTNULL(node);
+
   auto op_type = node->GetType();
   if (op_type == IDENTITY) {
     task = MakeShared<IdentityNodeTask>();
   } else if (op_type == IDENTITYN) {
     task = MakeShared<IdentityNNodeTask>();
+  } else if (op_type == PROFILINGTRAININGTRACE) {
+    auto *task_defs = model.GetTaskDefs(node);
+    if (task_defs == nullptr || task_defs->empty()) {
+      GELOGE(INTERNAL_ERROR, "Profiling node has no task to execute.");
+      return INTERNAL_ERROR;
+    }
+    task = MakeShared<ProfilingTraceNodeTask>(*task_defs);
   } else {
     GELOGE(INTERNAL_ERROR, "[%s] Unsupported RTS op type: %s", node->GetName().c_str(), op_type.c_str());
     return INTERNAL_ERROR;
diff --git a/ge/hybrid/node_executor/rts/rts_node_executor.h b/ge/hybrid/node_executor/rts/rts_node_executor.h
index 2576b73b..df487d6c 100644
--- a/ge/hybrid/node_executor/rts/rts_node_executor.h
+++ b/ge/hybrid/node_executor/rts/rts_node_executor.h
@@ -18,6 +18,7 @@
 #define GE_HYBRID_NODE_EXECUTOR_RTS_RTS_NODE_EXECUTOR_H_
 
 #include "hybrid/node_executor/node_executor.h"
+#include "proto/task.pb.h"
 
 namespace ge {
 namespace hybrid {
@@ -35,6 +36,18 @@ class IdentityNNodeTask : public IdentityNodeTask {
   Status ExecuteAsync(TaskContext &context, std::function<void()> done_callback) override;
 };
 
+class ProfilingTraceNodeTask :  public NodeTask {
+ public:
+  explicit ProfilingTraceNodeTask(const std::vector<domi::TaskDef> &task_defs) : task_defs_(task_defs) {}
+  ~ProfilingTraceNodeTask() override = default;
+
+  Status UpdateArgs(TaskContext &context) override;
+  Status ExecuteAsync(TaskContext &context, std::function<void()> done_callback) override;
+
+ private:
+  std::vector<domi::TaskDef> task_defs_;
+};
+
 class RtsNodeExecutor : public NodeExecutor {
  public:
   Status LoadTask(const HybridModel &model, const NodePtr &node, shared_ptr<NodeTask> &task) const override;
diff --git a/ge/hybrid/node_executor/task_context.cc b/ge/hybrid/node_executor/task_context.cc
index b7152878..6488fbbe 100644
--- a/ge/hybrid/node_executor/task_context.cc
+++ b/ge/hybrid/node_executor/task_context.cc
@@ -148,6 +148,10 @@ Status TaskContext::AllocateWorkspaces() {
 }
 
 Status TaskContext::RegisterCallback(const std::function<void()> &callback_fun) const {
+  if (callback_fun == nullptr) {
+    GELOGW("[%s] Callback is NULL", GetNodeName());
+    return SUCCESS;
+  }
   auto ret = execution_context_->callback_manager->RegisterCallback(callback_fun);
   if (ret != SUCCESS) {
     GELOGE(ret, "[%s] Failed to register callback", GetNodeName());
@@ -233,9 +237,7 @@ Status TaskContext::AllocateOutput(int index,
       } else {
         GE_CHK_STATUS_RET_NOLOG(AllocateTensor(tensor_desc, outputs_start_[index], attr));
         GELOGD("Allocating output successfully. node: %s. index = %d, size = %zu",
-              node_item_->NodeName().c_str(),
-              index,
-              outputs_start_[index].GetSize());
+               node_item_->NodeName().c_str(), index, outputs_start_[index].GetSize());
       }
     }
   }
@@ -317,6 +319,22 @@ void TaskContext::SetStatus(Status status) {
   }
 }
 
+uint32_t TaskContext::GetTaskId() const {
+  return task_id_;
+}
+
+void TaskContext::SetTaskId(uint32_t task_id) {
+  task_id_ = task_id;
+}
+
+uint32_t TaskContext::GetStreamId() const {
+  return stream_id_;
+}
+
+void TaskContext::SetStreamId(uint32_t stream_id) {
+  stream_id_ = stream_id;
+}
+
 Status TaskContext::AllocateWorkspace(size_t size, void **buffer, void *ori_addr) {
   GE_CHECK_NOTNULL(buffer);
   if (ori_addr == nullptr) {
@@ -386,6 +404,20 @@ const char *TaskContext::GetNodeName() const {
   return node_item_->NodeName().c_str();
 }
 
+void TaskContext::ReleaseInputsAndOutputs() {
+  for (int i = 0; i < node_item_->num_inputs; ++i) {
+    auto tensor = inputs_start_ + i;
+    tensor->Destroy();
+    GELOGD("[%s] Tensor of input[%d] released", GetNodeName(), i);
+  }
+
+  for (int i = 0; i < node_item_->num_outputs; ++i) {
+    auto tensor = outputs_start_ + i;
+    tensor->Destroy();
+    GELOGD("[%s] Tensor of output[%d] released", GetNodeName(), i);
+  }
+}
+
 void TaskContext::ReleaseInput(int index) {
   auto input_tensor = MutableInput(index);
   if (input_tensor != nullptr) {
@@ -458,5 +490,13 @@ Status TaskContext::TryExecuteCallback(const function<void()> &callback_fun) con
 const DumpProperties &TaskContext::GetDumpProperties() const {
   return execution_context_->dump_properties;
 }
+
+bool TaskContext::NeedCallback() {
+  return node_item_->has_observer || IsDumpEnabled() || execution_context_->profiling_level > 0;
+}
+
+Status TaskContext::Synchronize() {
+  return execution_context_->Synchronize(GetStream());
+}
 }  // namespace hybrid
 }  // namespace ge
diff --git a/ge/hybrid/node_executor/task_context.h b/ge/hybrid/node_executor/task_context.h
index 2cff0536..6a4bcb8c 100644
--- a/ge/hybrid/node_executor/task_context.h
+++ b/ge/hybrid/node_executor/task_context.h
@@ -29,7 +29,7 @@
 
 namespace ge {
 namespace hybrid {
-class GraphExecutionContext;
+struct GraphExecutionContext;
 class SubgraphContext;
 
 class TaskContext {
@@ -50,6 +50,8 @@ class TaskContext {
   ConstGeTensorDescPtr GetOutputDesc(int index) const;
   GeTensorDescPtr MutableInputDesc(int index) const;
   GeTensorDescPtr MutableOutputDesc(int index) const;
+  void ReleaseInputsAndOutputs();
+  bool NeedCallback();
   void ReleaseInput(int index);
   const TensorValue *GetInput(int index) const;
   const TensorValue *GetOutput(int index) const;
@@ -94,6 +96,14 @@ class TaskContext {
 
   void SetStatus(Status status);
 
+  uint32_t GetTaskId() const;
+  void SetTaskId(uint32_t task_id);
+
+  uint32_t GetStreamId() const;
+  void SetStreamId(uint32_t stream_id);
+
+  Status Synchronize();
+
   bool IsForceInferShape() const;
   void SetForceInferShape(bool force_infer_shape);
   void *handle_ = nullptr;
@@ -115,6 +125,8 @@ class TaskContext {
   Status status_ = SUCCESS;
   std::vector<void *> workspaces_;
   uint64_t iteration_ = 0;
+  uint32_t task_id_ = 0;
+  uint32_t stream_id_ = 0;
 };
 }  // namespace hybrid
 }  // namespace ge
diff --git a/ge/init/gelib.cc b/ge/init/gelib.cc
index 306a804a..b81632bd 100755
--- a/ge/init/gelib.cc
+++ b/ge/init/gelib.cc
@@ -53,9 +53,6 @@ const int kDecimal = 10;
 const int kSocVersionLen = 50;
 const int kDefaultDeviceIdForTrain = 0;
 const int kDefaultDeviceIdForInfer = -1;
-const uint32_t kAicoreOverflow = (0x1 << 0);
-const uint32_t kAtomicOverflow = (0x1 << 1);
-const uint32_t kAllOverflow = (kAicoreOverflow | kAtomicOverflow);
 const char *const kGlobalOptionFpCeilingModeDefault = "2";
 }  // namespace
 static std::shared_ptr<GELib> instancePtr_ = nullptr;
@@ -485,11 +482,9 @@ Status GELib::Finalize() {
 void GELib::ShutDownProfiling() {
   std::lock_guard<std::mutex> lock(status_mutex_);
 
-  if (!ProfilingManager::Instance().ProfilingOpTraceOn() && ProfilingManager::Instance().ProfilingOn()) {
-    ProfilingManager::Instance().StopProfiling();
-  }
   if (ProfilingManager::Instance().ProfilingOn()) {
-    ProfilingManager::Instance().PluginUnInit(GE_PROFILING_MODULE);
+    ProfilingManager::Instance().StopProfiling();
+    ProfilingManager::Instance().PluginUnInit();
   }
 }
 
diff --git a/ge/ir_build/atc_ir_common.cc b/ge/ir_build/atc_ir_common.cc
index 2a77e386..5b82f8f2 100755
--- a/ge/ir_build/atc_ir_common.cc
+++ b/ge/ir_build/atc_ir_common.cc
@@ -51,6 +51,7 @@ const char *const kDigitError = "is not digit";
 const char *const kCompressWeightError = "it must be appointed when appoint parameter[--optypelist_for_implmode]";
 const char *const kSelectImplmodeError = "only support high_performance, high_precision";
 const char *const kDynamicBatchSizeError = "It can only contains digit, \",\", \" \"";
+const char *const kKeepDtypeError = "file not found";
 
 vector<string> SplitInputShape(const std::string &input_shape) {
   vector<string> shape_pair_vec;
@@ -63,6 +64,19 @@ vector<string> SplitInputShape(const std::string &input_shape) {
 }
 }  // namespace
 
+Status CheckInputFormat(const string &input_format) {
+  if (input_format.empty()) {
+    return ge::SUCCESS;
+  }
+  if (!ge::TypeUtils::IsFormatValid(input_format.c_str())) {
+    ErrorManager::GetInstance().ATCReportErrMessage(
+      "E10001", {"parameter", "value", "reason"}, {"--input_format", input_format, "input format is invalid!"});
+    GELOGE(ge::PARAM_INVALID, "input format [%s] is invalid!", input_format.c_str());
+    return ge::PARAM_INVALID;
+  }
+  return ge::SUCCESS;
+}
+
 bool CheckDynamicBatchSizeInputShapeValid(unordered_map<string, vector<int64_t>> shape_map,
                                           std::string &dynamic_batch_size) {
   int32_t size = 0;
@@ -426,6 +440,17 @@ Status CheckCompressWeightParamValid(const std::string enable_compress_weight, c
   return ge::SUCCESS;
 }
 
+Status CheckKeepTypeParamValid(const std::string &keep_dtype) {
+  if ((!keep_dtype.empty()) && (!CheckInputPathValid(keep_dtype, "--keep_dtype"))) {
+    ErrorManager::GetInstance().ATCReportErrMessage(
+        "E10001", {"parameter", "value", "reason"}, {"--keep_dtype", keep_dtype, kKeepDtypeError});
+    GELOGE(ge::PARAM_INVALID, "keep dtype config file not found, file_name:%s", keep_dtype.c_str());
+    return ge::PARAM_INVALID;
+  }
+
+  return ge::SUCCESS;
+}
+
 int CheckLogParamValidAndSetLogLevel(const std::string log) {
   int ret = -1;
   if (log == "default") {
diff --git a/ge/ir_build/atc_ir_common.h b/ge/ir_build/atc_ir_common.h
index 47361167..2580a206 100644
--- a/ge/ir_build/atc_ir_common.h
+++ b/ge/ir_build/atc_ir_common.h
@@ -32,9 +32,6 @@ namespace ge {
 static std::set<std::string> caffe_support_input_format = {"NCHW", "ND"};
 static std::set<std::string> tf_support_input_format = {"NCHW", "NHWC", "ND", "NCDHW", "NDHWC"};
 static std::set<std::string> onnx_support_input_format = {"NCHW", "ND"};
-static const char *const kCaffeFormatSupport = "only support NCHW, ND in Caffe model";
-static const char *const kTFFormatSupport = "only support NCHW, NHWC, ND, NCDHW, NDHWC in TF model";
-static const char *const kONNXFormatSupport = "only support NCHW, ND in ONNX model";
 
 static std::map<std::string, domiTensorFormat_t> input_format_str_to_geformat = {
     {"ND", domi::DOMI_TENSOR_ND},
@@ -75,6 +72,8 @@ Status CheckInsertOpConfParamValid(const std::string insert_op_conf);
 Status CheckDisableReuseMemoryParamValid(const std::string disable_reuse_memory);
 Status CheckEnableSingleStreamParamValid(const std::string enable_single_stream);
 Status CheckImplmodeParamValid(const std::string &optypelist_for_implmode, std::string &op_select_implmode);
+Status CheckInputFormat(const string &input_format);
+Status CheckKeepTypeParamValid(const std::string &keep_dtype);
 void PrintOptionMap(std::map<std::string, std::string> &options, std::string tips);
 void EraseEndSemicolon(std::string &param);
 }
diff --git a/ge/ir_build/ge_ir_build.cc b/ge/ir_build/ge_ir_build.cc
index 74aa6a60..3d00ff7f 100644
--- a/ge/ir_build/ge_ir_build.cc
+++ b/ge/ir_build/ge_ir_build.cc
@@ -36,6 +36,9 @@
 #include "model/ge_model.h"
 #include "graph/shape_refiner.h"
 #include "graph/opsproto_manager.h"
+#include "inc/pass_manager.h"
+#include "graph/passes/net_output_pass.h"
+#include "graph/passes/data_pass.h"
 
 using std::string;
 using namespace std;
@@ -49,6 +52,8 @@ const std::string IR_OPTION_LOG_LEVEL_DEFAULT = "default";
 const std::string IR_OPTION_BUFFER_OPTIMIZE_DEFAULT = "l2_optimize";
 const std::string IR_OPTION_DISABLE_REUSE_MEMORY_DEFAULT = "0";
 const std::string IR_OPTION_ENABLE_COMPRESS_WEIGHT_DEFAULT = "false";
+const std::string kInputShape = "input_shape";
+const std::string kInputFormat = "input_format";
 }  // namespace
 
 static graphStatus CheckGlobalOptions(std::map<std::string, std::string> &global_options) {
@@ -225,11 +230,13 @@ class Impl {
   ~Impl() { (void)generator_.Finalize(); };
   graphStatus CheckOptions(const std::map<std::string, std::string> &options);
   graphStatus CreateInputsForIRBuild(const ge::Graph &graph, vector<ge::GeTensor> &inputs);
-  graphStatus Init(const std::map<std::string, std::string> &options);
+  graphStatus UpdateDataOpAttr(const Graph &graph);
+  graphStatus Init(const Graph &graph, const std::map<std::string, std::string> &options);
   graphStatus BuildModel(const Graph &graph, const std::map<std::string, std::string> &options,
                          ModelBufferData &ge_models);
   graphStatus InitDomiOmgContext(const string &input_shape, const string &input_format, const string &net_format,
                                  bool is_dynamic_input);
+  static graphStatus InferShapePrepare(const ComputeGraphPtr &compute_graph);
   void SetRtSocVersion();
   void UpdateThreadContext();
   void LoadOpsProto();
@@ -240,6 +247,56 @@ class Impl {
   OmgContext omg_context_;
 };
 
+graphStatus Impl::InferShapePrepare(const ComputeGraphPtr &compute_graph) {
+  GE_CHECK_NOTNULL(compute_graph);
+
+  PassManager prepare_infershape;
+  prepare_infershape.AddPass("PrepareNetoutput", new(std::nothrow) NetOutputPass);
+  prepare_infershape.AddPass("PrepareSubGraphReflection", new (std::nothrow) DataPass);
+
+  auto ret = prepare_infershape.Run(compute_graph);
+  if ((ret != SUCCESS) && (ret != NOT_CHANGED)) {
+    GELOGE(ret, "Prepair for infershape failed, ret:%d", ret);
+    return ret;
+  }
+  GELOGD("Prepair for infershape success!");
+  return GRAPH_SUCCESS;
+}
+
+graphStatus Impl::UpdateDataOpAttr(const Graph &graph) {
+  GELOGD("Enter Update Data Attr Process!");
+  if (options_.find(kInputShape) == options_.end()) {
+    return GRAPH_SUCCESS;
+  }
+  unordered_map<string, vector<int64_t>> shape_map;
+  vector<pair<string, vector<int64_t>>> user_shape_map;
+  GE_CHK_BOOL_EXEC(ParseInputShape(options_[kInputShape], shape_map, user_shape_map, true),
+    return GRAPH_PARAM_INVALID, "parse input shape failed!");
+  auto compute_graph = ge::GraphUtils::GetComputeGraph(graph);
+  GE_CHECK_NOTNULL(compute_graph);
+  for (ge::NodePtr &input_node : compute_graph->GetDirectNode()) {
+    GE_CHECK_NOTNULL(input_node);
+    ge::OpDescPtr op = input_node->GetOpDesc();
+    GE_CHECK_NOTNULL(op);
+    if (op->GetType() == DATA) {
+      auto tensor_input = op->MutableInputDesc(0);
+      auto tensor_output = op->MutableOutputDesc(0);
+      GE_CHECK_NOTNULL(tensor_input);
+      GE_CHECK_NOTNULL(tensor_output);
+      string data_op_name = op->GetName();
+      auto iter = shape_map.find(data_op_name);
+      if (iter != shape_map.end()) {
+        tensor_input->SetShape(ge::GeShape(iter->second));
+        tensor_output->SetShape(ge::GeShape(iter->second));
+        GELOGD("update input [%s] shape info", data_op_name.c_str());
+      } else {
+        GELOGI("no need update input [%s] attr because not found from input_shape.", data_op_name.c_str());
+      }
+    }
+  }
+  return GRAPH_SUCCESS;
+}
+
 graphStatus Impl::CheckOptions(const std::map<std::string, std::string> &options) {
   for (auto &ele : options) {
     auto it = ge::ir_option::ir_builder_suppported_options.find(ele.first);
@@ -275,17 +332,29 @@ graphStatus Impl::CheckOptions(const std::map<std::string, std::string> &options
       return GRAPH_PARAM_INVALID;
     }
   }
+  // Check option EXEC_DISABLE_REUSED_MEMORY
+  it = options_.find(ge::ir_option::EXEC_DISABLE_REUSED_MEMORY);
+  if (it != options_.end() && (CheckDisableReuseMemoryParamValid(it->second) != GRAPH_SUCCESS)) {
+    return GRAPH_PARAM_INVALID;
+  }
+  // Check Input Format
+  if (options_.find(kInputFormat) != options_.end()) {
+    return CheckInputFormat(options_[kInputFormat]);
+  }
   return GRAPH_SUCCESS;
 }
 
-graphStatus Impl::Init(const std::map<std::string, std::string> &options) {
+graphStatus Impl::Init(const Graph &graph, const std::map<std::string, std::string> &options) {
   // 1. check options
   graphStatus ret = CheckOptions(options);
   if (ret != GRAPH_SUCCESS) {
     GELOGE(ret, "User input options are illegal! Please check!");
     return ret;
   }
-
+  ret = UpdateDataOpAttr(graph);
+  if (ret != GRAPH_SUCCESS) {
+    return ret;
+  }
   std::string build_mode = (options_.find(BUILD_MODE) == options_.end() || options_[BUILD_MODE] == BUILD_MODE_NORMAL)
                            ? "" : options_[BUILD_MODE];
   options_[BUILD_MODE] = build_mode;
@@ -416,7 +485,7 @@ graphStatus Impl::CreateInputsForIRBuild(const ge::Graph &graph, vector<ge::GeTe
 graphStatus Impl::BuildModel(const Graph &graph, const std::map<std::string, std::string> &options,
                              ModelBufferData &model) {
   // 1. init GeGenerator with user optios
-  graphStatus ret = Init(options);
+  graphStatus ret = Init(graph, options);
   if (ret != GRAPH_SUCCESS) {
     GELOGE(ret, "Build ir model Init failed!");
     return ret;
@@ -502,7 +571,7 @@ graphStatus aclgrphSaveModel(const string &output_file, const ModelBufferData &m
     GELOGE(GRAPH_PARAM_INVALID, "input model is illegal");
     return GRAPH_PARAM_INVALID;
   }
-  return FileSaver::SaveToFile((output_file + ".om"), reinterpret_cast<void*>(model.data.get()),
+  return FileSaver::SaveToFile((output_file + ".om"), reinterpret_cast<void *>(model.data.get()),
                                static_cast<uint32_t>(model.length));
 }
 
@@ -517,7 +586,7 @@ graphStatus aclgrphSaveModel(const char *output_file, const ModelBufferData &mod
     return GRAPH_PARAM_INVALID;
   }
   std::string str_output_file = output_file;
-  return FileSaver::SaveToFile((str_output_file + ".om"), reinterpret_cast<void*>(model.data.get()),
+  return FileSaver::SaveToFile((str_output_file + ".om"), reinterpret_cast<void *>(model.data.get()),
                                static_cast<uint32_t>(model.length));
 }
 
@@ -532,42 +601,6 @@ graphStatus aclgrphGetIRVersion(int *major_version, int *minor_version, int *pat
   return GRAPH_SUCCESS;
 }
 
-graphStatus aclgrphInferShapeAndType(ge::Graph &graph) {
-  auto compute_graph = GraphUtils::GetComputeGraph(graph);
-  GE_CHECK_NOTNULL(compute_graph);
-
-  auto root_graph = compute_graph->GetParentGraph();
-  if (root_graph != nullptr) {
-    GELOGE(GRAPH_PARAM_INVALID, "Input param should not be subgraph");
-    return GRAPH_PARAM_INVALID;
-  }
-
-  auto ret = compute_graph->TopologicalSorting();
-  if(ret != GRAPH_SUCCESS) {
-    GELOGE(ret, "Acl topo logical sort failed.");
-    return ret;
-  }
-
-  ret = compute_graph->InferOriginFormat();
-  if (ret != GRAPH_SUCCESS) {
-    GELOGE(ret, "Acl InferOriginFormat failed.");
-    return ret;
-  }
-
-  for (auto &node: compute_graph->GetAllNodes()) {
-    graphStatus ret = ShapeRefiner::InferShapeAndType(node);
-    if (ret == GRAPH_PARAM_INVALID) {
-      GELOGW("Can not find infershape func.");
-      continue;
-    } else if (ret != GRAPH_SUCCESS) {
-      GELOGE(ret, "Acl infershape failed.");
-      return ret;
-    }
-  }
-
-  return GRAPH_SUCCESS;
-}
-
 graphStatus aclgrphDumpGraph(const ge::Graph &graph, const char *file, const size_t len) {
   GE_CHECK_NOTNULL(file);
 
@@ -622,4 +655,52 @@ graphStatus aclgrphDumpGraph(const ge::Graph &graph, const char *file, const siz
   return GRAPH_SUCCESS;
 }
 
+graphStatus aclgrphGenerateForOp(const AscendString &op_type, const vector<TensorDesc> &inputs,
+                                 const vector<TensorDesc> &outputs, Graph &graph) {
+  auto op_type_str = std::string(op_type.GetString());
+  auto op_name = op_type_str + "_" + std::to_string(ge::GetCurrentTimestamp());
+  auto op_desc = ge::MakeShared<ge::OpDesc>(op_name, op_type_str);
+  GE_CHECK_NOTNULL(op_desc);
+
+  // convert input tensordesc to getensor
+  std::vector<ge::GeTensor> input_tensors;
+  for (const auto &input : inputs) {
+    ge::GeTensorDesc tensor_desc(ge::GeShape(input.GetShape().GetDims()), input.GetFormat(), input.GetDataType());
+
+    tensor_desc.SetOriginFormat(input.GetFormat());
+    ge::TensorUtils::SetRealDimCnt(tensor_desc, static_cast<uint32_t>(input.GetShape().GetDims().size()));
+    ge::TensorUtils::SetInputTensor(tensor_desc, true);
+    ge::TensorUtils::SetOutputTensor(tensor_desc, false);
+
+    if (op_desc->AddInputDesc(tensor_desc) != ge::GRAPH_SUCCESS) {
+      GELOGE(ge::FAILED, "AddInputDesc fail.");
+      return ge::FAILED;
+    }
+    input_tensors.emplace_back(tensor_desc);
+  }
+
+  // convert output tensordesc to getensor
+  std::vector<ge::GeTensor> output_tensors;
+  for (const auto &output : outputs) {
+    ge::GeTensorDesc tensor_desc(ge::GeShape(output.GetShape().GetDims()), output.GetFormat(), output.GetDataType());
+
+    tensor_desc.SetOriginFormat(output.GetFormat());
+    ge::TensorUtils::SetRealDimCnt(tensor_desc, static_cast<uint32_t>(output.GetShape().GetDims().size()));
+    ge::TensorUtils::SetInputTensor(tensor_desc, false);
+    ge::TensorUtils::SetOutputTensor(tensor_desc, true);
+
+    (void)op_desc->AddOutputDesc(tensor_desc);
+    output_tensors.emplace_back(tensor_desc);
+  }
+
+  // call api to get graph
+  ge::GeGenerator generator;
+  std::string graph_name = ge::CurrentTimeInStr() + "_graph";
+  if (generator.BuildSingleOpGraph(op_desc, input_tensors, output_tensors, graph_name, graph) != ge::SUCCESS) {
+    GELOGE(GRAPH_FAILED, "make graph fail.");
+    return GRAPH_FAILED;
+  }
+  return GRAPH_SUCCESS;
+}
+
 }  // namespace ge
diff --git a/ge/model/ge_root_model.h b/ge/model/ge_root_model.h
index 53174064..aa5a4d47 100755
--- a/ge/model/ge_root_model.h
+++ b/ge/model/ge_root_model.h
@@ -23,6 +23,7 @@
 namespace ge {
 class GeRootModel {
  public:
+  GeRootModel() = default;
   explicit GeRootModel(ComputeGraphPtr &root_graph) : root_graph_(root_graph), model_id_(INVALID_MODEL_ID) {};
   ~GeRootModel() = default;
 
@@ -35,11 +36,11 @@ class GeRootModel {
   void SetModelId(uint32_t model_id) { model_id_ = model_id; }
   uint32_t GetModelId() const { return model_id_; }
   Status CheckIsUnknownShape(bool &is_dynamic_shape);
-
+  void SetRootGraph(ComputeGraphPtr graph) { root_graph_ = graph; }
  private:
-  ComputeGraphPtr root_graph_;
+  ComputeGraphPtr root_graph_ = nullptr;
   std::map<std::string, GeModelPtr> subgraph_instance_name_to_model_;
-  uint32_t model_id_;
+  uint32_t model_id_ = 0;
 };
 }  // namespace ge
 using GeRootModelPtr = std::shared_ptr<ge::GeRootModel>;
diff --git a/ge/offline/CMakeLists.txt b/ge/offline/CMakeLists.txt
index 49af37c0..3f8d43dc 100644
--- a/ge/offline/CMakeLists.txt
+++ b/ge/offline/CMakeLists.txt
@@ -10,26 +10,29 @@ protobuf_generate(ge PROTO_SRCS PROTO_HDRS ${PROTO_LIST})
 set(SRC_LIST
     "main.cc"
     "single_op_parser.cc"
+    "keep_dtype_option.cc"
     "../session/omg.cc"
-    "../ir_build/atc_ir_common.cc" 
+    "../ir_build/atc_ir_common.cc"
 )
 
-############ atc ############
-add_executable(atc ${SRC_LIST} ${PROTO_HDRS})
+############ atc_atc.bin ############
+add_executable(atc_atc.bin ${SRC_LIST} ${PROTO_HDRS})
 
-target_compile_options(atc PRIVATE 
+target_compile_options(atc_atc.bin PRIVATE
     -Werror
     -O2
     -Wno-deprecated-declarations
+    -fno-common
 )
 
-target_compile_definitions(atc PRIVATE
+target_compile_definitions(atc_atc.bin PRIVATE
     PROTOBUF_INLINE_NOT_IN_HEADERS=0
     COMPILE_OMG_PACKAGE
     google=ascend_private
+    LOG_CPP
 )
 
-target_include_directories(atc PRIVATE
+target_include_directories(atc_atc.bin PRIVATE
     ${CMAKE_CURRENT_LIST_DIR}
     ${GE_CODE_DIR}
     ${GE_CODE_DIR}/ge
@@ -55,7 +58,7 @@ target_include_directories(atc PRIVATE
     ${GE_CODE_DIR}/third_party/fwkacllib/inc/toolchain
 )
 
-target_link_libraries(atc PRIVATE
+target_link_libraries(atc_atc.bin PRIVATE
     $<BUILD_INTERFACE:intf_pub>
     ascend_protobuf
     ge_common
@@ -74,10 +77,86 @@ target_link_libraries(atc PRIVATE
     -ldl
 )
 
+set_target_properties(atc_atc.bin PROPERTIES
+    OUTPUT_NAME atc.bin
+    RUNTIME_OUTPUT_DIRECTORY atclib
+)
+
+############ fwk_atc.bin ############
+add_executable(fwk_atc.bin ${SRC_LIST} ${PROTO_HDRS})
+
+target_compile_options(fwk_atc.bin PRIVATE
+    -Werror
+    -O2
+    -Wno-deprecated-declarations
+    -fno-common
+)
+
+target_compile_definitions(fwk_atc.bin PRIVATE
+    PROTOBUF_INLINE_NOT_IN_HEADERS=0
+    COMPILE_OMG_PACKAGE
+    google=ascend_private
+    LOG_CPP
+)
+
+target_include_directories(fwk_atc.bin PRIVATE
+    ${CMAKE_CURRENT_LIST_DIR}
+    ${GE_CODE_DIR}
+    ${GE_CODE_DIR}/ge
+    ${GE_CODE_DIR}/inc/external
+    ${GE_CODE_DIR}/common/inc/external
+    ${GE_CODE_DIR}/common/inc/external/graph
+    ${GE_CODE_DIR}/inc
+    ${GE_CODE_DIR}/inc/framework
+    ${METADEF_DIR}/inc
+    ${METADEF_DIR}/inc/graph
+    ${METADEF_DIR}/inc/register
+    ${METADEF_DIR}/inc/external
+    ${METADEF_DIR}/inc/external/graph
+    ${METADEF_DIR}/inc/external/register
+    ${PARSER_DIR}
+    ${CMAKE_BINARY_DIR}
+    ${CMAKE_BINARY_DIR}/proto/ge
+    #### yellow zone ####
+    ${GE_CODE_DIR}/../inc
+    ${GE_CODE_DIR}/../inc/common
+    #### blue zone ####
+    ${GE_CODE_DIR}/third_party/fwkacllib/inc
+    ${GE_CODE_DIR}/third_party/fwkacllib/inc/toolchain
+)
+
+target_link_libraries(fwk_atc.bin PRIVATE
+    $<BUILD_INTERFACE:intf_pub>
+    ascend_protobuf
+    ge_common
+    register
+    c_sec
+    graph
+    error_manager
+    ge_runner
+    parser_common
+    gflags
+    json
+    runtime
+    slog
+    static_mmpa
+    -lrt
+    -ldl
+)
+
+set_target_properties(fwk_atc.bin PROPERTIES
+    OUTPUT_NAME atc.bin
+    RUNTIME_OUTPUT_DIRECTORY fwkacl
+)
+
 ############ install ############
 set(INSTALL_BASE_DIR "")
 set(INSTALL_LIBRARY_DIR lib)
 
-install(TARGETS atc OPTIONAL
-    LIBRARY DESTINATION ${INSTALL_LIBRARY_DIR}
+install(TARGETS atc_atc.bin OPTIONAL
+    RUNTIME DESTINATION ${INSTALL_LIBRARY_DIR}/atclib
+)
+
+install(TARGETS fwk_atc.bin OPTIONAL
+    RUNTIME DESTINATION ${INSTALL_LIBRARY_DIR}/fwkacl
 )
diff --git a/ge/offline/atc b/ge/offline/atc
new file mode 100644
index 00000000..05c65c26
--- /dev/null
+++ b/ge/offline/atc
@@ -0,0 +1,21 @@
+#!/bin/bash
+#-------------------------------------------------------------------
+# Purpose:
+# Copyright 2020 Huawei Technologies Co., Ltd. All rights reserved.
+#-------------------------------------------------------------------
+
+real_path=$(readlink "$0")
+if [ $? -eq 0 ]; then
+    LOCAL_PATH=$(cd "$(dirname "$real_path")"; pwd)
+else
+    LOCAL_PATH=$(cd "$(dirname "$0")"; pwd)
+fi
+PKG_PATH=$(cd ${LOCAL_PATH}/..; pwd)
+LIB_P="/lib64"
+PYTHON_P="/python/site-packages"
+LIB64_PATH="${PKG_PATH}${LIB_P}"
+PYTHON_PATH="${PKG_PATH}${PYTHON_P}"
+export LD_LIBRARY_PATH="${LIB64_PATH}:${LD_LIBRARY_PATH}"
+export PYTHONPATH="${PYTHON_PATH}:${PYTHONPATH}"
+
+${PKG_PATH}/bin/atc.bin "$@"
diff --git a/ge/offline/keep_dtype_option.cc b/ge/offline/keep_dtype_option.cc
new file mode 100644
index 00000000..5624f21c
--- /dev/null
+++ b/ge/offline/keep_dtype_option.cc
@@ -0,0 +1,116 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "keep_dtype_option.h"
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include "graph/debug/ge_attr_define.h"
+#include "framework/common/util.h"
+#include "common/util/error_manager/error_manager.h"
+
+namespace ge {
+namespace {
+const size_t kMaxOpsNum = 10;
+}  // namespace
+bool IsOriginalOpFind(OpDescPtr &op_desc, const std::string &op_name) {
+  std::vector<std::string> original_op_names;
+  if (!AttrUtils::GetListStr(op_desc, ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES, original_op_names)) {
+    return false;
+  }
+
+  for (auto &origin_name : original_op_names) {
+    if (origin_name == op_name) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+void KeepDtypeReportError(const std::vector<std::string> &invalid_list) {
+  std::stringstream err_msg;
+  size_t list_size = invalid_list.size();
+  err_msg << "config file contains " << list_size;
+  if (list_size == 1) {
+    err_msg << " operator not in the graph, op name:";
+  } else {
+    err_msg << " operators not in the graph, op names:";
+  }
+
+  for (size_t i = 0; i < list_size; i++) {
+    if (i == kMaxOpsNum) {
+      err_msg << "..";
+      break;
+    }
+    err_msg << invalid_list[i];
+    if (i != list_size - 1) {
+      err_msg << " ";
+    }
+  }
+
+  ErrorManager::GetInstance().ATCReportErrMessage(
+      "E10042", {"parameter", "reason"}, {"keep_dtype", err_msg.str().c_str()});
+  GELOGE(FAILED, "%s", err_msg.str().c_str());
+}
+
+Status DealKeepDtypeOption(const ComputeGraphPtr &graph, const std::string &keep_dtype) {
+  GE_CHECK_NOTNULL(graph);
+  if (keep_dtype.empty()) {
+    return SUCCESS;
+  }
+  std::string real_path = RealPath(keep_dtype.c_str());
+  if (real_path.empty()) {
+    GELOGE(PARAM_INVALID, "Can not get real path for %s.", keep_dtype.c_str());
+    return PARAM_INVALID;
+  }
+  std::ifstream ifs(real_path);
+  if (!ifs.is_open()) {
+    GELOGE(FAILED, "Open file %s failed", keep_dtype.c_str());
+    return FAILED;
+  }
+
+  std::string op_name;
+  std::vector<std::string> invalid_list;
+  while (std::getline(ifs, op_name)) {
+    if (op_name.empty()) {
+      continue;
+    }
+    op_name = StringUtils::Trim(op_name);
+    bool is_find = false;
+    for (auto &node_ptr : graph->GetDirectNode()) {
+      auto op_desc = node_ptr->GetOpDesc();
+      GE_CHECK_NOTNULL(op_desc);
+
+      if ((op_desc->GetName() == op_name) || IsOriginalOpFind(op_desc, op_name)) {
+        is_find = true;
+        (void)AttrUtils::SetInt(op_desc, ATTR_NAME_KEEP_DTYPE, 1);
+      }
+    }
+    if (!is_find) {
+      invalid_list.push_back(op_name);
+    }
+  }
+  ifs.close();
+
+  if (!invalid_list.empty()) {
+    KeepDtypeReportError(invalid_list);
+    return PARAM_INVALID;
+  }
+
+  return SUCCESS;
+}
+}  // namespace ge
diff --git a/ge/offline/keep_dtype_option.h b/ge/offline/keep_dtype_option.h
new file mode 100644
index 00000000..2df2ed8c
--- /dev/null
+++ b/ge/offline/keep_dtype_option.h
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef KEEP_DTYPE_OPTION_H_
+#define KEEP_DTYPE_OPTION_H_
+
+#include <string>
+#include "graph/compute_graph.h"
+#include "framework/common/ge_inner_error_codes.h"
+
+namespace ge {
+Status DealKeepDtypeOption(const ComputeGraphPtr &graph, const std::string &keep_dtype);
+}  // namespace
+#endif // KEEP_DTYPE_OPTION_H_
\ No newline at end of file
diff --git a/ge/offline/main.cc b/ge/offline/main.cc
index 76494c68..363f9cda 100755
--- a/ge/offline/main.cc
+++ b/ge/offline/main.cc
@@ -43,6 +43,7 @@
 #include "parser/common/register_tbe.h"
 #include "register/op_registry.h"
 #include "single_op_parser.h"
+#include "keep_dtype_option.h"
 
 using domi::BuildMode;
 using domi::OpRegistrationData;
@@ -64,11 +65,16 @@ using std::vector;
 static bool is_dynamic_input = false;
 
 const char *const kModeSupport = "only support 0(model to framework model), "
-                                 "1(framework model to json), 3(only pre-check), 5(pbtxt to json)";
+                                 "1(framework model to json), 3(only pre-check), "
+                                 "5(pbtxt to json), 6(display model info)";
 const char *const kModelToJsonSupport = "only support 0(Caffe) 3(TensorFlow) 5(Onnx)";
 
+static const char *const kCaffeFormatSupport = "only support NCHW, ND in Caffe model";
+static const char *const kTFFormatSupport = "only support NCHW, NHWC, ND, NCDHW, NDHWC in TF model";
+static const char *const kONNXFormatSupport = "only support NCHW, ND in ONNX model";
+
 // limit available mem size 2G
-const long kMinAvailableMem = 2 * 1024 * 1024;
+const long kMinAvailableMem = 2097152;  // 2 * 1024 * 1024
 
 DEFINE_string(model, "", "The model file.");
 DEFINE_string(output, "", "The output file path&name.");
@@ -109,6 +115,9 @@ DEFINE_string(precision_mode, "force_fp16",
               "Optional; precision mode."
               "Support force_fp16, allow_mix_precision, allow_fp32_to_fp16, must_keep_origin_dtype.");
 
+DEFINE_string(keep_dtype, "",
+              "Optional; config file to specify the precision used by the operator during compilation.");
+
 DEFINE_string(input_format, "",
               "Optional; input_format, format of input data, NCHW;NHWC."
               "Format:\"NHWC\"");
@@ -202,6 +211,8 @@ DEFINE_string(mdl_bank_path, "", "Optional; model bank path");
 
 DEFINE_string(op_bank_path, "", "Optional; op bank path");
 
+DEFINE_string(display_model_info, "0", "Optional; display model info");
+
 class GFlagUtils {
  public:
   /**
@@ -221,8 +232,8 @@ class GFlagUtils {
         "===== Basic Functionality =====\n"
         "[General]\n"
         "  --h/help            Show this help message\n"
-        "  --mode              Run mode. 0(default): generate offline model; 1: convert model to JSON format "
-        "3: only pre-check; 5: convert ge dump txt file to JSON format\n"
+        "  --mode              Run mode. 0(default): generate offline model; 1: convert model to JSON format; "
+        "3: only pre-check; 5: convert ge dump txt file to JSON format; 6: display model info\n"
         "\n[Input]\n"
         "  --model             Model file\n"
         "  --weight            Weight file. Required when framework is Caffe\n"
@@ -281,12 +292,17 @@ class GFlagUtils {
         "  --enable_small_channel    Set enable small channel. 0(default): disable; 1: enable\n"
         "  --enable_compress_weight  Enable compress weight. true: enable; false(default): disable\n"
         "  --compress_weight_conf    Config file to compress weight\n"
-        "  --buffer_optimize         Set buffer optimize. \"l2_optimize\" (default). Set \"off_optimize\" to close\n"
+        "  --buffer_optimize         Set buffer optimize. Support \"l2_optimize\" (default), "
+        "\"l1_optimize\", \"off_optimize\"\n"
+	"  --mdl_bank_path           Set the path of the custom repository generated after model tuning.\n"
         "\n[Operator Tuning]\n"
         "  --precision_mode        precision mode, support force_fp16(default), allow_mix_precision, "
         "allow_fp32_to_fp16, must_keep_origin_dtype.\n"
+        "  --keep_dtype            Retains the precision of certain operators in inference "
+        "scenarios by using a configuration file.\n"
         "  --auto_tune_mode        Set tune mode. E.g.: \"GA,RL\", support configure multiple, spit by ,\n"
-        "  --op_select_implmode    Set op select implmode. Support high_precision, high_performance. "
+        "  --op_bank_path          Set the path of the custom repository generated after operator tuning with Auto Tune.\n"
+	"  --op_select_implmode    Set op select implmode. Support high_precision, high_performance. "
         "default: high_performance\n"
         "  --optypelist_for_implmode    Appoint which op to select implmode, cooperated with op_select_implmode.\n"
         "                               Separate multiple nodes with commas (,). Use double quotation marks (\") "
@@ -305,9 +321,10 @@ class GFlagUtils {
         "  --debug_dir                Set the save path of operator compilation intermediate files.\n"
         "Default value: ./kernel_meta\n"
         "  --op_compiler_cache_dir    Set the save path of operator compilation cache files.\n"
-        "Default value: $HOME/atc_data/kernel_cache\n"
+        "Default value: $HOME/atc_data\n"
         "  --op_compiler_cache_mode   Set the operator compilation cache mode."
-        "Options are disable(default), enable and force(force to refresh the cache)");
+        "Options are disable(default), enable and force(force to refresh the cache)\n"
+        "  --display_model_info     enable for display model info; 0(default): close display, 1: open display");
 
     gflags::ParseCommandLineNonHelpFlags(&argc, &argv, true);
     // Using gflags to analyze input parameters
@@ -421,6 +438,9 @@ class GFlagUtils {
         FLAGS_enable_compress_weight, FLAGS_compress_weight_conf) == ge::SUCCESS,
         ret = ge::FAILED, "check compress weight failed!");
 
+    GE_CHK_BOOL_EXEC(ge::CheckKeepTypeParamValid(FLAGS_keep_dtype) == ge::SUCCESS,
+        ret = ge::FAILED, "check keep dtype failed!");
+
     GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(
         !ge::CheckOutputPathValid(FLAGS_check_report, "--check_report"), ret = ge::FAILED,
         "check_report file %s not found!!", FLAGS_check_report.c_str());
@@ -446,6 +466,10 @@ class GFlagUtils {
         ge::CheckEnableSingleStreamParamValid(std::string(FLAGS_enable_single_stream)) == ge::SUCCESS,
         ret = ge::FAILED, "check enable single stream failed!");
 
+    GE_CHK_BOOL_TRUE_EXEC_WITH_LOG((FLAGS_display_model_info != "0") && (FLAGS_display_model_info != "1"),
+      ErrorManager::GetInstance().ATCReportErrMessage("E10006", {"parameter"}, {"display_model_info"});
+      ret = ge::FAILED, "Input parameter[--display_model_info]'s value must be 1 or 0.");
+
     return ret;
   }
 
@@ -601,9 +625,9 @@ static bool CheckInputFormat() {
     }
     // only support NCHW ND
     ErrorManager::GetInstance().ATCReportErrMessage(
-        "E10001", {"parameter", "value", "reason"}, {"--input_format", FLAGS_input_format, ge::kCaffeFormatSupport});
+        "E10001", {"parameter", "value", "reason"}, {"--input_format", FLAGS_input_format, kCaffeFormatSupport});
     GELOGE(ge::FAILED,
-        "Invalid value for --input_format[%s], %s.", FLAGS_input_format.c_str(), ge::kCaffeFormatSupport);
+        "Invalid value for --input_format[%s], %s.", FLAGS_input_format.c_str(), kCaffeFormatSupport);
     return false;
   } else if ((FLAGS_framework == static_cast<int32_t>(domi::TENSORFLOW))) { // tf
     if (ge::tf_support_input_format.find(FLAGS_input_format) != ge::tf_support_input_format.end()) {
@@ -611,9 +635,9 @@ static bool CheckInputFormat() {
     }
     // only support NCHW NHWC ND NCDHW NDHWC
     ErrorManager::GetInstance().ATCReportErrMessage(
-        "E10001", {"parameter", "value", "reason"}, {"--input_format", FLAGS_input_format, ge::kTFFormatSupport});
+        "E10001", {"parameter", "value", "reason"}, {"--input_format", FLAGS_input_format, kTFFormatSupport});
     GELOGE(ge::FAILED,
-        "Invalid value for --input_format[%s], %s.", FLAGS_input_format.c_str(), ge::kTFFormatSupport);
+        "Invalid value for --input_format[%s], %s.", FLAGS_input_format.c_str(), kTFFormatSupport);
     return false;
   } else if (FLAGS_framework == static_cast<int32_t>(domi::ONNX)) {
     if (ge::onnx_support_input_format.find(FLAGS_input_format) != ge::onnx_support_input_format.end()) {
@@ -621,9 +645,9 @@ static bool CheckInputFormat() {
     }
     // only support NCHW ND
     ErrorManager::GetInstance().ATCReportErrMessage(
-        "E10001", {"parameter", "value", "reason"}, {"--input_format", FLAGS_input_format, ge::kONNXFormatSupport});
+        "E10001", {"parameter", "value", "reason"}, {"--input_format", FLAGS_input_format, kONNXFormatSupport});
     GELOGE(ge::FAILED,
-        "Invalid value for --input_format[%s], %s.", FLAGS_input_format.c_str(), ge::kONNXFormatSupport);
+        "Invalid value for --input_format[%s], %s.", FLAGS_input_format.c_str(), kONNXFormatSupport);
     return false;
   }
   return true;
@@ -853,7 +877,7 @@ domi::Status GenerateInfershapeJson() {
 static Status ConvertModelToJson(int fwk_type, const string &model_file, const string &json_file) {
   Status ret = ge::SUCCESS;
   if (fwk_type == -1) {
-    ret = ge::ConvertOmModelToJson(model_file.c_str(), json_file.c_str());
+    ret = ge::ConvertOm(model_file.c_str(), json_file.c_str(), true);
     return ret;
   }
 
@@ -979,6 +1003,13 @@ domi::Status GenerateModel(std::map<string, string> &options, std::string output
     }
   }
 
+  Status ret = ge::DealKeepDtypeOption(ge::GraphUtils::GetComputeGraph(graph), FLAGS_keep_dtype);
+  if (ret != SUCCESS) {
+    (void)ge_generator.Finalize();
+    (void)ge::GELib::GetInstance()->Finalize();
+    return ret;
+  }
+
   geRet = ge_generator.GenerateOfflineModel(graph, output, inputs);
   if (geRet != ge::SUCCESS) {
     DOMI_LOGE("GE GenerateOfflineModel execute failed");
@@ -1162,6 +1193,8 @@ domi::Status GenerateOmModel() {
   options.insert(std::pair<string, string>(string(ge::MDL_BANK_PATH_FLAG), FLAGS_mdl_bank_path));
 
   options.insert(std::pair<string, string>(string(ge::OP_BANK_PATH_FLAG), FLAGS_op_bank_path));
+
+  options.insert(std::pair<string, string>(string(ge::DISPLAY_MODEL_INFO), FLAGS_display_model_info));
   // set enable scope fusion passes
   SetEnableScopeFusionPasses(FLAGS_enable_scope_fusion_passes);
   // print atc option map
@@ -1174,6 +1207,11 @@ domi::Status GenerateOmModel() {
     return domi::FAILED;
   }
 
+  if (FLAGS_display_model_info == "1") {
+    GELOGI("need to display model info.");
+    return ge::ConvertOm(FLAGS_output.c_str(), "", false);
+  }
+
   return domi::SUCCESS;
 }
 
@@ -1187,6 +1225,26 @@ domi::Status ConvertModelToJson() {
   return domi::SUCCESS;
 }
 
+domi::Status DisplayModelInfo() {
+  // No model path passed in
+  GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(FLAGS_om == "",
+      ErrorManager::GetInstance().ATCReportErrMessage("E10004", {"parameter"}, {"om"});
+      return ge::FAILED,
+      "Input parameter[--om]'s value is empty!!");
+
+  // Check if the model path is valid
+  GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(
+      FLAGS_om != "" && !ge::CheckInputPathValid(FLAGS_om, "--om"),
+      return ge::FAILED,
+      "model file path is invalid: %s.", FLAGS_om.c_str());
+
+  if (FLAGS_framework == -1) {
+    return ge::ConvertOm(FLAGS_om.c_str(), "", false);
+  }
+
+  return ge::FAILED;
+}
+
 bool CheckRet(domi::Status ret) {
   if (ret != domi::SUCCESS) {
     if (FLAGS_mode == ONLY_PRE_CHECK) {
@@ -1330,6 +1388,9 @@ int main(int argc, char* argv[]) {
     } else if (FLAGS_mode == ge::RunMode::PBTXT_TO_JSON) {
       GE_CHK_BOOL_EXEC(ConvertPbtxtToJson() == domi::SUCCESS, ret = domi::FAILED;
                        break, "ATC convert pbtxt to json execute failed!!");
+    } else if (FLAGS_mode == ge::RunMode::DISPLAY_OM_INFO) {
+      GE_CHK_BOOL_EXEC(DisplayModelInfo() == domi::SUCCESS, ret = domi::FAILED;
+        break, "ATC DisplayModelInfo failed!!");
     } else {
       ErrorManager::GetInstance().ATCReportErrMessage(
           "E10001", {"parameter", "value", "reason"}, {"--mode", std::to_string(FLAGS_mode), kModeSupport});
diff --git a/ge/offline/module.mk b/ge/offline/module.mk
index 8859df29..8aabb975 100755
--- a/ge/offline/module.mk
+++ b/ge/offline/module.mk
@@ -10,6 +10,7 @@ LOCAL_CFLAGS += -DPROTOBUF_INLINE_NOT_IN_HEADERS=0 -DCOMPILE_OMG_PACKAGE -O2 -Dg
 
 LOCAL_SRC_FILES := \
     main.cc \
+    keep_dtype_option.cc \
     single_op_parser.cc \
     ../session/omg.cc \
     ../ir_build/atc_ir_common.cc \
@@ -54,3 +55,110 @@ LOCAL_LDFLAGS := -lrt -ldl
 
 include $(BUILD_HOST_EXECUTABLE)
 
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := atclib/atc.bin
+
+LOCAL_CFLAGS += -Werror -Wno-deprecated-declarations
+LOCAL_CFLAGS += -DPROTOBUF_INLINE_NOT_IN_HEADERS=0 -DCOMPILE_OMG_PACKAGE -O2 -Dgoogle=ascend_private
+
+LOCAL_SRC_FILES := \
+    main.cc \
+    keep_dtype_option.cc \
+    single_op_parser.cc \
+    ../session/omg.cc \
+    ../ir_build/atc_ir_common.cc \
+
+LOCAL_C_INCLUDES := \
+    $(LOCAL_PATH)/../ ./ \
+    $(TOPDIR)inc \
+    $(TOPDIR)metadef/inc \
+    $(TOPDIR)graphengine/inc \
+    $(TOPDIR)inc/external \
+    $(TOPDIR)metadef/inc/external \
+    $(TOPDIR)graphengine/inc/external \
+    $(TOPDIR)metadef/inc/external/graph \
+    $(TOPDIR)graphengine/inc/framework \
+    $(TOPDIR)libc_sec/include \
+    $(TOPDIR)metadef/inc/common/util \
+    $(TOPDIR)parser    \
+    third_party/json/include \
+    third_party/gflags/include \
+    third_party/protobuf/include \
+    proto/om.proto \
+    proto/ge_ir.proto \
+    proto/task.proto \
+    proto/insert_op.proto \
+
+LOCAL_SHARED_LIBRARIES := \
+    libc_sec \
+    libge_common \
+    libascend_protobuf \
+    libslog \
+    libgraph \
+    libregister \
+    liberror_manager \
+    libge_compiler \
+    libruntime_compile \
+    libparser_common \
+    liberror_manager \
+
+LOCAL_STATIC_LIBRARIES := libgflags
+
+LOCAL_LDFLAGS := -lrt -ldl
+
+include $(BUILD_HOST_EXECUTABLE)
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := fwkacl/atc.bin
+
+LOCAL_CFLAGS += -Werror -Wno-deprecated-declarations
+LOCAL_CFLAGS += -DPROTOBUF_INLINE_NOT_IN_HEADERS=0 -DCOMPILE_OMG_PACKAGE -O2 -Dgoogle=ascend_private
+
+LOCAL_SRC_FILES := \
+    main.cc \
+    keep_dtype_option.cc \
+    single_op_parser.cc \
+    ../session/omg.cc \
+    ../ir_build/atc_ir_common.cc \
+
+LOCAL_C_INCLUDES := \
+    $(LOCAL_PATH)/../ ./ \
+    $(TOPDIR)inc \
+    $(TOPDIR)metadef/inc \
+    $(TOPDIR)graphengine/inc \
+    $(TOPDIR)inc/external \
+    $(TOPDIR)metadef/inc/external \
+    $(TOPDIR)graphengine/inc/external \
+    $(TOPDIR)metadef/inc/external/graph \
+    $(TOPDIR)graphengine/inc/framework \
+    $(TOPDIR)libc_sec/include \
+    $(TOPDIR)metadef/inc/common/util \
+    $(TOPDIR)parser    \
+    third_party/json/include \
+    third_party/gflags/include \
+    third_party/protobuf/include \
+    proto/om.proto \
+    proto/ge_ir.proto \
+    proto/task.proto \
+    proto/insert_op.proto \
+
+LOCAL_SHARED_LIBRARIES := \
+    libc_sec \
+    libge_common \
+    libascend_protobuf \
+    libslog \
+    libgraph \
+    libregister \
+    liberror_manager \
+    libge_runner \
+    libruntime \
+    libparser_common \
+    liberror_manager \
+
+LOCAL_STATIC_LIBRARIES := libgflags
+
+LOCAL_LDFLAGS := -lrt -ldl
+
+include $(BUILD_HOST_EXECUTABLE)
diff --git a/ge/offline/proto/ge_ir.proto b/ge/offline/proto/ge_ir.proto
index e7bfe0cb..12989a54 100644
--- a/ge/offline/proto/ge_ir.proto
+++ b/ge/offline/proto/ge_ir.proto
@@ -30,6 +30,7 @@ enum DataType
     DT_RESOURCE  = 23;         // resource type
     DT_STRING_REF = 24;        // string_ref type
     DT_DUAL      = 25;              /**< dual output type */
+    DT_VARIANT = 26;           // variant type
 }
 
 message AttrDef
diff --git a/ge/offline/single_op_parser.cc b/ge/offline/single_op_parser.cc
index d4b9c1c9..b1e0da6d 100644
--- a/ge/offline/single_op_parser.cc
+++ b/ge/offline/single_op_parser.cc
@@ -27,6 +27,7 @@
 #include "common/ge_inner_error_codes.h"
 #include "framework/common/util.h"
 #include "graph/utils/tensor_utils.h"
+#include "graph/utils/type_utils.h"
 #include "graph/utils/op_desc_utils.h"
 #include "graph/operator_factory_impl.h"
 
@@ -176,6 +177,7 @@ T GetValue(const map<string, T> &dict, string &key, T default_val) {
 }
 
 void from_json(const Json &j, SingleOpTensorDesc &desc) {
+  bool is_tensor_valid = true;
   desc.dims = j.at(kKeyShape).get<vector<int64_t>>();
   auto it = j.find(kKeyShapeRange);
   if (it != j.end()) {
@@ -189,9 +191,12 @@ void from_json(const Json &j, SingleOpTensorDesc &desc) {
   string type_str = j.at(kKeyType).get<string>();
   desc.format = GetValue(kFormatDict, format_str, FORMAT_RESERVED);
   desc.type = GetValue(kDataTypeDict, type_str, DT_UNDEFINED);
+  is_tensor_valid = is_tensor_valid && ge::TypeUtils::IsFormatValid(format_str);
+  is_tensor_valid = is_tensor_valid && ge::TypeUtils::IsDataTypeValid(type_str);
   it = j.find(kKeyOriginFormat);
   if (it != j.end()) {
     string origin_format_str = j.at(kKeyOriginFormat).get<string>();
+    is_tensor_valid = is_tensor_valid && ge::TypeUtils::IsFormatValid(origin_format_str);
     desc.ori_format = GetValue(kFormatDict, origin_format_str, FORMAT_RESERVED);
   }
   auto tensor_name = j.find(kKeyName);
@@ -202,6 +207,9 @@ void from_json(const Json &j, SingleOpTensorDesc &desc) {
   if (dynamic_input_name != j.end()) {
     desc.dynamic_input_name = dynamic_input_name->get<string>();
   }
+  if (!is_tensor_valid) {
+    desc.SetValidFlag(is_tensor_valid);
+  }
 }
 
 void from_json(const Json &j, SingleOpAttr &attr) {
@@ -305,6 +313,12 @@ bool SingleOpParser::Validate(const SingleOpDesc &op_desc) {
 
   int index = 0;
   for (auto &tensor_desc : op_desc.input_desc) {
+    if (!tensor_desc.GetValidFlag()) {
+      ErrorManager::GetInstance().ATCReportErrMessage("E10027", {"input", "type", "index"},
+          {"intput", "datatype or format", std::to_string(index)});
+      GELOGE(PARAM_INVALID, "Input's dataType or format is invalid when the index is %d", index);
+      return false;
+    }
     if ((tensor_desc.type == DT_UNDEFINED && tensor_desc.format != FORMAT_RESERVED) ||
         (tensor_desc.type != DT_UNDEFINED && tensor_desc.format == FORMAT_RESERVED)){
       ErrorManager::GetInstance().ATCReportErrMessage("E10027", {"input", "type", "index"},
@@ -317,6 +331,12 @@ bool SingleOpParser::Validate(const SingleOpDesc &op_desc) {
 
   index = 0;
   for (auto &tensor_desc : op_desc.output_desc) {
+    if (!tensor_desc.GetValidFlag()) {
+      ErrorManager::GetInstance().ATCReportErrMessage("E10027", {"input", "type", "index"},
+          {"output", "datatype", std::to_string(index)});
+      GELOGE(PARAM_INVALID, "Output's dataType is invalid when the index is %d", index);
+      return false;
+    }
     if (tensor_desc.type == DT_UNDEFINED) {
       ErrorManager::GetInstance().ATCReportErrMessage("E10027", {"input", "type", "index"},
           {"output", "datatype", std::to_string(index)});
diff --git a/ge/offline/single_op_parser.h b/ge/offline/single_op_parser.h
index 19879a32..71aa58bb 100644
--- a/ge/offline/single_op_parser.h
+++ b/ge/offline/single_op_parser.h
@@ -28,6 +28,10 @@
 
 namespace ge {
 struct SingleOpTensorDesc {
+public:
+  bool GetValidFlag() const { return is_valid_; }
+  void SetValidFlag(bool is_valid) { is_valid_ = is_valid; }
+public:
   std::string name;
   std::vector<int64_t> dims;
   std::vector<int64_t> ori_dims;
@@ -36,6 +40,8 @@ struct SingleOpTensorDesc {
   ge::Format ori_format = ge::FORMAT_RESERVED;
   ge::DataType type = ge::DT_UNDEFINED;
   std::string dynamic_input_name;
+private:
+  bool is_valid_ = true;
 };
 
 struct SingleOpAttr {
diff --git a/ge/omm/csa_interact.cc b/ge/omm/csa_interact.cc
index 1599af94..1b33ddbd 100644
--- a/ge/omm/csa_interact.cc
+++ b/ge/omm/csa_interact.cc
@@ -202,7 +202,7 @@ Status CsaInteract::WriteFile(const std::string &file_name, const std::string &c
     }
   }
 
-  mmSsize_t ret = mmWrite(fd, (void *)content.c_str(), content.length());
+  mmSsize_t ret = mmWrite(fd, reinterpret_cast<void *>(const_cast<char *>(content.c_str())), content.length());
   if (ret == EN_ERROR) {
     GELOGE(INTERNAL_ERROR, "write file fail, errno is %d", errno);
     ret = mmClose(fd);
diff --git a/ge/opskernel_manager/ops_kernel_builder_manager.cc b/ge/opskernel_manager/ops_kernel_builder_manager.cc
index e0001fcd..37bdcf7a 100644
--- a/ge/opskernel_manager/ops_kernel_builder_manager.cc
+++ b/ge/opskernel_manager/ops_kernel_builder_manager.cc
@@ -167,4 +167,5 @@ Status OpsKernelBuilderManager::GenerateTask(const Node &node,
   GELOGD("Done invoking GenerateTask successfully");
   return SUCCESS;
 }
-}  // namespace ge
\ No newline at end of file
+
+}  // namespace ge
diff --git a/ge/opskernel_manager/ops_kernel_manager.cc b/ge/opskernel_manager/ops_kernel_manager.cc
index 8134a463..30f39c0d 100644
--- a/ge/opskernel_manager/ops_kernel_manager.cc
+++ b/ge/opskernel_manager/ops_kernel_manager.cc
@@ -175,8 +175,8 @@ Status OpsKernelManager::ParsePluginOptions(const map<string, string> &options,
       } else if (flag == 1) {
         enable_flag = true;
       } else {
-        GELOGE(GE_GRAPH_OPTIONS_INVALID, "option_key:%s, its value %s is invalid, it must be 0 or 1.", plugin_name.c_str(),
-               iter->second.c_str());
+        GELOGE(GE_GRAPH_OPTIONS_INVALID, "option_key:%s, its value %s is invalid, it must be 0 or 1.",
+               plugin_name.c_str(), iter->second.c_str());
         return GE_GRAPH_OPTIONS_INVALID;
       }
     } catch (std::invalid_argument &) {
@@ -188,8 +188,8 @@ Status OpsKernelManager::ParsePluginOptions(const map<string, string> &options,
              iter->second.c_str());
       return GE_GRAPH_OPTIONS_INVALID;
     } catch (...) {
-      GELOGE(GE_GRAPH_OPTIONS_INVALID, "option_key:%s, its value %s is invalid, it must be 0 or 1.", plugin_name.c_str(),
-             iter->second.c_str());
+      GELOGE(GE_GRAPH_OPTIONS_INVALID, "option_key:%s, its value %s is invalid, it must be 0 or 1.",
+             plugin_name.c_str(), iter->second.c_str());
       return GE_GRAPH_OPTIONS_INVALID;
     }
   } else {
diff --git a/ge/plugin/engine/CMakeLists.txt b/ge/plugin/engine/CMakeLists.txt
index 87a6d682..f6353231 100644
--- a/ge/plugin/engine/CMakeLists.txt
+++ b/ge/plugin/engine/CMakeLists.txt
@@ -8,6 +8,7 @@ add_library(engine SHARED ${SRC_LIST})
 
 target_compile_options(engine PRIVATE
     -Werror
+    -fno-common
 )
 
 target_compile_definitions(engine PRIVATE
diff --git a/ge/proto/caffe/caffe.proto b/ge/proto/caffe/caffe.proto
index 3f45aae2..20615fed 100644
--- a/ge/proto/caffe/caffe.proto
+++ b/ge/proto/caffe/caffe.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software caffe, version 1.0 https://github.com/BVLC/caffe
+ *
+ * This file is included by GraphEngine so as to support model format conversion from caffe model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto2";
 
 package domi.caffe;
diff --git a/ge/proto/dump_task.proto b/ge/proto/dump_task.proto
index b1e346cd..ee1c6f47 100644
--- a/ge/proto/dump_task.proto
+++ b/ge/proto/dump_task.proto
@@ -28,6 +28,7 @@ enum OutputDataType {
     DT_RESOURCE = 23;
     DT_STRING_REF = 24;
     DT_DUAL = 25;
+    DT_VARIANT = 26;
 }
 
 enum OutputFormat {
@@ -108,4 +109,5 @@ message DumpData{
     repeated OpOutput output = 3;
     repeated OpInput input = 4;
     repeated OpBuffer buffer = 5;
+    string op_name = 6;
 }
diff --git a/ge/proto/ge_ir.proto b/ge/proto/ge_ir.proto
index e7bfe0cb..12989a54 100644
--- a/ge/proto/ge_ir.proto
+++ b/ge/proto/ge_ir.proto
@@ -30,6 +30,7 @@ enum DataType
     DT_RESOURCE  = 23;         // resource type
     DT_STRING_REF = 24;        // string_ref type
     DT_DUAL      = 25;              /**< dual output type */
+    DT_VARIANT = 26;           // variant type
 }
 
 message AttrDef
diff --git a/ge/proto/op_mapping_info.proto b/ge/proto/op_mapping_info.proto
index e23b7ebe..7fb6f84b 100644
--- a/ge/proto/op_mapping_info.proto
+++ b/ge/proto/op_mapping_info.proto
@@ -15,6 +15,7 @@ message Output {
     int32 original_output_data_type = 7;
     int32 original_output_format = 8;
     uint64 size = 9;
+    Shape origin_shape = 10;
 }
 
 message Input {
@@ -23,6 +24,7 @@ message Input {
     Shape shape = 3;
     uint64 address = 4;
     uint64 size = 5;
+    Shape origin_shape = 6;
 }
 
 enum BufferType {
diff --git a/ge/proto/tensorflow/attr_value.proto b/ge/proto/tensorflow/attr_value.proto
index 1cc67d62..438d7163 100644
--- a/ge/proto/tensorflow/attr_value.proto
+++ b/ge/proto/tensorflow/attr_value.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/proto/tensorflow/function.proto b/ge/proto/tensorflow/function.proto
index 075897c6..44681e32 100644
--- a/ge/proto/tensorflow/function.proto
+++ b/ge/proto/tensorflow/function.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/proto/tensorflow/graph.proto b/ge/proto/tensorflow/graph.proto
index d639a7d6..73bfc6ee 100644
--- a/ge/proto/tensorflow/graph.proto
+++ b/ge/proto/tensorflow/graph.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/proto/tensorflow/graph_library.proto b/ge/proto/tensorflow/graph_library.proto
index e393d38d..7bca0838 100644
--- a/ge/proto/tensorflow/graph_library.proto
+++ b/ge/proto/tensorflow/graph_library.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/proto/tensorflow/node_def.proto b/ge/proto/tensorflow/node_def.proto
index b9bc97ee..50cf5cac 100644
--- a/ge/proto/tensorflow/node_def.proto
+++ b/ge/proto/tensorflow/node_def.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/proto/tensorflow/op_def.proto b/ge/proto/tensorflow/op_def.proto
index 3485d045..7f0e8ce2 100644
--- a/ge/proto/tensorflow/op_def.proto
+++ b/ge/proto/tensorflow/op_def.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/proto/tensorflow/resource_handle.proto b/ge/proto/tensorflow/resource_handle.proto
index a3452351..91c46c9a 100644
--- a/ge/proto/tensorflow/resource_handle.proto
+++ b/ge/proto/tensorflow/resource_handle.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/proto/tensorflow/tensor.proto b/ge/proto/tensorflow/tensor.proto
index d0a4d024..48eeb6c4 100644
--- a/ge/proto/tensorflow/tensor.proto
+++ b/ge/proto/tensorflow/tensor.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/proto/tensorflow/tensor_shape.proto b/ge/proto/tensorflow/tensor_shape.proto
index 4225a2e3..3a6d8c5a 100644
--- a/ge/proto/tensorflow/tensor_shape.proto
+++ b/ge/proto/tensorflow/tensor_shape.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 // Protocol buffer representing the shape of tensors.
 
 syntax = "proto3";
diff --git a/ge/proto/tensorflow/types.proto b/ge/proto/tensorflow/types.proto
index ba7a72b3..f40e49cb 100644
--- a/ge/proto/tensorflow/types.proto
+++ b/ge/proto/tensorflow/types.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/proto/tensorflow/versions.proto b/ge/proto/tensorflow/versions.proto
index 48061218..4e81548f 100644
--- a/ge/proto/tensorflow/versions.proto
+++ b/ge/proto/tensorflow/versions.proto
@@ -1,3 +1,11 @@
+/**
+ * This file is part of Open Source Software TensorFlow, version 1.15.0 https://github.com/tensorflow/tensorflow
+ *
+ * This file is included by GraphEngine so as to support model format conversion from tensorflow model to GraphEngine model.
+ * This file in this distribution may have been modified by Huawei Technologies Co., Ltd ("Huawei Modifications").
+ * All Huawei Modifications are Copyright 2019-2020 Huawei Technologies Co., Ltd.
+ */
+
 syntax = "proto3";
 
 package domi.tensorflow;
diff --git a/ge/session/omg.cc b/ge/session/omg.cc
index df837f99..37b279a2 100755
--- a/ge/session/omg.cc
+++ b/ge/session/omg.cc
@@ -68,6 +68,10 @@ const std::string kScopeIdAttr = "fusion_scope";
 const char *const kOutputTypeSample = "correct sample is \"opname:index:dtype\"";
 const char *const kOutputTypeSupport = "only support FP32, FP16, UINT8";
 const char *const kOutputTypeError = "The multiple out nodes set in output_type must be found in out_nodes.";
+const size_t kNodeNameIndex = 0;
+const size_t kIndexStrIndex = 1;
+const size_t kDTValueIndex = 2;
+const size_t kOmInfoSize = 5;
 }  // namespace
 
 // When the model is converted to a JSON file, the following operator attributes in the blacklist will be ignored
@@ -381,14 +385,14 @@ Status ParseOutputType(const std::string &output_type, std::map<std::string, vec
       return domi::FAILED;
     }
     ge::DataType tmp_dt;
-    std::string node_name = StringUtils::Trim(node_index_type_v[0]);
-    std::string index_str = StringUtils::Trim(node_index_type_v[1]);
+    std::string node_name = StringUtils::Trim(node_index_type_v[kNodeNameIndex]);
+    std::string index_str = StringUtils::Trim(node_index_type_v[kIndexStrIndex]);
     int32_t index;
     if (StringToInt(index_str, index) != SUCCESS) {
       GELOGE(PARAM_INVALID, "This str must be digit string, while the actual input is %s.", index_str.c_str());
       return domi::FAILED;
     }
-    std::string dt_value = StringUtils::Trim(node_index_type_v[2]);
+    std::string dt_value = StringUtils::Trim(node_index_type_v[kDTValueIndex]);
     auto it = output_type_str_to_datatype.find(dt_value);
     if (it == output_type_str_to_datatype.end()) {
       ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"},
@@ -641,7 +645,8 @@ Status ParseOutNodes(const string &out_nodes) {
         if (!domi::GetContext().user_out_nodes_top_vec.empty()) {
           ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"},
                                                           {"--out_nodes", out_nodes, "is not all index or top_name"});
-          GELOGE(PARAM_INVALID, "This out_nodes str must be all index or top_name, while the actual input is %s", out_nodes.c_str());
+          GELOGE(PARAM_INVALID,
+                 "This out_nodes str must be all index or top_name, while the actual input is %s", out_nodes.c_str());
           return PARAM_INVALID;
         }
         // stoi: The method may throw an exception: invalid_argument/out_of_range
@@ -865,9 +870,78 @@ void GetGroupName(ge::proto::ModelDef &model_def) {
       });
 }
 
-FMK_FUNC_HOST_VISIBILITY Status ConvertOmModelToJson(const char *model_file, const char *json_file) {
+FMK_FUNC_HOST_VISIBILITY void PrintModelInfo(ge::proto::ModelDef *model_def) {
+  std::cout << "============ Display Model Info start ============" << std::endl;
+
+  auto model_attr_map = model_def->mutable_attr();
+  // system info
+  auto iter = model_attr_map->find(ATTR_MODEL_ATC_VERSION);
+  auto atc_version = (iter != model_attr_map->end()) ? iter->second.s() : "";
+  iter = model_attr_map->find("soc_version");
+  auto soc_version = (iter != model_attr_map->end()) ? iter->second.s() : "";
+  iter = model_attr_map->find("framework_type");
+  auto framework_type = (iter != model_attr_map->end()) ? iter->second.s() : "";
+  std::cout << "system   info: "
+            <<  ATTR_MODEL_ATC_VERSION
+            << "[" << atc_version << "], "
+            << "soc_version"
+            << "[" << soc_version << "], "
+            << "framework_type"
+            << "[" << framework_type << "]." << std::endl;
+
+  // resource info
+  iter = model_attr_map->find(ATTR_MODEL_MEMORY_SIZE);
+  auto memory_size = (iter != model_attr_map->end()) ? iter->second.i() : -1;
+  iter = model_attr_map->find(ATTR_MODEL_WEIGHT_SIZE);
+  auto weight_size = (iter != model_attr_map->end()) ? iter->second.i() : -1;
+  iter = model_attr_map->find(ATTR_MODEL_STREAM_NUM);
+  auto stream_num = (iter != model_attr_map->end()) ? iter->second.i() : -1;
+  iter = model_attr_map->find(ATTR_MODEL_EVENT_NUM);
+  auto event_num = (iter != model_attr_map->end()) ? iter->second.i() : -1;
+  std::cout << "resource info: "
+            << ATTR_MODEL_MEMORY_SIZE
+            << "[" << memory_size << " B], "
+            << ATTR_MODEL_WEIGHT_SIZE
+            << "[" << weight_size << " B], "
+            << ATTR_MODEL_STREAM_NUM
+            << "[" << stream_num << "], "
+            << ATTR_MODEL_EVENT_NUM
+            << "[" << event_num << "]."
+            << std::endl;
+
+  // om info
+  iter = model_attr_map->find("om_info_list");
+  if (iter == model_attr_map->end()) {
+    std::cout << "Display Model Info failed, attr \"om_info_list\" is not found in om, check the version is matched."
+              << std::endl;
+    std::cout << "============ Display Model Info end   ============"  << std::endl;
+    return;
+  }
+  auto list_size = iter->second.list().i_size();
+  if (list_size == kOmInfoSize) {
+    std::cout << "om       info: "
+              << "modeldef_size"
+              << "[" << iter->second.list().i(0) << " B], "
+              << "weight_data_size"
+              << "[" << iter->second.list().i(1) << " B], "
+              << "tbe_kernels_size"
+              << "[" << iter->second.list().i(2) << " B], "
+              << "cust_aicpu_kernel_store_size"
+              << "[" << iter->second.list().i(3) << " B], "
+              << "task_info_size"
+              << "[" << iter->second.list().i(4) << " B]." << std::endl;
+  } else {
+    std::cout << "Display Model Info error, please check!"  << std::endl;
+  };
+
+  std::cout << "============ Display Model Info end   ============"  << std::endl;
+}
+
+FMK_FUNC_HOST_VISIBILITY Status ConvertOm(const char *model_file, const char *json_file, bool is_covert_to_json) {
   GE_CHECK_NOTNULL(model_file);
-  GE_CHECK_NOTNULL(json_file);
+  if (is_covert_to_json) {
+    GE_CHECK_NOTNULL(json_file);
+  }
   ge::ModelData model;
 
   // Mode 2 does not need to verify the priority, and a default value of 0 is passed
@@ -889,9 +963,10 @@ FMK_FUNC_HOST_VISIBILITY Status ConvertOmModelToJson(const char *model_file, con
       OmFileLoadHelper omFileLoadHelper;
       ge::graphStatus status = omFileLoadHelper.Init(model_data, model_len);
       if (status != ge::GRAPH_SUCCESS) {
+        ErrorManager::GetInstance().ATCReportErrMessage("E19021", {"reason"}, {"Om file init failed"});
         GELOGE(ge::FAILED, "Om file init failed.");
         if (model.model_data != nullptr) {
-          delete[](char *) model.model_data;
+          delete[] reinterpret_cast<char *>(model.model_data);
           model.model_data = nullptr;
         }
         return status;
@@ -900,9 +975,10 @@ FMK_FUNC_HOST_VISIBILITY Status ConvertOmModelToJson(const char *model_file, con
       ModelPartition ir_part;
       status = omFileLoadHelper.GetModelPartition(MODEL_DEF, ir_part);
       if (status != ge::GRAPH_SUCCESS) {
+        ErrorManager::GetInstance().ATCReportErrMessage("E19021", {"reason"}, {"Get model part failed"});
         GELOGE(ge::FAILED, "Get model part failed.");
         if (model.model_data != nullptr) {
-          delete[](char *) model.model_data;
+          delete[] reinterpret_cast<char *>(model.model_data);
           model.model_data = nullptr;
         }
         return status;
@@ -913,26 +989,35 @@ FMK_FUNC_HOST_VISIBILITY Status ConvertOmModelToJson(const char *model_file, con
       // De serialization
       bool flag = ReadProtoFromArray(ir_part.data, ir_part.size, &model_def);
       if (flag) {
-        GetGroupName(model_def);
+        if (is_covert_to_json) {
+          GetGroupName(model_def);
 
-        json j;
-        Pb2Json::Message2Json(model_def, kOmBlackFields, j, true);
+          json j;
+          Pb2Json::Message2Json(model_def, kOmBlackFields, j, true);
 
-        ret = ModelSaver::SaveJsonToFile(json_file, j);
+          ret = ModelSaver::SaveJsonToFile(json_file, j);
+        } else {
+          PrintModelInfo(&model_def);
+        }
       } else {
         ret = INTERNAL_ERROR;
+        ErrorManager::GetInstance().ATCReportErrMessage("E19021", {"reason"}, {"ReadProtoFromArray failed"});
         GELOGE(ret, "ReadProtoFromArray failed.");
       }
     } else {
+      ErrorManager::GetInstance().ATCReportErrMessage("E10003",
+          {"parameter", "value", "reason"}, {"om", model_file, "invalid om file"});
       GELOGE(PARAM_INVALID, "ParseModelContent failed because of invalid om file. Please check --om param.");
     }
 
     if (model.model_data != nullptr) {
-      delete[](char *) model.model_data;
+      delete[] reinterpret_cast<char *>(model.model_data);
       model.model_data = nullptr;
     }
     return ret;
   } catch (const std::exception &e) {
+    ErrorManager::GetInstance().ATCReportErrMessage("E19021", {"reason"},
+        {"Convert om model to json failed, exception message[" + std::string(e.what()) + "]"});
     GELOGE(FAILED, "Convert om model to json failed, exception message : %s.", e.what());
     return FAILED;
   }
@@ -963,7 +1048,8 @@ FMK_FUNC_HOST_VISIBILITY Status ConvertPbtxtToJson(const char *model_file, const
 
     if (!flag) {
       free_model_data(&model.model_data);
-      GELOGE(FAILED, "ParseFromString fail.");
+      ErrorManager::GetInstance().ATCReportErrMessage("E19021", {"reason"}, {"ParseFromString failed"});
+      GELOGE(FAILED, "ParseFromString failed.");
       return FAILED;
     }
     GetGroupName(model_def);
@@ -979,9 +1065,13 @@ FMK_FUNC_HOST_VISIBILITY Status ConvertPbtxtToJson(const char *model_file, const
     return SUCCESS;
   } catch (google::protobuf::FatalException &e) {
     free_model_data(&model.model_data);
-    GELOGE(FAILED, "ParseFromString fail. exception message : %s", e.what());
+    ErrorManager::GetInstance().ATCReportErrMessage("E19021", {"reason"}, {"ParseFromString failed, exception message["
+        + std::string(e.what()) + "]"});
+    GELOGE(FAILED, "ParseFromString failed. exception message : %s", e.what());
     return FAILED;
   } catch (const std::exception &e) {
+    ErrorManager::GetInstance().ATCReportErrMessage("E19021", {"reason"},
+        {"Convert pbtxt to json failed, exception message[" + std::string(e.what()) + "]"});
     GELOGE(FAILED, "Convert pbtxt to json failed, exception message : %s.", e.what());
     return FAILED;
   }
diff --git a/ge/single_op/single_op.cc b/ge/single_op/single_op.cc
index 371d7110..1f3fc5c5 100755
--- a/ge/single_op/single_op.cc
+++ b/ge/single_op/single_op.cc
@@ -17,6 +17,7 @@
 #include "single_op/single_op.h"
 
 #include "common/fmk_types.h"
+#include "common/ge_types.h"
 #include "common/math/math_util.h"
 #include "common/profiling/profiling_manager.h"
 #include "framework/common/debug/ge_log.h"
@@ -24,19 +25,64 @@
 #include "graph/load/new_model_manager/model_utils.h"
 #include "runtime/mem.h"
 #include "single_op/single_op_manager.h"
+#include "single_op/task/build_task_utils.h"
 #include "graph/load/new_model_manager/model_manager.h"
 
 namespace ge {
 namespace {
 const size_t kDataMemAlignSize = 32;
+const size_t kDataMemAlignUnit = 2;
+const string kShapeTypeDynamic = "dynamic";
+const string kShapeTypeStatic = "static";
 
 size_t GetAlignedSize(size_t size) {
-  size_t aligned_size = (size + 2 * kDataMemAlignSize - 1) / kDataMemAlignSize * kDataMemAlignSize;
+  size_t aligned_size = (size + kDataMemAlignUnit * kDataMemAlignSize - 1) / kDataMemAlignSize * kDataMemAlignSize;
   return aligned_size;
 }
+
+Status ProfilingTaskInfo(OpTask *op_task, const string &shape_type) {
+  if (!ProfilingManager::Instance().ProfilingModelLoadOn()) {
+    return SUCCESS;
+  }
+
+  string model_name;
+  string op_name;
+  uint32_t model_id;
+  uint32_t block_dim;
+  if (op_task->GetProfilingArgs(model_name, op_name, model_id, block_dim) != SUCCESS) {
+    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Get profiling data of task failed");
+    return ACL_ERROR_GE_PARAM_INVALID;
+  }
+  GELOGD("ProfilingReport of op[%s] model[%s] start.", op_name.c_str(), model_name.c_str());
+  std::vector<TaskDescInfo> task_desc_info;
+  uint32_t task_id = 0;
+  uint32_t stream_id = 0;
+  if (rtGetTaskIdAndStreamID(&task_id, &stream_id) != RT_ERROR_NONE) {
+    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Get task_id and stream_id failed.");
+    return ACL_ERROR_GE_PARAM_INVALID;
+  }
+
+  TaskDescInfo tmp_task_desc_info;
+  tmp_task_desc_info.model_name = model_name;
+  tmp_task_desc_info.op_name = op_name;
+  tmp_task_desc_info.block_dim = block_dim;
+  tmp_task_desc_info.task_id = task_id;
+  tmp_task_desc_info.stream_id = stream_id;
+  tmp_task_desc_info.shape_type = shape_type;
+  tmp_task_desc_info.cur_iter_num = 0;
+  GELOGD("GetTaskDescInfo of op [%s] end, task_id[%u], stream_id[%u]", op_name.c_str(), task_id, stream_id);
+  task_desc_info.emplace_back(tmp_task_desc_info);
+
+  std::vector<ComputeGraphDescInfo> compute_graph_info;
+
+  auto &profiling_manager = ProfilingManager::Instance();
+  profiling_manager.ReportProfilingData(model_id, task_desc_info, compute_graph_info);
+  return SUCCESS;
+}
 }  // namespace
 
-SingleOp::SingleOp(std::mutex *stream_mutex, rtStream_t stream) : stream_mutex_(stream_mutex), stream_(stream) {
+SingleOp::SingleOp(StreamResource *stream_resource, std::mutex *stream_mutex, rtStream_t stream)
+    : stream_resource_(stream_resource), stream_mutex_(stream_mutex), stream_(stream) {
 }
 
 FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY SingleOp::~SingleOp() {
@@ -68,7 +114,8 @@ Status SingleOp::ValidateArgs(const std::vector<DataBuffer> &inputs, const std::
 
   auto num_outputs = outputs.size();
   if (num_outputs != output_sizes_.size()) {
-    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "output num mismatch. model expect %zu, but given %zu", output_sizes_.size(), outputs.size());
+    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "output num mismatch. model expect %zu, but given %zu",
+           output_sizes_.size(), outputs.size());
     return ACL_ERROR_GE_PARAM_INVALID;
   }
 
@@ -117,37 +164,6 @@ Status SingleOp::UpdateArgs(const std::vector<DataBuffer> &inputs, const std::ve
       *arg_addr = args_[i];
     }
   }
-  // update aicpu_TF or aicpu_CC args
-  for (auto &task : tasks_) {
-    size_t io_addr_num = args_.size();
-    if (task->GetOpTaskType() == OP_TASK_AICPU) {
-      GELOGD("Update aicpu_TF task args");
-      task->SetIoAddrsForDump(args_);
-      auto *dst_io_addr = const_cast<uintptr_t *>(reinterpret_cast<const uintptr_t *>(task->GetIOAddr()));
-      GE_CHECK_NOTNULL(dst_io_addr);
-      auto rt_ret = rtMemcpyAsync(dst_io_addr,
-                                  sizeof(uint64_t) * args_.size(),
-                                  &args_[0],
-                                  sizeof(uint64_t) * args_.size(),
-                                  RT_MEMCPY_HOST_TO_DEVICE_EX,
-                                  stream_);
-      if (rt_ret != RT_ERROR_NONE) {
-        GELOGE(rt_ret, "rtMemcpyAsync addresses failed, ret = %d", rt_ret);
-        return rt_ret;
-      }
-    } else if (task->GetOpTaskType() == OP_TASK_AICPUCC) {
-      GELOGD("Update aicpu_CC task args");
-      const uintptr_t *task_io_addr = reinterpret_cast<const uintptr_t *>(task->GetIOAddr());
-      GE_CHECK_NOTNULL(task_io_addr);
-      auto io_addr = reinterpret_cast<uint64_t *>(const_cast<uintptr_t *>(task_io_addr));
-      for (size_t i = 0; i < io_addr_num; ++i) {
-        io_addr[i] = static_cast<uintptr_t>(args_[i]);
-      }
-    } else {
-      GELOGW("Only TF_kernel aicpu and aicpu_CC are supported, but got %u", task->GetOpTaskType());
-      continue;
-    }
-  }
   return SUCCESS;
 }
 
@@ -158,7 +174,19 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(c
     return ret;
   }
 
+  GE_CHECK_NOTNULL(stream_resource_);
   std::lock_guard<std::mutex> lk(*stream_mutex_);
+  auto current_mem_base = stream_resource_->GetMemoryBase();
+  if (running_param_->mem_base != current_mem_base) {
+    running_param_->mem_base = const_cast<uint8_t *>(current_mem_base);
+    GELOGD("Memory base changed, new memory base = %p", current_mem_base);
+    for (auto &task : tasks_) {
+      auto new_address = BuildTaskUtils::GetAddresses(task->GetOpdesc(), *running_param_);
+      GE_CHK_STATUS_RET(task->UpdateArgTable(*running_param_),
+                        "[%s] Failed to update arg table",
+                        task->GetOpdesc()->GetName().c_str());
+    }
+  }
   ret = UpdateArgs(inputs, outputs);
   if (ret != SUCCESS) {
     return ret;
@@ -169,6 +197,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(c
     if (ret != SUCCESS) {
       return ret;
     }
+    GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(task, kShapeTypeStatic));
   }
 
   return ret;
@@ -182,9 +211,6 @@ DynamicSingleOp::DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex
     : resource_id_(resource_id), stream_mutex_(stream_mutex), stream_(stream) {
 }
 
-DynamicSingleOp::~DynamicSingleOp() {
-}
-
 Status DynamicSingleOp::ValidateParams(const vector<GeTensorDesc> &input_desc,
                                        const std::vector<DataBuffer> &inputs,
                                        std::vector<GeTensorDesc> &output_desc,
@@ -206,63 +232,24 @@ Status DynamicSingleOp::ValidateParams(const vector<GeTensorDesc> &input_desc,
   }
 
   if (input_desc.size() != num_inputs_) {
-    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Input number mismatches. expect %zu, but given %zu", num_inputs_, input_desc.size());
+    GELOGE(ACL_ERROR_GE_PARAM_INVALID,
+           "Input number mismatches. expect %zu, but given %zu",
+           num_inputs_,
+           input_desc.size());
     return ACL_ERROR_GE_PARAM_INVALID;
   }
 
   if (output_desc.size() != num_outputs_) {
-    GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Output number mismatches. expect %zu, but given %zu", num_outputs_, output_desc.size());
+    GELOGE(ACL_ERROR_GE_PARAM_INVALID,
+           "Output number mismatches. expect %zu, but given %zu",
+           num_outputs_,
+           output_desc.size());
     return ACL_ERROR_GE_PARAM_INVALID;
   }
 
   return SUCCESS;
 }
 
-Status DynamicSingleOp::AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes,
-                                           std::vector<void *> &workspaces) {
-  static const std::string kPurpose("malloc workspace memory for dynamic op.");
-  if (workspace_sizes.empty()) {
-    GELOGD("No need to allocate workspace.");
-    return SUCCESS;
-  }
-  int64_t total_size = 0;
-  std::vector<int64_t> ws_offsets;
-  for (auto ws_size : workspace_sizes) {
-    // alignment and padding should be done in OpParaCalculate
-    GE_CHK_STATUS_RET_NOLOG(CheckInt64AddOverflow(total_size, ws_size));
-    ws_offsets.emplace_back(total_size);
-    total_size += ws_size;
-  }
-
-  GELOGD("Total workspace size is %ld", total_size);
-  StreamResource *stream_resource = SingleOpManager::GetInstance().GetResource(resource_id_, stream_);
-  GE_CHECK_NOTNULL(stream_resource);
-  auto ws_base = stream_resource->MallocMemory(kPurpose, static_cast<size_t>(total_size));
-  if (ws_base == nullptr) {
-    GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to allocate memory of size: %ld", total_size);
-    return ACL_ERROR_GE_MEMORY_ALLOCATION;
-  }
-  GELOGD("Done allocating workspace memory successfully.");
-
-  for (auto ws_offset : ws_offsets) {
-    workspaces.emplace_back(ws_base + ws_offset);
-  }
-
-  return SUCCESS;
-}
-
-Status DynamicSingleOp::ExecuteTbeTask(const vector<GeTensorDesc> &input_desc,
-                                       const vector<void *> &inputs,
-                                       vector<GeTensorDesc> &output_desc,
-                                       vector<void *> &outputs) {
-  GE_CHK_STATUS_RET_NOLOG(op_task_->UpdateRunInfo(input_desc, output_desc));
-
-  std::vector<void *> workspace_buffers;
-  GE_CHK_STATUS_RET_NOLOG(AllocateWorkspaces(op_task_->GetWorkspaceSizes(), workspace_buffers));
-
-  return op_task_->LaunchKernel(inputs, outputs, workspace_buffers, stream_);
-}
-
 Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc,
                                      const vector<DataBuffer> &input_buffers,
                                      vector<GeTensorDesc> &output_desc,
@@ -271,24 +258,8 @@ Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc,
   GE_CHK_STATUS_RET_NOLOG(ValidateParams(input_desc, input_buffers, output_desc, output_buffers));
   std::lock_guard<std::mutex> lk(*stream_mutex_);
 
-  std::vector<void *> inputs;
-  std::vector<void *> outputs;
-  for (auto &buffer : input_buffers) {
-    inputs.emplace_back(buffer.data);
-  }
-  for (auto &buffer : output_buffers) {
-    outputs.emplace_back(buffer.data);
-  }
-
-  if (op_task_->GetOpTaskType() == OP_TASK_TBE) {
-    return ExecuteTbeTask(input_desc, inputs, output_desc, outputs);
-  } else if (op_task_->GetOpTaskType() == OP_TASK_AICPU || op_task_->GetOpTaskType() == OP_TASK_AICPUCC) {
-    return op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_);
-  } else {
-    GELOGE(ACL_ERROR_GE_OP_TASK_TYPE_INVALID,
-           "Only TBE_Task, AI_CPU_Task and AI_CPUCC_Task are supported, but got %u",
-           op_task_->GetOpTaskType());
-    return ACL_ERROR_GE_OP_TASK_TYPE_INVALID;
-  }
+  GE_CHK_STATUS_RET_NOLOG(op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_));
+  GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get(), kShapeTypeDynamic));
+  return SUCCESS;
 }
 }  // namespace ge
diff --git a/ge/single_op/single_op.h b/ge/single_op/single_op.h
index 14ef8ce1..d677f94a 100755
--- a/ge/single_op/single_op.h
+++ b/ge/single_op/single_op.h
@@ -30,9 +30,11 @@
 #include "cce/aicpu_engine_struct.h"
 
 namespace ge {
+class StreamResource;
+struct SingleOpModelParam;
 class SingleOp {
  public:
-  SingleOp(std::mutex *stream_mutex, rtStream_t stream);
+  SingleOp(StreamResource *stream_resource, std::mutex *stream_mutex, rtStream_t stream);
   ~SingleOp();
 
   Status ExecuteAsync(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs);
@@ -44,6 +46,7 @@ class SingleOp {
   Status GetArgs(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs);
 
   friend class SingleOpModel;
+  StreamResource *stream_resource_;
   std::mutex *stream_mutex_;
   rtStream_t stream_ = nullptr;
   std::vector<void *> input_addr_list_;
@@ -54,12 +57,13 @@ class SingleOp {
 
   std::vector<OpTask *> tasks_;
   std::vector<std::vector<uintptr_t *>> arg_table_;
+  std::unique_ptr<SingleOpModelParam> running_param_;
 };
 
 class DynamicSingleOp {
  public:
   DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex_, rtStream_t stream);
-  ~DynamicSingleOp();
+  ~DynamicSingleOp() = default;
   Status ExecuteAsync(const vector<GeTensorDesc> &input_desc,
                       const std::vector<DataBuffer> &inputs,
                       std::vector<GeTensorDesc> &output_desc,
@@ -72,14 +76,6 @@ class DynamicSingleOp {
                         std::vector<GeTensorDesc> &output_desc,
                         std::vector<DataBuffer> &outputs) const;
 
-  Status AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes,
-                            std::vector<void *> &workspaces);
-
-  Status ExecuteTbeTask(const vector<GeTensorDesc> &input_desc,
-                        const vector<void *> &inputs,
-                        vector<GeTensorDesc> &output_desc,
-                        vector<void *> &outputs);
-
   std::unique_ptr<OpTask> op_task_;
   uintptr_t resource_id_ = 0;
   std::mutex *stream_mutex_;
diff --git a/ge/single_op/single_op_model.cc b/ge/single_op/single_op_model.cc
index 49968f4f..2a1a14e6 100755
--- a/ge/single_op/single_op_model.cc
+++ b/ge/single_op/single_op_model.cc
@@ -92,7 +92,8 @@ Status SingleOpModel::InitModelMem(StreamResource &res) {
   if (model_params_.memory_size > model_params_.zero_copy_mem_size) {
     const string purpose("malloc feature map memory on model execute.");
     GELOGI("total memory: %lu, zero_copy_mem: %lu", model_params_.memory_size, model_params_.zero_copy_mem_size);
-    model_params_.mem_base = res.MallocMemory(purpose, model_params_.memory_size - model_params_.zero_copy_mem_size);
+    model_params_.mem_base =
+        res.MallocMemory(purpose, model_params_.memory_size - model_params_.zero_copy_mem_size, false);
     if (model_params_.mem_base == nullptr) {
       return ACL_ERROR_GE_MEMORY_ALLOCATION;
     }
@@ -157,6 +158,7 @@ Status SingleOpModel::LoadAllNodes() {
   auto ge_model = model_helper_.GetGeModel();
   GE_CHECK_NOTNULL(ge_model);
   Graph graph = ge_model->GetGraph();
+  model_id_ = ge_model->GetModelId();
   auto compute_graph = GraphUtils::GetComputeGraph(graph);
   if (compute_graph == nullptr) {
     GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[%s] compute_graph is null", model_name_.c_str());
@@ -225,9 +227,10 @@ Status SingleOpModel::SetInputsAndOutputs(SingleOp &single_op) {
   return SUCCESS;
 }
 
-Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
+Status SingleOpModel::BuildTaskList(StreamResource *stream_resource, SingleOp &single_op) {
   auto ge_model = model_helper_.GetGeModel();
   GE_CHECK_NOTNULL(ge_model);
+  single_op.arg_table_.resize(single_op.input_sizes_.size() + single_op.output_sizes_.size());
   auto tasks = ge_model->GetModelTaskDefPtr()->task();
   for (int i = 0; i < tasks.size(); ++i) {
     const TaskDef &task_def = tasks[i];
@@ -237,8 +240,8 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
     if (task_type == RT_MODEL_TASK_KERNEL) {
       const domi::KernelDef &kernel_def = task_def.kernel();
       const auto &context = kernel_def.context();
-      auto kernel_type = static_cast<cce::ccKernelType>(context.kernel_type());
-      if (kernel_type == cce::ccKernelType::TE) {
+      auto kernel_type = static_cast<ccKernelType>(context.kernel_type());
+      if (kernel_type == ccKernelType::TE) {
         GELOGD("Building TBE task");
         TbeOpTask *tbe_task = nullptr;
         auto ret = BuildKernelTask(task_def.kernel(), &tbe_task);
@@ -246,10 +249,13 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
           return ret;
         }
 
-        single_op.arg_table_.resize(single_op.input_sizes_.size() + single_op.output_sizes_.size());
         ParseArgTable(tbe_task, single_op);
+        tbe_task->SetModelArgs(model_name_, model_id_);
+        if (tbe_task->tiling_buffer_ != nullptr) {
+          tbe_task->stream_resource_ = stream_resource;
+        }
         single_op.tasks_.emplace_back(tbe_task);
-      } else if (kernel_type == cce::ccKernelType::AI_CPU || kernel_type == cce::ccKernelType::CUST_AI_CPU) {
+      } else if (kernel_type == ccKernelType::AI_CPU || kernel_type == ccKernelType::CUST_AI_CPU) {
         GELOGD("Building AICPU_CC task");
         OpTask *task = nullptr;
         uint64_t singleop_kernel_id = aicpu_kernel_id++;
@@ -258,9 +264,12 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
         if (ret != SUCCESS) {
           return ret;
         }
+        task->SetModelArgs(model_name_, model_id_);
+        ParseArgTable(task, single_op);
         single_op.tasks_.emplace_back(task);
       } else {
-        GELOGE(ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID, "Only TBE, AI_CPU, CUST_AI_CPU kernel are supported, but got %u", context.kernel_type());
+        GELOGE(ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID,
+               "Only TBE, AI_CPU, CUST_AI_CPU kernel are supported, but got %u", context.kernel_type());
         return ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID;
       }
     } else if (task_type == RT_MODEL_TASK_KERNEL_EX) {
@@ -273,6 +282,8 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
       if (ret != SUCCESS) {
         return ret;
       }
+      aicpu_task->SetModelArgs(model_name_, model_id_);
+      ParseArgTable(aicpu_task, single_op);
       single_op.tasks_.emplace_back(aicpu_task);
     } else {
       // skip
@@ -282,21 +293,23 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
   return SUCCESS;
 }
 
-void SingleOpModel::ParseArgTable(TbeOpTask *task, SingleOp &op) {
+void SingleOpModel::ParseArgTable(OpTask *task, SingleOp &op) {
   if (task == nullptr) {
     GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "tbe op task is nullptr");
     return;
   }
+
   // args: addr1, addr2, addr3 ...
-  auto *args = const_cast<uintptr_t *>(reinterpret_cast<const uintptr_t *>(task->GetArgs()));
-  size_t arg_size = task->GetArgSize();
-  for (size_t i = 0; i < arg_size / sizeof(void *); ++i) {
-    uintptr_t *ptr_to_addr = args + i;
+  uintptr_t *arg_base = nullptr;
+  size_t arg_num = 0;
+  task->GetIoAddr(arg_base, arg_num);
+  for (size_t i = 0; i < arg_num; ++i) {
+    uintptr_t *ptr_to_addr = arg_base + i;
     uintptr_t addr = *ptr_to_addr;
     auto iter = model_params_.addr_mapping_.find(addr);
     if (iter != model_params_.addr_mapping_.end()) {
       int arg_index = iter->second;
-      GELOGI("%s args[%zu] mapped to user designated args[%d]", task->GetStubName().c_str(), i, arg_index);
+      GELOGI("%s args[%zu] mapped to user designated args[%d]", task->GetOpdesc()->GetName().c_str(), i, arg_index);
       op.arg_table_[iter->second].emplace_back(ptr_to_addr);
     }
   }
@@ -368,7 +381,7 @@ Status SingleOpModel::BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTa
   }
 
   auto builder = AiCpuCCTaskBuilder(iter->second->GetOpDesc(), kernel_def);
-  auto ret = builder.BuildTask(*aicpucc_task, kernel_id);
+  auto ret = builder.BuildTask(*aicpucc_task, kernel_id, model_params_);
   if (ret != SUCCESS) {
     GELOGE(ret, "build aicpu_CC op task failed");
     return ret;
@@ -381,25 +394,29 @@ Status SingleOpModel::BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTa
 Status SingleOpModel::BuildOp(StreamResource &resource, SingleOp &single_op) {
   GE_CHK_STATUS_RET_NOLOG(ParseInputsAndOutputs());
   GE_CHK_STATUS_RET_NOLOG(InitModelMem(resource));
+  single_op.running_param_.reset(new (std::nothrow)SingleOpModelParam(model_params_));
+  GE_CHECK_NOTNULL(single_op.running_param_);
   GE_CHK_STATUS_RET_NOLOG(SetInputsAndOutputs(single_op));
-  return BuildTaskList(single_op);
+  return BuildTaskList(&resource, single_op);
 }
 
 Status SingleOpModel::BuildModelTaskKernel(const TaskDef &task_def, DynamicSingleOp &single_op) {
   const domi::KernelDef &kernel_def = task_def.kernel();
   const auto &context = kernel_def.context();
-  auto kernel_type = static_cast<cce::ccKernelType>(context.kernel_type());
-  if (kernel_type == cce::ccKernelType::TE) {
+  auto kernel_type = static_cast<ccKernelType>(context.kernel_type());
+  if (kernel_type == ccKernelType::TE) {
     GELOGD("Building TBE task");
     TbeOpTask *tbe_task = nullptr;
     GE_CHK_STATUS_RET_NOLOG(BuildKernelTask(task_def.kernel(), &tbe_task));
+    tbe_task->SetModelArgs(model_name_, model_id_);
     single_op.op_task_.reset(tbe_task);
-  } else if (kernel_type == cce::ccKernelType::AI_CPU || kernel_type == cce::ccKernelType::CUST_AI_CPU) {
+  } else if (kernel_type == ccKernelType::AI_CPU || kernel_type == ccKernelType::CUST_AI_CPU) {
     GELOGD("Building AICPU_CC task");
     OpTask *task = nullptr;
     uint64_t dynamic_singleop_kernel_id = aicpu_kernel_id++;
     GELOGI("Build dynamic singleOp CCTask, kernel_id = %lu", dynamic_singleop_kernel_id);
     GE_CHK_STATUS_RET_NOLOG(BuildCpuKernelTask(task_def.kernel(), &task, dynamic_singleop_kernel_id));
+    task->SetModelArgs(model_name_, model_id_);
     single_op.op_task_.reset(task);
   } else {
     GELOGE(ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID,
@@ -446,6 +463,7 @@ Status SingleOpModel::BuildTaskListForDynamicOp(DynamicSingleOp &single_op) {
         const TaskDef &copy_task_def = tasks[i];
         GE_CHK_STATUS_RET_NOLOG(aicpu_task->SetMemCopyTask(copy_task_def.kernel_ex()));
       }
+      aicpu_task->SetModelArgs(model_name_, model_id_);
       single_op.op_task_.reset(aicpu_task);
     } else {
       // skip
@@ -455,10 +473,11 @@ Status SingleOpModel::BuildTaskListForDynamicOp(DynamicSingleOp &single_op) {
   return SUCCESS;
 }
 
-Status SingleOpModel::BuildDynamicOp(DynamicSingleOp &single_op) {
+Status SingleOpModel::BuildDynamicOp(StreamResource &resource, DynamicSingleOp &single_op) {
   single_op.num_inputs_ = data_ops_.size();
   single_op.num_outputs_ = netoutput_op_->GetAllInputsSize();
-  ParseOpModelParams(model_helper_, model_params_);
+  GE_CHK_STATUS_RET_NOLOG(InitModelMem(resource));
+  model_params_.memory_size = UINT_MAX;
   return BuildTaskListForDynamicOp(single_op);
 }
 }  // namespace ge
diff --git a/ge/single_op/single_op_model.h b/ge/single_op/single_op_model.h
index 50aeb7ab..6d0109fe 100755
--- a/ge/single_op/single_op_model.h
+++ b/ge/single_op/single_op_model.h
@@ -52,7 +52,7 @@ class SingleOpModel {
 
   Status Init();
   Status BuildOp(StreamResource &resource, SingleOp &single_op);
-  Status BuildDynamicOp(DynamicSingleOp &single_op);
+  Status BuildDynamicOp(StreamResource &resource, DynamicSingleOp &single_op);
 
  private:
   Status InitModel();
@@ -65,7 +65,7 @@ class SingleOpModel {
   Status ParseInputNode(const OpDescPtr &op_desc);
   void ParseOutputNode(const OpDescPtr &op_desc);
 
-  Status BuildTaskList(SingleOp &single_op);
+  Status BuildTaskList(StreamResource *stream_resource, SingleOp &single_op);
   Status BuildTaskListForDynamicOp(DynamicSingleOp &dynamic_single_op);
   Status BuildKernelTask(const domi::KernelDef &kernel_def, TbeOpTask **task);
   Status BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task,
@@ -74,9 +74,10 @@ class SingleOpModel {
   Status BuildModelTaskKernel(const domi::TaskDef &task_def, DynamicSingleOp &single_op);
 
   static void ParseOpModelParams(ModelHelper &model_helper, SingleOpModelParam &param);
-  void ParseArgTable(TbeOpTask *task, SingleOp &op);
+  void ParseArgTable(OpTask *task, SingleOp &op);
 
   std::string model_name_;
+  uint32_t model_id_ = 0;
   const void *ori_model_data_;
   uint32_t ori_model_size_;
 
diff --git a/ge/single_op/stream_resource.cc b/ge/single_op/stream_resource.cc
index f545b6c8..db6b7c47 100755
--- a/ge/single_op/stream_resource.cc
+++ b/ge/single_op/stream_resource.cc
@@ -69,11 +69,25 @@ uint8_t *StreamResource::DoMallocMemory(const std::string &purpose,
                                         size_t size,
                                         size_t &max_allocated,
                                         std::vector<uint8_t *> &allocated) {
+  if (size == 0) {
+    GELOGD("Mem size == 0");
+    return nullptr;
+  }
+
   if (size <= max_allocated && !allocated.empty()) {
     GELOGD("reuse last memory");
     return allocated.back();
   }
 
+  if (!allocated.empty()) {
+    uint8_t *current_buffer = allocated.back();
+    allocated.pop_back();
+    if (rtStreamSynchronize(stream_) != RT_ERROR_NONE) {
+      GELOGW("Failed to invoke rtStreamSynchronize");
+    }
+    (void) rtFree(current_buffer);
+  }
+
   uint8_t *buffer = nullptr;
   auto ret = rtMalloc(reinterpret_cast<void **>(&buffer), size, RT_MEMORY_HBM);
   if (ret != RT_ERROR_NONE) {
@@ -96,10 +110,14 @@ uint8_t *StreamResource::DoMallocMemory(const std::string &purpose,
   return buffer;
 }
 
-uint8_t *StreamResource::MallocMemory(const std::string &purpose, size_t size) {
+uint8_t *StreamResource::MallocMemory(const std::string &purpose, size_t size, bool holding_lock) {
   GELOGD("To Malloc memory, size = %zu", size);
-  uint8_t *buffer = DoMallocMemory(purpose, size, max_memory_size_, memory_list_);
-  return buffer;
+  if (holding_lock) {
+    return DoMallocMemory(purpose, size, max_memory_size_, memory_list_);
+  } else {
+    std::lock_guard<std::mutex> lk(stream_mu_);
+    return DoMallocMemory(purpose, size, max_memory_size_, memory_list_);
+  }
 }
 
 uint8_t *StreamResource::MallocWeight(const std::string &purpose, size_t size) {
@@ -137,7 +155,8 @@ Status StreamResource::BuildDynamicOperator(const string &model_name,
   GE_CHECK_NOTNULL(new_op);
 
   GELOGI("To build operator: %s", model_name.c_str());
-  GE_CHK_STATUS_RET(model.BuildDynamicOp(*new_op), "Build op failed. op = %s, ret = %u", model_name.c_str(), ret);
+  GE_CHK_STATUS_RET(model.BuildDynamicOp(*this, *new_op),
+                    "Build op failed. op = %s, ret = %u", model_name.c_str(), ret);
   *single_op = new_op.get();
   dynamic_op_map_[model_data.model_data] = std::move(new_op);
   return SUCCESS;
@@ -158,7 +177,7 @@ Status StreamResource::BuildOperator(const string &model_name, const ModelData &
     return ret;
   }
 
-  auto new_op = std::unique_ptr<SingleOp>(new(std::nothrow) SingleOp(&stream_mu_, stream_));
+  auto new_op = std::unique_ptr<SingleOp>(new(std::nothrow) SingleOp(this, &stream_mu_, stream_));
   if (new_op == nullptr) {
     GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "new SingleOp failed");
     return ACL_ERROR_GE_MEMORY_ALLOCATION;
@@ -171,4 +190,12 @@ Status StreamResource::BuildOperator(const string &model_name, const ModelData &
   op_map_[model_data.model_data] = std::move(new_op);
   return SUCCESS;
 }
+
+const uint8_t *StreamResource::GetMemoryBase() const {
+  if (memory_list_.empty()) {
+    return nullptr;
+  }
+
+  return memory_list_.back();
+}
 }  // namespace ge
diff --git a/ge/single_op/stream_resource.h b/ge/single_op/stream_resource.h
index 39f08ebe..d5bc941a 100755
--- a/ge/single_op/stream_resource.h
+++ b/ge/single_op/stream_resource.h
@@ -45,8 +45,9 @@ class StreamResource {
   Status BuildOperator(const std::string &model_name, const ModelData &model_data, SingleOp **single_op);
   Status BuildDynamicOperator(const std::string &model_name, const ModelData &model_data, DynamicSingleOp **single_op);
 
-  uint8_t *MallocMemory(const std::string &purpose, size_t size);
+  uint8_t *MallocMemory(const std::string &purpose, size_t size, bool holding_lock = true);
   uint8_t *MallocWeight(const std::string &purpose, size_t size);
+  const uint8_t *GetMemoryBase() const;
 
  private:
   uint8_t *DoMallocMemory(const std::string &purpose,
diff --git a/ge/single_op/task/aicpu_kernel_task_builder.cc b/ge/single_op/task/aicpu_kernel_task_builder.cc
index 26f6a166..2a5f968f 100755
--- a/ge/single_op/task/aicpu_kernel_task_builder.cc
+++ b/ge/single_op/task/aicpu_kernel_task_builder.cc
@@ -15,19 +15,24 @@
  */
 
 #include "single_op/task/aicpu_kernel_task_builder.h"
-#include "cce/taskdown_common.hpp"
+#include "framework/common/taskdown_common.h"
 #include "graph/load/new_model_manager/model_manager.h"
+#include "build_task_utils.h"
 
 namespace ge {
 AiCpuCCTaskBuilder::AiCpuCCTaskBuilder(const OpDescPtr &op_desc, const domi::KernelDef &kernel_def)
     : op_desc_(op_desc), kernel_def_(kernel_def) {}
 
-Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task) {
+Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task, const SingleOpModelParam &param) {
   size_t aicpu_arg_size = kernel_def_.args_size();
-  if (aicpu_arg_size <= 0) {
+  if (aicpu_arg_size <= sizeof(aicpu::AicpuParamHead)) {
     GELOGE(ACL_ERROR_GE_PARAM_INVALID, "aicpu_arg_size is invalid, value = %zu", aicpu_arg_size);
     return ACL_ERROR_GE_PARAM_INVALID;
   }
+
+  task.io_addr_num_ = op_desc_->GetInputsSize() + op_desc_->GetOutputsSize();
+  GE_CHECK_GE(aicpu_arg_size - sizeof(aicpu::AicpuParamHead), task.io_addr_num_ * sizeof(void *));
+
   std::unique_ptr<uint8_t[]> aicpu_args;
   aicpu_args.reset(new(std::nothrow) uint8_t[aicpu_arg_size]());
   if (aicpu_args == nullptr) {
@@ -41,13 +46,19 @@ Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task) {
     return ACL_ERROR_GE_INTERNAL_ERROR;
   }
 
-  task.SetIoAddr(aicpu_args.get() + sizeof(aicpu::AicpuParamHead));
+  task.SetIoAddr(reinterpret_cast<uintptr_t *>(aicpu_args.get() + sizeof(aicpu::AicpuParamHead)));
   task.SetKernelArgs(std::move(aicpu_args), aicpu_arg_size);
+
+  auto addresses = BuildTaskUtils::GetKernelArgs(op_desc_, param);
+  GE_CHECK_GE(addresses.size(), task.io_addr_num_);
+  for (size_t i = 0; i < task.io_addr_num_; ++i) {
+    task.io_addr_[i] = reinterpret_cast<uintptr_t>(addresses[i]);
+  }
   return SUCCESS;
 }
 
-Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id) {
-  auto ret = SetKernelArgs(task);
+Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id, const SingleOpModelParam &param) {
+  auto ret = SetKernelArgs(task, param);
   if (ret != SUCCESS) {
     return ret;
   }
@@ -55,15 +66,20 @@ Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id) {
   const std::string &kernel_name = kernel_def_.kernel_name();
   task.SetSoName(so_name);
   task.SetkernelName(kernel_name);
+  GE_CHECK_NOTNULL(op_desc_);
   task.op_desc_ = op_desc_;
 
   const auto &context = kernel_def_.context();
-  auto kernel_type = static_cast<cce::ccKernelType>(context.kernel_type());
-  if (kernel_type == cce::ccKernelType::CUST_AI_CPU) {
+  auto kernel_type = static_cast<ccKernelType>(context.kernel_type());
+  if (kernel_type == ccKernelType::CUST_AI_CPU) {
     task.is_custom_ = true;
     task.dump_flag_ |= RT_KERNEL_CUSTOM_AICPU;
-    GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc_, so_name), "launch cust aicpu so failed");
-    GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "launch cust aicpu so failed.");
+    bool loaded = false;
+    GE_CHK_STATUS_RET(ModelManager::GetInstance()->LoadCustAicpuSo(op_desc_, so_name, loaded), 
+                      "launch cust aicpu so failed");
+    if (!loaded) {
+      GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "launch cust aicpu so failed.");
+    }
   }
 
   task.num_inputs_ = op_desc_->GetInputsSize();
@@ -81,13 +97,22 @@ Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id) {
     GELOGE(ret, "Init ext info failed.");
     return ret;
   }
+  GE_CHK_STATUS_RET(task.SetInputConst(), "AiCpuCCTask set input_const failed.");
 
+  if (task.GetUnknownType() == DEPEND_COMPUTE) {
+    GELOGE(FAILED, "AiCpuCCTask unknown type is depend compute, it's not supported now.");
+    return FAILED;
+  }
   auto aicpu_param_head = reinterpret_cast<aicpu::AicpuParamHead *>(task.args_.get());
   if (task.ext_info_addr_dev_ != nullptr) {
     aicpu_param_head->extInfoLength = kernel_ext_info.size();
     aicpu_param_head->extInfoAddr = reinterpret_cast<uintptr_t>(task.ext_info_addr_dev_);
   }
 
+  task.op_type_ = op_desc_->GetName();
+  task.kernel_id_ = kernel_id;
+  auto debug_info = BuildTaskUtils::GetTaskInfo(op_desc_);
+  GELOGI("[TASK_INFO] %lu/%s %s", kernel_id, task.op_type_.c_str(), debug_info.c_str());
   return SUCCESS;
 }
 }  // namespace ge
\ No newline at end of file
diff --git a/ge/single_op/task/aicpu_kernel_task_builder.h b/ge/single_op/task/aicpu_kernel_task_builder.h
index e77e3c10..85d5034d 100755
--- a/ge/single_op/task/aicpu_kernel_task_builder.h
+++ b/ge/single_op/task/aicpu_kernel_task_builder.h
@@ -30,10 +30,10 @@ class AiCpuCCTaskBuilder {
   explicit AiCpuCCTaskBuilder(const OpDescPtr &op_desc, const domi::KernelDef &kernel_def);
   ~AiCpuCCTaskBuilder() = default;
 
-  Status BuildTask(AiCpuCCTask &task, uint64_t kernel_id);
+  Status BuildTask(AiCpuCCTask &task, uint64_t kernel_id, const SingleOpModelParam &param);
 
  private:
-  Status SetKernelArgs(AiCpuCCTask &task);
+  Status SetKernelArgs(AiCpuCCTask &task, const SingleOpModelParam &param);
   const OpDescPtr op_desc_;
   const domi::KernelDef &kernel_def_;
 };
diff --git a/ge/single_op/task/aicpu_task_builder.cc b/ge/single_op/task/aicpu_task_builder.cc
index 8f28ffda..1bfbcb3c 100755
--- a/ge/single_op/task/aicpu_task_builder.cc
+++ b/ge/single_op/task/aicpu_task_builder.cc
@@ -26,26 +26,6 @@ namespace ge {
   AiCpuTaskBuilder::AiCpuTaskBuilder(const OpDescPtr &op_desc, const domi::KernelExDef &kernel_def)
       : op_desc_(op_desc), kernel_def_(kernel_def) {}
 
-  Status AiCpuTaskBuilder::SetInputOutputAddr(void **io_addr, const std::vector<void *> &addresses) {
-    size_t arg_size = kernel_def_.args_size();
-    auto rt_ret = rtMalloc(io_addr, arg_size, RT_MEMORY_HBM);
-    if (rt_ret != RT_ERROR_NONE) {
-      GELOGE(rt_ret, "rtMalloc failed, size = %zu, ret = %d", arg_size, rt_ret);
-      return rt_ret;
-    }
-
-    const void *src_addr = reinterpret_cast<const void *>(addresses.data());
-    uint64_t src_len = sizeof(void *) * addresses.size();
-    rt_ret = rtMemcpy(*io_addr, arg_size, src_addr, src_len, RT_MEMCPY_HOST_TO_DEVICE);
-    if (rt_ret != RT_ERROR_NONE) {
-      (void)rtFree(*io_addr);
-      GELOGE(rt_ret, "rtMemcpy addresses failed, ret = %d", rt_ret);
-      return rt_ret;
-    }
-
-    return SUCCESS;
-  }
-
   Status AiCpuTaskBuilder::SetFmkOpKernel(void *io_addr, void *ws_addr, STR_FWK_OP_KERNEL &fwk_op_kernel) {
     auto sec_ret = memcpy_s(&fwk_op_kernel, sizeof(STR_FWK_OP_KERNEL),
                             kernel_def_.args().data(), kernel_def_.args().size());
@@ -80,39 +60,27 @@ namespace ge {
     return SUCCESS;
   }
 
-  Status AiCpuTaskBuilder::InitWorkspaceAndIO(void **io_addr, void **kernel_workspace,
-                                              const SingleOpModelParam &param, bool dynamic_flag) {
+  Status AiCpuTaskBuilder::InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam &param, bool dynamic_flag) {
     if (kernel_def_.args_size() > sizeof(STR_FWK_OP_KERNEL)) {
       GELOGE(ACL_ERROR_GE_PARAM_INVALID, "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d",
              sizeof(STR_FWK_OP_KERNEL), kernel_def_.args_size());
       return ACL_ERROR_GE_PARAM_INVALID;
     }
-    auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param);
-    auto ws_addr_vec = addresses.at(BuildTaskUtils::kAddressIndexWorkspace);
-
-    if (dynamic_flag) {
-      GE_CHK_RT_RET(rtMalloc(kernel_workspace, kernel_def_.task_info_size(), RT_MEMORY_HBM));
-    } else {
-      if (ws_addr_vec.empty()) {
-        GELOGE(ACL_ERROR_GE_PARAM_INVALID, "workspace Data Address is empty.");
-        return ACL_ERROR_GE_PARAM_INVALID;
-      }
-      *kernel_workspace = ws_addr_vec[0];
-    }
-    GE_CHK_RT_RET(rtMemcpy(*kernel_workspace, kernel_def_.task_info_size(),
+    GE_CHK_RT_RET(rtMalloc(&task.workspace_addr_, kernel_def_.task_info_size(), RT_MEMORY_HBM));
+    GE_CHK_RT_RET(rtMemcpy(task.workspace_addr_, kernel_def_.task_info_size(),
                            kernel_def_.task_info().data(), kernel_def_.task_info_size(),
                            RT_MEMCPY_HOST_TO_DEVICE));
 
-    auto ret = SetInputOutputAddr(io_addr, BuildTaskUtils::JoinAddresses(addresses));
-    if (ret != SUCCESS) {
-      return ret;
-    }
+    auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param, false);
+    task.io_addr_host_ = BuildTaskUtils::JoinAddresses(addresses);
+    task.io_addr_size_ = task.io_addr_host_.size() * sizeof(void *);
+    GE_CHK_RT_RET(rtMalloc(&task.io_addr_, task.io_addr_size_, RT_MEMORY_HBM));
     return SUCCESS;
   }
 
   Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam &param,
                                      bool dynamic_flag, uint64_t kernel_id) {
-    GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(&task.io_addr_, &task.workspace_addr_, param, dynamic_flag));
+    GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(task, param, dynamic_flag));
 
     STR_FWK_OP_KERNEL fwk_op_kernel = {0};
     auto ret = SetFmkOpKernel(task.io_addr_, task.workspace_addr_, fwk_op_kernel);
@@ -120,6 +88,7 @@ namespace ge {
       return ret;
     }
 
+    GE_CHECK_NOTNULL(op_desc_);
     task.op_desc_ = op_desc_;
     task.num_inputs_ = op_desc_->GetInputsSize();
     task.num_outputs_ = op_desc_->GetOutputsSize();
@@ -136,6 +105,7 @@ namespace ge {
       fwk_op_kernel.fwkKernelBase.fwk_kernel.extInfoAddr =  reinterpret_cast<uintptr_t>(task.ext_info_addr_dev_);
       fwk_op_kernel.fwkKernelBase.fwk_kernel.extInfoLen = kernel_ext_info_size;
     }
+    GE_CHK_STATUS_RET(task.SetInputConst(), "AiCpuTask set input_const failed.");
     GE_CHK_STATUS_RET(task.InitForSummaryAndCopy(), "AiCpuTask init for summary and copy task failed.");
 
     fwk_op_kernel.fwkKernelBase.fwk_kernel.sessionID = ULLONG_MAX;
@@ -153,7 +123,7 @@ namespace ge {
     task.kernel_id_ = kernel_id;
 
     auto debug_info = BuildTaskUtils::GetTaskInfo(op_desc_);
-    GELOGI("[TASK_INFO] %s/%s %s", std::to_string(kernel_id).c_str(), task.op_type_.c_str(), debug_info.c_str());
+    GELOGI("[TASK_INFO] %lu/%s %s", kernel_id, task.op_type_.c_str(), debug_info.c_str());
     return SUCCESS;
   }
 }  // namespace ge
diff --git a/ge/single_op/task/aicpu_task_builder.h b/ge/single_op/task/aicpu_task_builder.h
index 4669e118..fe9c9bc2 100755
--- a/ge/single_op/task/aicpu_task_builder.h
+++ b/ge/single_op/task/aicpu_task_builder.h
@@ -33,10 +33,8 @@ namespace ge {
 
   private:
     static Status SetKernelArgs(void **args, STR_FWK_OP_KERNEL &kernel);
-    Status SetInputOutputAddr(void **io_addr, const std::vector<void *> &addresses);
     Status SetFmkOpKernel(void *io_addr, void *ws_addr, STR_FWK_OP_KERNEL &kernel);
-    Status InitWorkspaceAndIO(void **io_addr, void **kernel_workspace,
-                              const SingleOpModelParam &param, bool dynamic_flag);
+    Status InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam &param, bool dynamic_flag);
 
     const OpDescPtr op_desc_;
     const domi::KernelExDef &kernel_def_;
diff --git a/ge/single_op/task/build_task_utils.cc b/ge/single_op/task/build_task_utils.cc
index 29f1657b..071e514b 100644
--- a/ge/single_op/task/build_task_utils.cc
+++ b/ge/single_op/task/build_task_utils.cc
@@ -32,7 +32,8 @@ const uint64_t kVarSize = 0;
 }
 
 std::vector<std::vector<void *>> BuildTaskUtils::GetAddresses(const OpDescPtr &op_desc,
-                                                              const SingleOpModelParam &param) {
+                                                              const SingleOpModelParam &param,
+                                                              bool keep_workspace) {
   std::vector<std::vector<void *>> ret;
   RuntimeParam runtime_para;
   runtime_para.mem_size = param.memory_size;
@@ -49,7 +50,9 @@ std::vector<std::vector<void *>> BuildTaskUtils::GetAddresses(const OpDescPtr &o
 
   ret.emplace_back(ModelUtils::GetInputDataAddrs(runtime_para, op_desc));
   ret.emplace_back(ModelUtils::GetOutputDataAddrs(runtime_para, op_desc));
-  ret.emplace_back(ModelUtils::GetWorkspaceDataAddrs(runtime_para, op_desc));
+  if (keep_workspace) {
+    ret.emplace_back(ModelUtils::GetWorkspaceDataAddrs(runtime_para, op_desc));
+  }
   return ret;
 }
 
diff --git a/ge/single_op/task/build_task_utils.h b/ge/single_op/task/build_task_utils.h
index cddc7a2b..7a2369e4 100644
--- a/ge/single_op/task/build_task_utils.h
+++ b/ge/single_op/task/build_task_utils.h
@@ -27,15 +27,17 @@
 namespace ge {
 class BuildTaskUtils {
  public:
+  static constexpr int kAddressIndexOutput = 1;
   static constexpr int kAddressIndexWorkspace = 2;
 
-  static std::vector<std::vector<void *>> GetAddresses(const OpDescPtr &op_desc, const SingleOpModelParam &param);
+  static std::vector<std::vector<void *>> GetAddresses(const OpDescPtr &op_desc,
+                                                       const SingleOpModelParam &param,
+                                                       bool keep_workspace = true);
   static std::vector<void *> JoinAddresses(const std::vector<std::vector<void *>> &addresses);
   static std::vector<void *> GetKernelArgs(const OpDescPtr &op_desc, const SingleOpModelParam &param);
   static std::string GetTaskInfo(const OpDescPtr &op_desc);
   template<typename T>
-  static std::string VectorToString(const std::vector<T> &values)
-  {
+  static std::string VectorToString(const std::vector<T> &values) {
     std::stringstream ss;
     ss << '[';
     auto size = values.size();
diff --git a/ge/single_op/task/op_task.cc b/ge/single_op/task/op_task.cc
index c3c4e5bb..cc63e811 100755
--- a/ge/single_op/task/op_task.cc
+++ b/ge/single_op/task/op_task.cc
@@ -24,9 +24,11 @@
 #include "common/dump/dump_manager.h"
 #include "common/dump/dump_op.h"
 #include "common/formats/formats.h"
+#include "common/math/math_util.h"
 #include "framework/common/debug/log.h"
 #include "register/op_tiling.h"
 #include "runtime/rt.h"
+#include "build_task_utils.h"
 
 namespace ge {
 namespace {
@@ -48,18 +50,22 @@ Status OpTask::OpenDump(rtStream_t stream) {
     std::vector<uint64_t> output_adds;
     auto input_size = op_desc_->GetInputsSize();
     auto output_size = op_desc_->GetOutputsSize();
-    auto all_size = io_addrs_for_dump_.size();
-    if (input_size + output_size != all_size) {
-      GELOGE(FAILED, "io_addrs_for_dump_ size %zu is not equal input and output size %zu", all_size,
+    uintptr_t *arg_base = nullptr;
+    size_t arg_num = 0;
+    GetIoAddr(arg_base, arg_num);
+    if (arg_num < input_size + output_size) {
+      GELOGE(FAILED, "io_addrs_for_dump_ size %zu is not equal input and output size %zu",
+             arg_num,
              input_size + output_size);
       return FAILED;
     }
+
     for (size_t i = 0; i < input_size; i++) {
-      uint64_t input_addr = io_addrs_for_dump_[i];
+      uint64_t input_addr = arg_base[i];
       input_addrs.emplace_back(input_addr);
     }
     for (size_t j = 0; j < output_size; j++) {
-      uint64_t output_addr = io_addrs_for_dump_[input_size + j];
+      uint64_t output_addr = arg_base[input_size + j];
       output_adds.emplace_back(output_addr);
     }
     dump_op_.SetDumpInfo(DumpManager::GetInstance().GetDumpProperties(), op_desc_, input_addrs, output_adds, stream);
@@ -89,9 +95,55 @@ void TbeOpTask::SetKernelArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size
 
 void TbeOpTask::SetSmDesc(void *sm_desc) { sm_desc_ = sm_desc; }
 
-const vector<int64_t> &OpTask::GetWorkspaceSizes() const { return workspace_sizes_; }
+void OpTask::SetModelArgs(std::string model_name, uint32_t model_id) {
+  model_name_ = model_name;
+  model_id_ = model_id;
+}
+
+Status OpTask::GetProfilingArgs(std::string &model_name, std::string &op_name, uint32_t &model_id,
+                                uint32_t &block_dim) {
+  model_name = model_name_;
+  model_id = model_id_;
+  block_dim = block_dim_;
+  GE_CHECK_NOTNULL(op_desc_);
+  op_name = op_desc_->GetName();
+  return SUCCESS;
+}
+Status OpTask::UpdateRunInfo(const vector<GeTensorDesc> &input_desc, const vector<GeTensorDesc> &output_desc) {
+  return UNSUPPORTED;
+}
+
+Status OpTask::DoUpdateArgTable(const SingleOpModelParam &param, bool keep_workspace) {
+  auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param, keep_workspace);
+  auto all_addresses = BuildTaskUtils::JoinAddresses(addresses);
+  uintptr_t *arg_base = nullptr;
+  size_t arg_num = 0;
+  GetIoAddr(arg_base, arg_num);
+  if (arg_num < all_addresses.size()) {
+    GELOGE(INTERNAL_ERROR, "[%s] arg number mismatches, expect at least = %zu, but got = %zu",
+           op_desc_->GetName().c_str(),
+           all_addresses.size(),
+           arg_num);
+    return INTERNAL_ERROR;
+  }
 
-void OpTask::SetWorkspaceSizes(const vector<int64_t> &workspace_sizes) { workspace_sizes_ = workspace_sizes; }
+  for (void *addr : all_addresses) {
+    *arg_base++ = reinterpret_cast<uintptr_t >(addr);
+  }
+  return SUCCESS;
+}
+
+Status OpTask::UpdateArgTable(const SingleOpModelParam &param) {
+  return DoUpdateArgTable(param, true);
+}
+
+Status OpTask::LaunchKernel(const vector<GeTensorDesc> &input_desc,
+                            const vector<DataBuffer> &input_buffers,
+                            vector<GeTensorDesc> &output_desc,
+                            vector<DataBuffer> &output_buffers,
+                            rtStream_t stream) {
+  return UNSUPPORTED;
+}
 
 TbeOpTask::~TbeOpTask() {
   if (sm_desc_ != nullptr) {
@@ -126,12 +178,6 @@ Status TbeOpTask::LaunchKernel(rtStream_t stream) {
     return RT_FAILED;
   }
   GELOGI("[TASK_INFO] %s", this->stub_name_.c_str());
-
-  size_t input_size = op_desc_->GetInputsSize();
-  size_t output_size = op_desc_->GetOutputsSize();
-  uint64_t *io_addr = reinterpret_cast<uint64_t *>(args_.get());
-  std::vector<uint64_t> io_addrs(io_addr, io_addr + input_size + output_size);
-  SetIoAddrsForDump(io_addrs);
   auto status = OpenDump(stream);
   if (status != SUCCESS) {
     GELOGE(status, "Open dump failed in the tbe single op %s", this->stub_name_.c_str());
@@ -152,11 +198,12 @@ Status TbeOpTask::UpdateRunInfo(const vector<GeTensorDesc> &input_desc, const ve
     GELOGE(FAILED, "Failed to invoke OpParaCalculate. ret = %u", ret);
     return FAILED;
   }
-  SetWorkspaceSizes(run_info.workspaces);
   block_dim_ = run_info.block_dim;
   tiling_data_ = run_info.tiling_data.str();
   GELOGD("Done invoking OpParaCalculate successfully. block_dim = %u, tiling size = %zu", block_dim_,
          tiling_data_.size());
+
+  GE_CHK_STATUS_RET(AllocateWorkspaces(run_info.workspaces), "Failed to allocate workspaces");
   return SUCCESS;
 }
 
@@ -212,13 +259,54 @@ void TbeOpTask::EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, s
   max_tiling_size_ = max_tiling_size;
 }
 
-Status TbeOpTask::LaunchKernel(const vector<void *> &inputs, const vector<void *> &outputs,
-                               const vector<void *> &workspaces, rtStream_t stream) {
+Status TbeOpTask::AllocateWorkspaces(const vector<int64_t> &workspace_sizes) {
+  static const std::string kPurpose("malloc workspace memory for dynamic op.");
+  if (workspace_sizes.empty()) {
+    GELOGD("No need to allocate workspace.");
+    return SUCCESS;
+  }
+  int64_t total_size = 0;
+  std::vector<int64_t> ws_offsets;
+  for (auto ws_size : workspace_sizes) {
+    // alignment and padding should be done in OpParaCalculate
+    GE_CHK_STATUS_RET_NOLOG(CheckInt64AddOverflow(total_size, ws_size));
+    ws_offsets.emplace_back(total_size);
+    total_size += ws_size;
+  }
+
+  GELOGD("Total workspace size is %ld", total_size);
+  GE_CHECK_NOTNULL(stream_resource_);
+  auto ws_base = stream_resource_->MallocMemory(kPurpose, static_cast<size_t>(total_size));
+  if (ws_base == nullptr) {
+    GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to allocate memory of size: %ld", total_size);
+    return ACL_ERROR_GE_MEMORY_ALLOCATION;
+  }
+  GELOGD("Done allocating workspace memory successfully.");
+
+  for (auto ws_offset : ws_offsets) {
+    workspaces_.emplace_back(ws_base + ws_offset);
+  }
+
+  return SUCCESS;
+}
+
+Status TbeOpTask::LaunchKernel(const vector<GeTensorDesc> &input_desc,
+                               const vector<DataBuffer> &input_buffers,
+                               vector<GeTensorDesc> &output_desc,
+                               vector<DataBuffer> &output_buffers,
+                               rtStream_t stream) {
+  GE_CHK_STATUS_RET_NOLOG(UpdateRunInfo(input_desc, output_desc));
   GELOGD("[%s] Start to launch kernel", node_->GetName().c_str());
   std::vector<void *> args;
-  args.insert(args.end(), inputs.begin(), inputs.end());
-  args.insert(args.end(), outputs.begin(), outputs.end());
-  args.insert(args.end(), workspaces.begin(), workspaces.end());
+  for (auto &buffer : input_buffers) {
+    args.emplace_back(buffer.data);
+  }
+  for (auto &buffer : output_buffers) {
+    args.emplace_back(buffer.data);
+  }
+  for (auto &buffer : workspaces_) {
+    args.emplace_back(buffer);
+  }
 
   if (tiling_buffer_ != nullptr) {
     GELOGD("[%s] Start to copy tiling info. size = %zu", node_->GetName().c_str(), tiling_data_.size());
@@ -239,6 +327,14 @@ Status TbeOpTask::LaunchKernel(const vector<void *> &inputs, const vector<void *
   return SUCCESS;
 }
 
+void TbeOpTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
+  arg_base = reinterpret_cast<uintptr_t *>(args_.get());
+  arg_count = arg_size_ / sizeof(void *);
+  if (tiling_buffer_ != nullptr) {
+    --arg_count;
+  }
+}
+
 AiCpuBaseTask::~AiCpuBaseTask() {
   if (ext_info_addr_dev_ != nullptr) {
     (void)rtFree(ext_info_addr_dev_);
@@ -278,6 +374,25 @@ Status AiCpuBaseTask::SetExtInfoAndType(const std::string &kernel_ext_info, uint
   return SUCCESS;
 }
 
+Status AiCpuBaseTask::SetInputConst() {
+  input_is_const_.clear();
+  const vector<bool> v_is_input_const = op_desc_->GetIsInputConst();
+  for (size_t i = 0; i < op_desc_->GetAllInputsSize(); ++i) {
+    const GeTensorDescPtr tensor_desc = op_desc_->MutableInputDesc(static_cast<uint32_t>(i));
+    if (tensor_desc == nullptr) {
+      GELOGD("SingleOp: %s, Index: %zu, has no input", op_desc_->GetName().c_str(), i);
+      continue;
+    }
+    if (i < v_is_input_const.size() && v_is_input_const[i]) {
+      GELOGD("SingleOp: %s, Index: %zu, input is const", op_desc_->GetName().c_str(), i);
+      input_is_const_.push_back(true);
+      continue;
+    }
+    input_is_const_.push_back(false);
+  }
+  return SUCCESS;
+}
+
 Status AiCpuBaseTask::UpdateExtInfo(const std::vector<GeTensorDesc> &input_desc, 
                                     std::vector<GeTensorDesc> &output_desc,
                                     rtStream_t stream) {
@@ -288,9 +403,23 @@ Status AiCpuBaseTask::UpdateExtInfo(const std::vector<GeTensorDesc> &input_desc,
   }
 
   GE_CHECK_NOTNULL(aicpu_ext_handle_);
-  for (size_t i = 0; i < num_inputs_; ++i) {
-    GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateInputShapeAndType(i, input_desc[i]),
-                      "Input[%zu] update input shape failed.", i);
+
+  size_t non_const_index = 0;
+  for (size_t input_index = 0; input_index < num_inputs_; input_index++) {
+    if (input_index < input_is_const_.size() && input_is_const_[input_index]) {
+      // get input_desc from op_desc if const input, num_inputs_ is op_desc_ input_size
+      auto const_input_desc = op_desc_->MutableInputDesc(static_cast<uint32_t>(input_index));
+      GE_CHECK_NOTNULL(const_input_desc);
+      GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateInputShapeAndType(input_index, *const_input_desc),
+                        "Input[%zu] update input shape failed.", input_index);
+      continue;
+    }
+    GE_CHK_BOOL_RET_STATUS(non_const_index < input_desc.size(), PARAM_INVALID,
+                           "Input_desc size is %zu, but get non_const_index is %zu",
+                           input_desc.size(), non_const_index);
+    GE_CHK_STATUS_RET(aicpu_ext_handle_->UpdateInputShapeAndType(input_index, input_desc[non_const_index]),
+                      "Input[%zu] update input shape failed.", input_index);
+    non_const_index++;
   }
 
   if (unknown_type_ != DEPEND_COMPUTE) {
@@ -363,6 +492,41 @@ Status AiCpuBaseTask::UpdateShapeToOutputDesc(const GeShape &shape_new, GeTensor
   return SUCCESS;
 }
 
+Status AiCpuBaseTask::UpdateIoAddr(const vector<DataBuffer> &inputs, const vector<DataBuffer> &outputs) {
+  uintptr_t *arg_base = nullptr;
+  size_t arg_num = 0;
+  GetIoAddr(arg_base, arg_num);
+
+  // input number and output number was check in ValidateParams
+  size_t non_const_index = 0;
+  for (size_t input_index = 0; input_index < num_inputs_; input_index++) {
+    if (input_index < input_is_const_.size() && input_is_const_[input_index]) {
+      // const input no need update addr
+      GE_CHECK_NOTNULL(arg_base);
+      GELOGD("AICpuTask input[%zu] addr = %u", input_index, *arg_base);
+      arg_base++;
+      continue;
+    }
+    GE_CHK_BOOL_RET_STATUS(non_const_index < inputs.size(), PARAM_INVALID,
+                           "Input size is %zu, but get non_const_index is %zu",
+                           inputs.size(), non_const_index);
+    auto addr = inputs[non_const_index].data;
+    GE_CHECK_NOTNULL(addr);
+    GELOGD("AICpuTask input[%zu] addr = %p", input_index, addr);
+    *arg_base++ = reinterpret_cast<uintptr_t>(addr);
+    non_const_index++;
+  }
+
+  for (size_t i = 0; i < outputs.size(); ++i) {
+    auto addr = outputs[i].data;
+    GE_CHECK_NOTNULL(addr);
+    GELOGD("AICpuTask output[%zu] addr = %p", i, addr);
+    *arg_base++ = reinterpret_cast<uintptr_t>(addr);
+  }
+
+  return SUCCESS;
+}
+
 AiCpuTask::~AiCpuTask() {
   FreeHbm(args_);
   FreeHbm(io_addr_);
@@ -384,12 +548,14 @@ AiCpuTask::~AiCpuTask() {
   }
 }
 
-const void *AiCpuTask::GetIOAddr() const { return io_addr_; }
-
 Status AiCpuTask::LaunchKernel(rtStream_t stream) {
   GELOGD("Start to launch kernel. task = %s", this->op_type_.c_str());
-  auto ret = rtMemcpyAsync(workspace_addr_, task_info_.size(), task_info_.data(), task_info_.size(),
-                           RT_MEMCPY_HOST_TO_DEVICE_EX, stream);
+  auto ret = rtMemcpyAsync(io_addr_,
+                           io_addr_size_,
+                           io_addr_host_.data(),
+                           io_addr_host_.size() * sizeof(void *),
+                           RT_MEMCPY_HOST_TO_DEVICE_EX,
+                           stream);
   if (ret != RT_ERROR_NONE) {
     GELOGE(RT_FAILED, "rtMemcpyAsync workspace data failed. ret = %d, task = %s", ret, this->op_type_.c_str());
     return RT_FAILED;
@@ -401,7 +567,7 @@ Status AiCpuTask::LaunchKernel(rtStream_t stream) {
     GELOGE(RT_FAILED, "Invoke rtKernelLaunch failed. ret = %d, task = %s", ret, this->op_type_.c_str());
     return RT_FAILED;
   }
-  GELOGI("[TASK_INFO] %s/%s", std::to_string(kernel_id_).c_str(), op_type_.c_str());
+  GELOGI("[TASK_INFO] %lu/%s", kernel_id_, op_type_.c_str());
 
   auto status = OpenDump(stream);
   if (status != SUCCESS) {
@@ -538,40 +704,6 @@ Status AiCpuTask::UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output
   return SUCCESS;
 }
 
-Status AiCpuTask::SetIO(const vector<void *> &inputs, vector<void *> &outputs) {
-  vector<uint64_t> io_addrs;
-  io_addrs.reserve(num_inputs_ + num_outputs_);
-  for (size_t i = 0; i < num_inputs_; ++i) {
-    GE_CHECK_NOTNULL(inputs[i]);
-    GELOGD("AiCpuTask input[%zu] addr = %p", i, inputs[i]);
-    io_addrs.emplace_back(reinterpret_cast<uintptr_t>(inputs[i]));
-  }
-
-  if (unknown_type_ != DEPEND_COMPUTE) {
-    for (size_t i = 0; i < num_outputs_; ++i) {
-      GE_CHECK_NOTNULL(outputs[i]);
-      GELOGD("AiCpuTask output[%zu] addr = %p", i, outputs[i]);
-      io_addrs.emplace_back(reinterpret_cast<uintptr_t>(outputs[i]));
-    }
-  } else {
-    for (size_t i = 0; i < num_outputs_; ++i) {
-      void *summary_addr = output_summary_[i];
-      io_addrs.emplace_back(reinterpret_cast<uintptr_t>(summary_addr));
-    }
-  }
-
-  if (!io_addrs.empty()) {
-    auto *dst_io_addr = const_cast<uintptr_t *>(reinterpret_cast<const uintptr_t *>(io_addr_));
-    GE_CHK_RT_RET(rtMemcpy(dst_io_addr,
-                           sizeof(uint64_t) * io_addrs.size(),
-                           &io_addrs[0],
-                           sizeof(uint64_t) * io_addrs.size(),
-                           RT_MEMCPY_HOST_TO_DEVICE));
-    GE_CHECK_NOTNULL(dst_io_addr);
-  };
-  return SUCCESS;
-}
-
 Status AiCpuTask::InitForSummaryAndCopy() {
   if (unknown_type_ != DEPEND_COMPUTE || num_outputs_ == 0) {
     GELOGI("Unknown_type is %d, output num is %d.", unknown_type_, num_outputs_);
@@ -643,17 +775,17 @@ Status AiCpuTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
                                std::vector<DataBuffer> &output_buffers,
                                rtStream_t stream) {
   GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc, stream));
-  std::vector<void *> inputs;
-  std::vector<void *> outputs;
-  for (auto &buffer : input_buffers) {
-    inputs.emplace_back(buffer.data);
-  }
-  for (auto &buffer : output_buffers) {
-    outputs.emplace_back(buffer.data);
+  if (unknown_type_ == DEPEND_COMPUTE) {
+    std::vector<DataBuffer> summary_buffers;
+    for (size_t i = 0; i < num_outputs_; ++i) {
+      summary_buffers.emplace_back(output_summary_[i], sizeof(aicpu::FWKAdapter::ResultSummary), false);
+    }
+    GE_CHK_STATUS_RET_NOLOG(UpdateIoAddr(input_buffers, summary_buffers));
+  } else {
+    GE_CHK_STATUS_RET_NOLOG(UpdateIoAddr(input_buffers, output_buffers));
   }
-  GE_CHK_STATUS_RET_NOLOG(SetIO(inputs, outputs));
-  GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream));
 
+  GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream));
   if (unknown_type_ == DEPEND_SHAPE_RANGE) {
     GE_CHK_RT_RET(rtStreamSynchronize(stream));
     GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc));
@@ -665,6 +797,16 @@ Status AiCpuTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
   return SUCCESS;
 }
 
+Status AiCpuBaseTask::UpdateArgTable(const SingleOpModelParam &param) {
+  // aicpu do not have workspace, for now
+  return DoUpdateArgTable(param, false);
+}
+
+void AiCpuTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
+  arg_base = reinterpret_cast<uintptr_t *>(io_addr_host_.data());
+  arg_count = io_addr_host_.size();
+}
+
 void AiCpuCCTask::SetKernelArgs(std::unique_ptr<uint8_t[]> args, size_t arg_size) {
   args_ = std::move(args);
   arg_size_ = arg_size;
@@ -676,9 +818,7 @@ void AiCpuCCTask::SetSoName(const std::string &so_name) { so_name_ = so_name; }
 
 void AiCpuCCTask::SetkernelName(const std::string &kernel_Name) { kernel_name_ = kernel_Name; }
 
-void AiCpuCCTask::SetIoAddr(void *io_addr) { io_addr_ = io_addr; }
-
-const void *AiCpuCCTask::GetIOAddr() const { return io_addr_; }
+void AiCpuCCTask::SetIoAddr(uintptr_t *io_addr) { io_addr_ = io_addr; }
 
 const void *AiCpuCCTask::GetArgs() const { return args_.get(); }
 
@@ -700,13 +840,8 @@ Status AiCpuCCTask::LaunchKernel(rtStream_t stream) {
     GELOGE(ret, "Invoke rtCpuKernelLaunch failed. ret = %d", ret);
     return ret;
   }
+  GELOGI("[TASK_INFO] %lu/%s", kernel_id_, op_type_.c_str());
   GELOGD("Invoke rtCpuKernelLaunch succeeded");
-
-  size_t input_size = op_desc_->GetInputsSize();
-  size_t output_size = op_desc_->GetOutputsSize();
-  uint64_t *io_addr = reinterpret_cast<uint64_t *>(io_addr_);
-  std::vector<uint64_t> io_addrs (io_addr, io_addr + input_size + output_size);
-  SetIoAddrsForDump(io_addrs);
   auto status = OpenDump(stream);
   if (status != SUCCESS) {
     GELOGE(status, "Open dump failed in the aicpucc single op %s", this->kernel_name_.c_str());
@@ -721,24 +856,9 @@ Status AiCpuCCTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
                                  std::vector<GeTensorDesc> &output_desc,
                                  std::vector<DataBuffer> &output_buffers,
                                  rtStream_t stream) {
-  GE_CHK_BOOL_RET_STATUS(unknown_type_ != DEPEND_COMPUTE, FAILED,
-                         "AiCpuCCTask unknown type[%d] is depend compute, it's not supported now.",
-                         unknown_type_);
-
   GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc, stream));
-
-  size_t arg_index = 0;
-  auto *task_io_addr = reinterpret_cast<uintptr_t *>(io_addr_);
-  GE_CHECK_NOTNULL(task_io_addr);
-  for (auto &input : input_buffers) {
-    task_io_addr[arg_index++] = reinterpret_cast<uintptr_t>(input.data);
-  }
-  for (auto &output : output_buffers) {
-    task_io_addr[arg_index++] = reinterpret_cast<uintptr_t>(output.data);
-  }
-
+  GE_CHK_STATUS_RET_NOLOG(UpdateIoAddr(input_buffers, output_buffers));
   GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream));
-
   if (unknown_type_ == DEPEND_SHAPE_RANGE) {
     GE_CHK_RT_RET(rtStreamSynchronize(stream));
     GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc));
@@ -746,4 +866,9 @@ Status AiCpuCCTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
 
   return SUCCESS;
 }
+
+void AiCpuCCTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
+  arg_base = io_addr_;
+  arg_count = io_addr_num_;
+}
 }  // namespace ge
diff --git a/ge/single_op/task/op_task.h b/ge/single_op/task/op_task.h
index 65c77800..2d0740a6 100644
--- a/ge/single_op/task/op_task.h
+++ b/ge/single_op/task/op_task.h
@@ -32,64 +32,48 @@
 #include "init/gelib.h"
 
 namespace ge {
-enum OpTaskType {
-  OP_TASK_TBE = 0,
-  OP_TASK_AICPU,
-  OP_TASK_AICPUCC,
-  OP_TASK_INVALID,
-};
-
+class StreamResource;
+struct SingleOpModelParam;
 class OpTask {
  public:
   OpTask() = default;
   virtual ~OpTask() = default;
   virtual Status LaunchKernel(rtStream_t stream) = 0;
   virtual Status UpdateRunInfo(const vector<GeTensorDesc> &input_desc,
-                               const vector<GeTensorDesc> &output_desc) {
-    return UNSUPPORTED;
-  }
-  virtual Status LaunchKernel(const std::vector<void *> &inputs,
-                              const std::vector<void *> &outputs,
-                              const std::vector<void *> &workspaces,
-                              rtStream_t stream) {
-    return UNSUPPORTED;
-  }
-  virtual OpTaskType GetOpTaskType() = 0;
-  virtual const void *GetIOAddr() const = 0;
-  const vector<int64_t> &GetWorkspaceSizes() const;
-  void SetWorkspaceSizes(const vector<int64_t> &workspace_sizes);
+                               const vector<GeTensorDesc> &output_desc);
+  virtual Status UpdateArgTable(const SingleOpModelParam &param);
+  void SetModelArgs(std::string model_name, uint32_t model_id);
+  Status GetProfilingArgs(std::string &model_name, std::string &op_name, uint32_t &model_id, uint32_t &block_dim);
   const OpDescPtr &GetOpdesc() const {return op_desc_;}
   Status OpenDump(rtStream_t stream);
-  void SetIoAddrsForDump(const vector<uint64_t> &io_addrs_for_dump) {
-    io_addrs_for_dump_ = io_addrs_for_dump;
-  }
+  virtual void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) = 0;
   virtual Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
                               const std::vector<DataBuffer> &input_buffers,
                               std::vector<GeTensorDesc> &output_desc,
                               std::vector<DataBuffer> &output_buffers,
-                              rtStream_t stream) {
-    return UNSUPPORTED;
-  }
+                              rtStream_t stream);
 
- private:
-  std::vector<int64_t> workspace_sizes_;
  protected:
+  Status DoUpdateArgTable(const SingleOpModelParam &param, bool keep_workspace);
+
   DumpProperties dump_properties_;
   DumpOp dump_op_;
   OpDescPtr op_desc_;
-  std::vector<uint64_t> io_addrs_for_dump_;
+  std::string model_name_;
+  uint32_t model_id_ = 0;
+  uint32_t block_dim_ = 1;
 };
 
 class TbeOpTask : public OpTask {
  public:
   ~TbeOpTask() override;
   Status LaunchKernel(rtStream_t stream) override;
-  OpTaskType GetOpTaskType() override {
-    return OP_TASK_TBE;
-  }
-  const void *GetIOAddr() const override {
-    return nullptr;
-  }
+  Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
+                      const std::vector<DataBuffer> &input_buffers,
+                      std::vector<GeTensorDesc> &output_desc,
+                      std::vector<DataBuffer> &output_buffers,
+                      rtStream_t stream) override;
+  void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override;
   void SetSmDesc(void *sm_desc);
   void SetStubFunc(const std::string &name, const void *stub_func);
   void SetKernelArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size, uint32_t block_dim, const OpDescPtr &op_desc);
@@ -97,31 +81,29 @@ class TbeOpTask : public OpTask {
   Status UpdateRunInfo(const vector<GeTensorDesc> &input_desc,
                        const vector<GeTensorDesc> &output_desc) override;
 
-  Status LaunchKernel(const vector<void *> &inputs,
-                      const vector<void *> &outputs,
-                      const vector<void *> &workspaces,
-                      rtStream_t stream) override;
-
   const void *GetArgs() const;
   size_t GetArgSize() const;
   const std::string &GetStubName() const;
   void EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, size_t max_tiling_size);
 
  private:
+  friend class SingleOpModel;
   static Status UpdateTensorDesc(const GeTensorDesc &src_tensor, GeTensorDesc &dst_tensor);
   Status UpdateNodeByShape(const vector<GeTensorDesc> &input_desc,
                            const vector<GeTensorDesc> &output_desc);
+  Status AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes);
 
   const void *stub_func_ = nullptr;
   std::unique_ptr<uint8_t[]> args_;
   size_t arg_size_ = 0;
-  uint32_t block_dim_ = 1;
   void *sm_desc_ = nullptr;
   std::string stub_name_;
 
+  StreamResource *stream_resource_ = nullptr;
   void *tiling_buffer_ = nullptr;
   uint32_t max_tiling_size_ = 0;
   std::string tiling_data_;
+  std::vector<void *> workspaces_;
   NodePtr node_;
 };
 
@@ -129,9 +111,11 @@ class AiCpuBaseTask : public OpTask {
  public:
   AiCpuBaseTask() = default;
   ~AiCpuBaseTask() override;
-  const UnknowShapeOpType GetUnknownType() const { return unknown_type_; }
-
+  UnknowShapeOpType GetUnknownType() const { return unknown_type_; }
+  Status UpdateArgTable(const SingleOpModelParam &param) override;
  protected:
+  Status UpdateIoAddr(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs);
+  Status SetInputConst();
   Status SetExtInfoAndType(const std::string &kernel_ext_info, uint64_t kernel_id);
 
   Status UpdateExtInfo(const std::vector<GeTensorDesc> &input_desc,
@@ -146,6 +130,7 @@ class AiCpuBaseTask : public OpTask {
   UnknowShapeOpType unknown_type_ = DEPEND_IN_SHAPE;
   std::unique_ptr<ge::hybrid::AicpuExtInfoHandler> aicpu_ext_handle_;
   void *ext_info_addr_dev_ = nullptr;
+  vector<bool> input_is_const_;
 };
 
 class AiCpuTask : public AiCpuBaseTask {
@@ -154,10 +139,7 @@ class AiCpuTask : public AiCpuBaseTask {
   ~AiCpuTask() override;
 
   Status LaunchKernel(rtStream_t stream) override;
-  OpTaskType GetOpTaskType() override {
-    return OP_TASK_AICPU;
-  }
-  const void *GetIOAddr() const override;
+  void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override;
 
   Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
                       const std::vector<DataBuffer> &input_buffers,
@@ -167,8 +149,6 @@ class AiCpuTask : public AiCpuBaseTask {
   Status SetMemCopyTask(const domi::KernelExDef &kernel_def);
 
  private:
-  Status SetIO(const vector<void *> &inputs, vector<void *> &outputs);
-
   // for copy task.
   Status InitForSummaryAndCopy();
   Status UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc,
@@ -184,27 +164,31 @@ class AiCpuTask : public AiCpuBaseTask {
   friend class AiCpuTaskBuilder;
   void *workspace_addr_ = nullptr;
   std::string task_info_;
- // device addr
+  // device addr
   void *args_ = nullptr;
   size_t arg_size_ = 0;
   std::string op_type_;
   // device addr
   void *io_addr_ = nullptr;
+  size_t io_addr_size_ = 0;
+
+  // host addr
+  std::vector<void *> io_addr_host_;
 
   bool dynamic_flag_ = false;
   // for copy task
-  void *copy_task_args_buf_;
-  void *copy_workspace_buf_;
+  void *copy_task_args_buf_ = nullptr;
+  void *copy_workspace_buf_ = nullptr;
 
   std::vector<void *> output_summary_;
   std::vector<aicpu::FWKAdapter::ResultSummary> output_summary_host_;
 
-  void *copy_ioaddr_dev_;
+  void *copy_ioaddr_dev_ = nullptr;
 
-  void *copy_input_release_flag_dev_;
-  void *copy_input_data_size_dev_;
-  void *copy_input_src_dev_;
-  void *copy_input_dst_dev_;
+  void *copy_input_release_flag_dev_ = nullptr;
+  void *copy_input_data_size_dev_ = nullptr;
+  void *copy_input_src_dev_ = nullptr;
+  void *copy_input_dst_dev_ = nullptr;
 
   vector<void *> out_shape_hbm_;
   uint64_t kernel_id_ = 0;
@@ -218,13 +202,12 @@ class AiCpuCCTask : public AiCpuBaseTask {
   AiCpuCCTask &operator=(const AiCpuCCTask &) = delete;
 
   Status LaunchKernel(rtStream_t stream) override;
-  OpTaskType GetOpTaskType() override { return OP_TASK_AICPUCC; }
-  const void *GetIOAddr() const override;
+  void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override;
   const void *GetArgs() const;
   void SetKernelArgs(std::unique_ptr<uint8_t[]> args, size_t arg_size);
   void SetSoName(const std::string &so_name);
   void SetkernelName(const std::string &kernel_Name);
-  void SetIoAddr(void *io_addr);
+  void SetIoAddr(uintptr_t *io_addr);
   size_t GetArgSize() const;
 
   Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
@@ -239,11 +222,13 @@ private:
   std::string kernel_name_;
   std::unique_ptr<uint8_t[]> args_;
   size_t arg_size_ = 0;
-  uint32_t block_dim_ = 1;
   void *sm_desc_ = nullptr;
-  void *io_addr_ = nullptr;
+  uintptr_t *io_addr_ = nullptr;
+  size_t io_addr_num_ = 0;
   bool is_custom_ = false;
   uint32_t dump_flag_ = RT_KERNEL_DEFAULT;
+  std::string op_type_;
+  uint64_t kernel_id_ = 0;
 };
 }  // namespace ge
 
diff --git a/ge/single_op/task/tbe_task_builder.cc b/ge/single_op/task/tbe_task_builder.cc
index e06a08c6..594352aa 100644
--- a/ge/single_op/task/tbe_task_builder.cc
+++ b/ge/single_op/task/tbe_task_builder.cc
@@ -173,7 +173,8 @@ Status TbeTaskBuilder::RegisterKernel(TbeOpTask &task, const SingleOpModelParam
 
     auto tbe_kernel = GetTbeKernel(op_desc_);
     if (tbe_kernel == nullptr) {
-      GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "OP EXT ATTR NAME TBE_KERNEL not found. op = %s", op_desc_->GetName().c_str());
+      GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "OP EXT ATTR NAME TBE_KERNEL not found. op = %s",
+             op_desc_->GetName().c_str());
       return ACL_ERROR_GE_INTERNAL_ERROR;
     }
 
diff --git a/ge/stub/gen_stubapi.py b/ge/stub/gen_stubapi.py
index f2a6a287..1476d505 100644
--- a/ge/stub/gen_stubapi.py
+++ b/ge/stub/gen_stubapi.py
@@ -1,3 +1,10 @@
+#!/usr/bin/python3.7
+# -*- coding: UTF-8 -*-
+#-------------------------------------------------------------------
+# Purpose:
+# Copyright 2020 Huawei Technologies Co., Ltd. All rights reserved.
+#-------------------------------------------------------------------
+
 import os
 import re
 import sys
@@ -64,7 +71,7 @@ max_code_len_per_line = 100
     when DEBUG on
 """
 white_list_for_debug = ["attr_value.h", "operator.h", "tensor.h", "graph.h", "operator_factory.h",
-                        "ge_ir_build.h", "ge_api.h", "ge_prof.h", "tensorflow_parser.h", "caffe_parser.h"]
+                        "ge_ir_build.h", "ge_api.h", "tensorflow_parser.h", "caffe_parser.h"]
 include_dir_key_words = ["ge", "graph", "parser"]
 DEBUG = True
 
diff --git a/inc/external/acl/acl.h b/inc/external/acl/acl.h
deleted file mode 100644
index ef5b4772..00000000
--- a/inc/external/acl/acl.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef INC_EXTERNAL_ACL_ACL_H_
-#define INC_EXTERNAL_ACL_ACL_H_
-
-#include "acl_rt.h"
-#include "acl_op.h"
-#include "acl_mdl.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Current version is 1.0.0
-#define ACL_MAJOR_VERSION 1
-#define ACL_MINOR_VERSION 0
-#define ACL_PATCH_VERSION 0
-
-/**
- * @ingroup AscendCL
- * @brief acl initialize
- *
- * @par Restriction
- * The aclInit interface can be called only once in a process
- * @param configPath [IN]    the config path,it can be NULL
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclInit(const char *configPath);
-
-/**
- * @ingroup AscendCL
- * @brief acl finalize
- *
- * @par Restriction
- * Need to call aclFinalize before the process exits.
- * After calling aclFinalize,the services cannot continue to be used normally.
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclFinalize();
-
-/**
- * @ingroup AscendCL
- * @brief query ACL interface version
- *
- * @param majorVersion[OUT] ACL interface major version
- * @param minorVersion[OUT] ACL interface minor version
- * @param patchVersion[OUT] ACL interface patch version
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtGetVersion(int32_t *majorVersion, int32_t *minorVersion, int32_t *patchVersion);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // INC_EXTERNAL_ACL_ACL_H_
diff --git a/inc/external/acl/acl_mdl.h b/inc/external/acl/acl_mdl.h
deleted file mode 100644
index 0652358d..00000000
--- a/inc/external/acl/acl_mdl.h
+++ /dev/null
@@ -1,1112 +0,0 @@
-/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef INC_EXTERNAL_ACL_ACL_MODEL_H_
-#define INC_EXTERNAL_ACL_ACL_MODEL_H_
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "acl_base.h"
-#include "acl_rt.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ACL_MAX_DIM_CNT 128
-#define ACL_MAX_TENSOR_NAME_LEN 128
-#define ACL_MAX_BATCH_NUM 128
-#define ACL_MAX_HW_NUM 128
-#define ACL_MAX_SHAPE_COUNT 128
-#define ACL_INVALID_NODE_INDEX 0xFFFFFFFF
-
-#define ACL_DYNAMIC_TENSOR_NAME "ascend_mbatch_shape_data"
-#define ACL_DYNAMIC_AIPP_NAME "ascend_dynamic_aipp_data"
-
-typedef struct aclmdlDataset aclmdlDataset;
-typedef struct aclmdlDesc aclmdlDesc;
-typedef struct aclmdlAIPP aclmdlAIPP;
-typedef struct aclAippExtendInfo aclAippExtendInfo;
-
-typedef enum {
-  ACL_YUV420SP_U8 = 1,
-  ACL_XRGB8888_U8,
-  ACL_RGB888_U8,
-  ACL_YUV400_U8,
-  ACL_NC1HWC0DI_FP16,
-  ACL_NC1HWC0DI_S8,
-  ACL_ARGB8888_U8,
-  ACL_YUYV_U8,
-  ACL_YUV422SP_U8,
-  ACL_AYUV444_U8,
-  ACL_RAW10,
-  ACL_RAW12,
-  ACL_RAW16,
-  ACL_RAW24,
-  ACL_AIPP_RESERVED = 0xffff,
-} aclAippInputFormat;
-
-typedef enum {
-  ACL_DATA_WITHOUT_AIPP = 0,
-  ACL_DATA_WITH_STATIC_AIPP,
-  ACL_DATA_WITH_DYNAMIC_AIPP,
-  ACL_DYNAMIC_AIPP_NODE
-} aclmdlInputAippType;
-
-typedef struct aclmdlIODims {
-  char name[ACL_MAX_TENSOR_NAME_LEN]; /**< tensor name */
-  size_t dimCount;                    /**< dim array count */
-  int64_t dims[ACL_MAX_DIM_CNT];      /**< dim data array */
-} aclmdlIODims;
-
-typedef struct aclAippDims {
-  aclmdlIODims srcDims;     /**< input dims before model transform */
-  size_t srcSize;           /**< input size before model transform */
-  aclmdlIODims aippOutdims; /**< aipp output dims */
-  size_t aippOutSize;       /**< aipp output size */
-} aclAippDims;
-
-typedef struct aclmdlBatch {
-  size_t batchCount;                 /**< batch array count */
-  uint64_t batch[ACL_MAX_BATCH_NUM]; /**< batch data array */
-} aclmdlBatch;
-
-typedef struct aclmdlHW {
-  size_t hwCount;                 /**< height&width array count */
-  uint64_t hw[ACL_MAX_HW_NUM][2]; /**< height&width data array */
-} aclmdlHW;
-
-typedef struct aclAippInfo {
-  aclAippInputFormat inputFormat;
-  int32_t srcImageSizeW;
-  int32_t srcImageSizeH;
-  int8_t cropSwitch;
-  int32_t loadStartPosW;
-  int32_t loadStartPosH;
-  int32_t cropSizeW;
-  int32_t cropSizeH;
-  int8_t resizeSwitch;
-  int32_t resizeOutputW;
-  int32_t resizeOutputH;
-  int8_t paddingSwitch;
-  int32_t leftPaddingSize;
-  int32_t rightPaddingSize;
-  int32_t topPaddingSize;
-  int32_t bottomPaddingSize;
-  int8_t cscSwitch;
-  int8_t rbuvSwapSwitch;
-  int8_t axSwapSwitch;
-  int8_t singleLineMode;
-  int32_t matrixR0C0;
-  int32_t matrixR0C1;
-  int32_t matrixR0C2;
-  int32_t matrixR1C0;
-  int32_t matrixR1C1;
-  int32_t matrixR1C2;
-  int32_t matrixR2C0;
-  int32_t matrixR2C1;
-  int32_t matrixR2C2;
-  int32_t outputBias0;
-  int32_t outputBias1;
-  int32_t outputBias2;
-  int32_t inputBias0;
-  int32_t inputBias1;
-  int32_t inputBias2;
-  int32_t meanChn0;
-  int32_t meanChn1;
-  int32_t meanChn2;
-  int32_t meanChn3;
-  float minChn0;
-  float minChn1;
-  float minChn2;
-  float minChn3;
-  float varReciChn0;
-  float varReciChn1;
-  float varReciChn2;
-  float varReciChn3;
-  aclFormat srcFormat;
-  aclDataType srcDatatype;
-  size_t srcDimNum;
-  size_t shapeCount;
-  aclAippDims outDims[ACL_MAX_SHAPE_COUNT];
-  aclAippExtendInfo *aippExtend; /**< reserved parameters, current version needs to be null */
-} aclAippInfo;
-
-/**
- * @ingroup AscendCL
- * @brief Create data of type aclmdlDesc
- *
- * @retval the aclmdlDesc pointer
- */
-ACL_FUNC_VISIBILITY aclmdlDesc *aclmdlCreateDesc();
-
-/**
- * @ingroup AscendCL
- * @brief destroy data of type aclmdlDesc
- *
- * @param modelDesc [IN]   Pointer to almdldlDesc to be destroyed
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlDestroyDesc(aclmdlDesc *modelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get aclmdlDesc data of the model according to the model ID
- *
- * @param  modelDesc [OUT]   aclmdlDesc pointer
- * @param  modelId [IN]      model id
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlGetDesc(aclmdlDesc *modelDesc, uint32_t modelId);
-
-/**
- * @ingroup AscendCL
- * @brief Get the number of the inputs of
- *        the model according to data of aclmdlDesc
- *
- * @param  modelDesc [IN]   aclmdlDesc pointer
- *
- * @retval input size with aclmdlDesc
- */
-ACL_FUNC_VISIBILITY size_t aclmdlGetNumInputs(aclmdlDesc *modelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get the number of the output of
- *        the model according to data of aclmdlDesc
- *
- * @param  modelDesc [IN]   aclmdlDesc pointer
- *
- * @retval output size with aclmdlDesc
- */
-ACL_FUNC_VISIBILITY size_t aclmdlGetNumOutputs(aclmdlDesc *modelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get the size of the specified input according to
- *        the data of type aclmdlDesc
- *
- * @param  modelDesc [IN]  aclmdlDesc pointer
- * @param  index [IN] the size of the number of inputs to be obtained,
- *         the index value starts from 0
- *
- * @retval Specify the size of the input
- */
-ACL_FUNC_VISIBILITY size_t aclmdlGetInputSizeByIndex(aclmdlDesc *modelDesc, size_t index);
-
-/**
- * @ingroup AscendCL
- * @brief Get the size of the specified output according to
- *        the data of type aclmdlDesc
- *
- * @param modelDesc [IN]   aclmdlDesc pointer
- * @param index [IN]  the size of the number of outputs to be obtained,
- *        the index value starts from 0
- *
- * @retval Specify the size of the output
- */
-ACL_FUNC_VISIBILITY size_t aclmdlGetOutputSizeByIndex(aclmdlDesc *modelDesc, size_t index);
-
-/**
- * @ingroup AscendCL
- * @brief Create data of type aclmdlDataset
- *
- * @retval the aclmdlDataset pointer
- */
-ACL_FUNC_VISIBILITY aclmdlDataset *aclmdlCreateDataset();
-
-/**
- * @ingroup AscendCL
- * @brief destroy data of type aclmdlDataset
- *
- * @param  dataset [IN]  Pointer to aclmdlDataset to be destroyed
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlDestroyDataset(const aclmdlDataset *dataset);
-
-/**
- * @ingroup AscendCL
- * @brief Add aclDataBuffer to aclmdlDataset
- *
- * @param dataset [OUT]    aclmdlDataset address of aclDataBuffer to be added
- * @param dataBuffer [IN]  aclDataBuffer address to be added
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlAddDatasetBuffer(aclmdlDataset *dataset, aclDataBuffer *dataBuffer);
-
-/**
- * @ingroup AscendCL
- * @brief Get the number of aclDataBuffer in aclmdlDataset
- *
- * @param dataset [IN]   aclmdlDataset poiter
- *
- * @retval the number of aclDataBuffer
- */
-ACL_FUNC_VISIBILITY size_t aclmdlGetDatasetNumBuffers(const aclmdlDataset *dataset);
-
-/**
- * @ingroup AscendCL
- * @brief Get the aclDataBuffer in aclmdlDataset by index
- *
- * @param dataset [IN]   aclmdlDataset poiter
- * @param index [IN]     the index of aclDataBuffer
- *
- * @retval Get successfully, return the address of aclDataBuffer
- * @retval Failure return NULL
- */
-ACL_FUNC_VISIBILITY aclDataBuffer *aclmdlGetDatasetBuffer(const aclmdlDataset *dataset, size_t index);
-
-/**
- * @ingroup AscendCL
- * @brief Load offline model data from files
- * and manage memory internally by the system
- *
- * @par Function
- * After the system finishes loading the model,
- * the model ID returned is used as a mark to identify the model
- * during subsequent operations
- *
- * @param modelPath [IN]   Storage path for offline model files
- * @param modelId [OUT]    Model ID generated after
- *        the system finishes loading the model
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlLoadFromFile(const char *modelPath, uint32_t *modelId);
-
-/**
- * @ingroup AscendCL
- * @brief Load offline model data from memory and manage the memory of
- * model running internally by the system
- *
- * @par Function
- * After the system finishes loading the model,
- * the model ID returned is used as a mark to identify the model
- * during subsequent operations
- *
- * @param model [IN]      Model data stored in memory
- * @param modelSize [IN]  model data size
- * @param modelId [OUT]   Model ID generated after
- *        the system finishes loading the model
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlLoadFromMem(const void *model, size_t modelSize, uint32_t *modelId);
-
-/**
- * @ingroup AscendCL
- * @brief Load offline model data from a file,
- * and the user manages the memory of the model run by itself
- *
- * @par Function
- * After the system finishes loading the model,
- * the model ID returned is used as a mark to identify the model
- * during subsequent operations.
- * @param modelPath [IN]   Storage path for offline model files
- * @param modelId [OUT]    Model ID generated after finishes loading the model
- * @param workPtr [IN]     A pointer to the working memory
- *                         required by the model on the Device,can be null
- * @param workSize [IN]    The amount of working memory required by the model
- * @param weightPtr [IN]   Pointer to model weight memory on Device
- * @param weightSize [IN]  The amount of weight memory required by the model
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlLoadFromFileWithMem(const char *modelPath, uint32_t *modelId, void *workPtr,
-                                                       size_t workSize, void *weightPtr, size_t weightSize);
-
-/**
- * @ingroup AscendCL
- * @brief Load offline model data from memory,
- * and the user can manage the memory of model running
- *
- * @par Function
- * After the system finishes loading the model,
- * the model ID returned is used as a mark to identify the model
- * during subsequent operations
- * @param model [IN]      Model data stored in memory
- * @param modelSize [IN]  model data size
- * @param modelId [OUT]   Model ID generated after finishes loading the model
- * @param workPtr [IN]    A pointer to the working memory
- *                        required by the model on the Device,can be null
- * @param workSize [IN]   work memory size
- * @param weightPtr [IN]  Pointer to model weight memory on Device,can be null
- * @param weightSize [IN] The amount of weight memory required by the model
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlLoadFromMemWithMem(const void *model, size_t modelSize, uint32_t *modelId,
-                                                      void *workPtr, size_t workSize, void *weightPtr,
-                                                      size_t weightSize);
-
-/**
- * @ingroup AscendCL
- * @brief load model from file with async queue
- *
- * @param modelPath  [IN] model path
- * @param modelId [OUT]   return model id if load success
- * @param inputQ [IN]     input queue pointer
- * @param inputQNum [IN]  input queue num
- * @param outputQ [IN]    output queue pointer
- * @param outputQNum [IN] output queue num
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlLoadFromFileWithQ(const char *modelPath, uint32_t *modelId, const uint32_t *inputQ,
-                                                     size_t inputQNum, const uint32_t *outputQ, size_t outputQNum);
-
-/**
- * @ingroup AscendCL
- * @brief load model from memory with async queue
- *
- * @param model [IN]      model memory which user manages
- * @param modelSize [IN]  model size
- * @param modelId [OUT]   return model id if load success
- * @param inputQ [IN]     input queue pointer
- * @param inputQNum [IN]  input queue num
- * @param outputQ [IN]    output queue pointer
- * @param outputQNum [IN] output queue num
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlLoadFromMemWithQ(const void *model, size_t modelSize, uint32_t *modelId,
-                                                    const uint32_t *inputQ, size_t inputQNum, const uint32_t *outputQ,
-                                                    size_t outputQNum);
-
-/**
- * @ingroup AscendCL
- * @brief Execute model synchronous inference until the inference result is returned
- *
- * @param  modelId [IN]   ID of the model to perform inference
- * @param  input [IN]     Input data for model inference
- * @param  output [OUT]   Output data for model inference
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlExecute(uint32_t modelId, const aclmdlDataset *input, aclmdlDataset *output);
-
-/**
- * @ingroup AscendCL
- * @brief Execute model asynchronous inference until the inference result is returned
- *
- * @param  modelId [IN]   ID of the model to perform inference
- * @param  input [IN]     Input data for model inference
- * @param  output [OUT]   Output data for model inference
- * @param  stream [IN]    stream
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlLoadFromFile | aclmdlLoadFromMem | aclmdlLoadFromFileWithMem |
- * aclmdlLoadFromMemWithMem
- */
-ACL_FUNC_VISIBILITY aclError aclmdlExecuteAsync(uint32_t modelId, const aclmdlDataset *input, aclmdlDataset *output,
-                                                aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief unload model with model id
- *
- * @param  modelId [IN]   model id to be unloaded
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlUnload(uint32_t modelId);
-
-/**
- * @ingroup AscendCL
- * @brief Get the weight memory size and working memory size
- * required for model execution according to the model file
- *
- * @param  fileName [IN]     Model path to get memory information
- * @param  workSize [OUT]    The amount of working memory for model executed
- * @param  weightSize [OUT]  The amount of weight memory for model executed
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlQuerySize(const char *fileName, size_t *workSize, size_t *weightSize);
-
-/**
- * @ingroup AscendCL
- * @brief Obtain the weights required for
- * model execution according to the model data in memory
- *
- * @par Restriction
- * The execution and weight memory is Device memory,
- * and requires user application and release.
- * @param  model [IN]        model memory which user manages
- * @param  modelSize [IN]    model data size
- * @param  workSize [OUT]    The amount of working memory for model executed
- * @param  weightSize [OUT]  The amount of weight memory for model executed
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlQuerySizeFromMem(const void *model, size_t modelSize, size_t *workSize,
-                                                    size_t *weightSize);
-
-/**
- * @ingroup AscendCL
- * @brief In dynamic batch scenarios,
- * it is used to set the number of images processed
- * at one time during model inference
- *
- * @param  modelId [IN]     model id
- * @param  dataset [IN|OUT] data for model inference
- * @param  index [IN]       index of dynamic tensor
- * @param  batchSize [IN]   Number of images processed at a time during model
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlLoadFromFile | aclmdlLoadFromMem | aclmdlLoadFromFileWithMem |
- * aclmdlLoadFromMemWithMem | aclmdlGetInputIndexByName
- */
-ACL_FUNC_VISIBILITY aclError aclmdlSetDynamicBatchSize(uint32_t modelId, aclmdlDataset *dataset, size_t index,
-                                                       uint64_t batchSize);
-
-/**
- * @ingroup AscendCL
- * @brief Sets the H and W of the specified input of the model
- *
- * @param  modelId [IN]     model id
- * @param  dataset [IN|OUT] data for model inference
- * @param  index [IN]       index of dynamic tensor
- * @param  height [IN]      model height
- * @param  width [IN]       model width
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlLoadFromFile | aclmdlLoadFromMem | aclmdlLoadFromFileWithMem |
- * aclmdlLoadFromMemWithMem | aclmdlGetInputIndexByName
- */
-ACL_FUNC_VISIBILITY aclError aclmdlSetDynamicHWSize(uint32_t modelId, aclmdlDataset *dataset, size_t index,
-                                                    uint64_t height, uint64_t width);
-
-/**
- * @ingroup AscendCL
- * @brief Sets the dynamic dims of the specified input of the model
- *
- * @param  modelId [IN]     model id
- * @param  dataset [IN|OUT] data for model inference
- * @param  index [IN]       index of dynamic dims
- * @param  dims [IN]        value of dynamic dims
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlLoadFromFile | aclmdlLoadFromMem | aclmdlLoadFromFileWithMem |
- * aclmdlLoadFromMemWithMem | aclmdlGetInputIndexByName
- */
-ACL_FUNC_VISIBILITY aclError aclmdlSetInputDynamicDims(uint32_t modelId, aclmdlDataset *dataset, size_t index,
-                                                       const aclmdlIODims *dims);
-
-/**
- * @ingroup AscendCL
- * @brief get input dims info
- *
- * @param modelDesc [IN]  model description
- * @param index [IN]  input tensor index
- * @param dims [OUT]  dims info
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlGetInputDimsV2
- */
-ACL_FUNC_VISIBILITY aclError aclmdlGetInputDims(const aclmdlDesc *modelDesc, size_t index, aclmdlIODims *dims);
-
-/**
- * @ingroup AscendCL
- * @brief get input dims info(version 2), especially for static aipp
- * it is the same with aclmdlGetInputDims while model without static aipp
- *
- * @param modelDesc [IN] model description
- * @param index [IN]     input tensor index
- * @param dims [OUT]     dims info
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlGetInputDims
- */
-ACL_FUNC_VISIBILITY aclError aclmdlGetInputDimsV2(const aclmdlDesc *modelDesc, size_t index, aclmdlIODims *dims);
-
-/**
- * @ingroup AscendCL
- * @brief get output dims info
- *
- * @param modelDesc [IN] model description
- * @param index [IN]     output tensor index
- * @param dims [OUT]     dims info
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlGetOutputDims(const aclmdlDesc *modelDesc, size_t index, aclmdlIODims *dims);
-
-/**
- * @ingroup AscendCL
- * @brief get current output dims info
- *
- * @par Function
- * The following use cases are supported:
- * @li Get current output shape when model is dynamic and
- * dynamic shape info is set
- * @li Get max output shape when model is dynamic and
- * dynamic shape info is not set
- * @li Get actual output shape when model is static
- *
- * @param modelDesc [IN] model description
- * @param index [IN]     output tensor index
- * @param dims [OUT]     dims info
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlGetCurOutputDims(const aclmdlDesc *modelDesc, size_t index, aclmdlIODims *dims);
-
-/**
- * @ingroup AscendCL
- * @brief get input name by index
- *
- * @param modelDesc [IN]  model description
- * @param index [IN]      intput tensor index
- *
- * @retval input tensor name,the same life cycle with modelDesc
- */
-ACL_FUNC_VISIBILITY const char *aclmdlGetInputNameByIndex(const aclmdlDesc *modelDesc, size_t index);
-
-/**
- * @ingroup AscendCL
- * @brief get output name by index
- *
- * @param modelDesc [IN]  model description
- * @param index [IN]      output tensor index
- *
- * @retval output tensor name,the same life cycle with modelDesc
- */
-ACL_FUNC_VISIBILITY const char *aclmdlGetOutputNameByIndex(const aclmdlDesc *modelDesc, size_t index);
-
-/**
- * @ingroup AscendCL
- * @brief get input format by index
- *
- * @param modelDesc [IN]  model description
- * @param index [IN]      intput tensor index
- *
- * @retval input tensor format
- */
-ACL_FUNC_VISIBILITY aclFormat aclmdlGetInputFormat(const aclmdlDesc *modelDesc, size_t index);
-
-/**
- * @ingroup AscendCL
- * @brief get output format by index
- *
- * @param modelDesc [IN]  model description
- * @param index [IN]      output tensor index
- *
- * @retval output tensor format
- */
-ACL_FUNC_VISIBILITY aclFormat aclmdlGetOutputFormat(const aclmdlDesc *modelDesc, size_t index);
-
-/**
- * @ingroup AscendCL
- * @brief get input data type by index
- *
- * @param modelDesc [IN]  model description
- * @param index [IN]  intput tensor index
- *
- * @retval input tensor data type
- */
-ACL_FUNC_VISIBILITY aclDataType aclmdlGetInputDataType(const aclmdlDesc *modelDesc, size_t index);
-
-/**
- * @ingroup AscendCL
- * @brief get output data type by index
- *
- * @param modelDesc [IN]  model description
- * @param index [IN]  output tensor index
- *
- * @retval output tensor data type
- */
-ACL_FUNC_VISIBILITY aclDataType aclmdlGetOutputDataType(const aclmdlDesc *modelDesc, size_t index);
-
-/**
- * @ingroup AscendCL
- * @brief get input tensor index by name
- *
- * @param modelDesc [IN]  model description
- * @param name [IN]    intput tensor name
- * @param index [OUT]  intput tensor index
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlGetInputIndexByName(const aclmdlDesc *modelDesc, const char *name, size_t *index);
-
-/**
- * @ingroup AscendCL
- * @brief get output tensor index by name
- *
- * @param modelDesc [IN]  model description
- * @param name [IN]  output tensor name
- * @param index [OUT]  output tensor index
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlGetOutputIndexByName(const aclmdlDesc *modelDesc, const char *name, size_t *index);
-
-/**
- * @ingroup AscendCL
- * @brief get dynamic batch info
- *
- * @param modelDesc [IN]  model description
- * @param batch [OUT]  dynamic batch info
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlGetDynamicBatch(const aclmdlDesc *modelDesc, aclmdlBatch *batch);
-
-/**
- * @ingroup AscendCL
- * @brief get dynamic height&width info
- *
- * @param modelDesc [IN]  model description
- * @param index [IN]  input tensor index
- * @param hw [OUT]  dynamic height&width info
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlGetDynamicHW(const aclmdlDesc *modelDesc, size_t index, aclmdlHW *hw);
-
-/**
- * @ingroup AscendCL
- * @brief get dynamic gear count
- *
- * @param modelDesc [IN]  model description
- * @param index [IN]  unused, must be -1
- * @param gearCount [OUT]  dynamic gear count
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlGetInputDynamicGearCount(const aclmdlDesc *modelDesc, size_t index,
-                                                            size_t *gearCount);
-
-/**
- * @ingroup AscendCL
- * @brief get dynamic dims info
- *
- * @param modelDesc [IN]  model description
- * @param index [IN]  unused, must be -1
- * @param dims [OUT]  value of dynamic dims
- * @param gearCount [IN]  dynamic gear count
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlGetInputDynamicDims(const aclmdlDesc *modelDesc, size_t index, aclmdlIODims *dims,
-                                                       size_t gearCount);
-
-/**
- * @ingroup AscendCL
- * @brief Create data of type aclmdlAIPP
- *
- * @param batchSize [IN]    batchsizes of model
- *
- * @retval the aclmdlAIPP pointer
- */
-ACL_FUNC_VISIBILITY aclmdlAIPP *aclmdlCreateAIPP(uint64_t batchSize);
-
-/**
- * @ingroup AscendCL
- * @brief destroy data of type aclmdlAIPP
- *
- * @param aippParmsSet [IN]    Pointer for aclmdlAIPP to be destroyed
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlDestroyAIPP(const aclmdlAIPP *aippParmsSet);
-
-/**
- * @ingroup AscendCL
- * @brief set InputFormat of type aclmdlAIPP
- *
- * @param aippParmsSet [OUT]  Pointer for aclmdlAIPP
- * @param inputFormat [IN]    The inputFormat of aipp
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlCreateAIPP
- */
-ACL_FUNC_VISIBILITY aclError aclmdlSetAIPPInputFormat(aclmdlAIPP *aippParmsSet, aclAippInputFormat inputFormat);
-
-/**
- * @ingroup AscendCL
- * @brief set cscParms of type aclmdlAIPP
- *
- * @param aippParmsSet [OUT]    Pointer for aclmdlAIPP
- * @param csc_switch [IN]       Csc switch
- * @param cscMatrixR0C0 [IN]    Csc_matrix_r0_c0
- * @param cscMatrixR0C1 [IN]    Csc_matrix_r0_c1
- * @param cscMatrixR0C2 [IN]    Csc_matrix_r0_c2
- * @param cscMatrixR1C0 [IN]    Csc_matrix_r1_c0
- * @param cscMatrixR1C1 [IN]    Csc_matrix_r1_c1
- * @param cscMatrixR1C2 [IN]    Csc_matrix_r1_c2
- * @param cscMatrixR2C0 [IN]    Csc_matrix_r2_c0
- * @param cscMatrixR2C1 [IN]    Csc_matrix_r2_c1
- * @param cscMatrixR2C2 [IN]    Csc_matrix_r2_c2
- * @param cscOutputBiasR0 [IN]  Output Bias for RGB to YUV, element of row 0, unsigned number
- * @param cscOutputBiasR1 [IN]  Output Bias for RGB to YUV, element of row 1, unsigned number
- * @param cscOutputBiasR2 [IN]  Output Bias for RGB to YUV, element of row 2, unsigned number
- * @param cscInputBiasR0 [IN]   Input Bias for YUV to RGB, element of row 0, unsigned number
- * @param cscInputBiasR1 [IN]   Input Bias for YUV to RGB, element of row 1, unsigned number
- * @param cscInputBiasR2 [IN]   Input Bias for YUV to RGB, element of row 2, unsigned number
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlCreateAIPP
- */
-ACL_FUNC_VISIBILITY aclError aclmdlSetAIPPCscParams(aclmdlAIPP *aippParmsSet, int8_t csc_switch, int16_t cscMatrixR0C0,
-                                                    int16_t cscMatrixR0C1, int16_t cscMatrixR0C2, int16_t cscMatrixR1C0,
-                                                    int16_t cscMatrixR1C1, int16_t cscMatrixR1C2, int16_t cscMatrixR2C0,
-                                                    int16_t cscMatrixR2C1, int16_t cscMatrixR2C2,
-                                                    uint8_t cscOutputBiasR0, uint8_t cscOutputBiasR1,
-                                                    uint8_t cscOutputBiasR2, uint8_t cscInputBiasR0,
-                                                    uint8_t cscInputBiasR1, uint8_t cscInputBiasR2);
-
-/**
- * @ingroup AscendCL
- * @brief set rb/ub swap switch of type aclmdlAIPP
- *
- * @param aippParmsSet [OUT]  Pointer for aclmdlAIPP
- * @param rbuvSwapSwitch [IN] rb/ub swap switch
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlCreateAIPP
- */
-ACL_FUNC_VISIBILITY aclError aclmdlSetAIPPRbuvSwapSwitch(aclmdlAIPP *aippParmsSet, int8_t rbuvSwapSwitch);
-
-/**
- * @ingroup AscendCL
- * @brief set RGBA->ARGB, YUVA->AYUV swap switch of type aclmdlAIPP
- *
- * @param aippParmsSet [OUT]  Pointer for aclmdlAIPP
- * @param axSwapSwitch [IN]   RGBA->ARGB, YUVA->AYUV swap switch
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlCreateAIPP
- */
-ACL_FUNC_VISIBILITY aclError aclmdlSetAIPPAxSwapSwitch(aclmdlAIPP *aippParmsSet, int8_t axSwapSwitch);
-
-/**
- * @ingroup AscendCL
- * @brief set source image of type aclmdlAIPP
- *
- * @param aippParmsSet [OUT]  Pointer for aclmdlAIPP
- * @param srcImageSizeW [IN]  Source image width
- * @param srcImageSizeH [IN]  Source image height
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlCreateAIPP
- */
-ACL_FUNC_VISIBILITY aclError aclmdlSetAIPPSrcImageSize(aclmdlAIPP *aippParmsSet, int32_t srcImageSizeW,
-                                                       int32_t srcImageSizeH);
-
-/**
- * @ingroup AscendCL
- * @brief set resize switch of type aclmdlAIPP
- *
- * @param aippParmsSet [OUT]  Pointer for aclmdlAIPP
- * @param scfSwitch [IN]      Resize switch
- * @param scfInputSizeW [IN]  Input width of scf
- * @param scfInputSizeH [IN]  Input height of scf
- * @param scfOutputSizeW [IN] Output width of scf
- * @param scfOutputSizeH [IN] Output height of scf
- * @param batchIndex [IN]     Batch parameter index
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlCreateAIPP
- */
-ACL_FUNC_VISIBILITY aclError aclmdlSetAIPPScfParams(aclmdlAIPP *aippParmsSet, int8_t scfSwitch, int32_t scfInputSizeW,
-                                                    int32_t scfInputSizeH, int32_t scfOutputSizeW,
-                                                    int32_t scfOutputSizeH, uint64_t batchIndex);
-
-/**
- * @ingroup AscendCL
- * @brief set cropParams of type aclmdlAIPP
- *
- * @param aippParmsSet [OUT]  Pointer for aclmdlAIPP
- * @param cropSwitch [IN]     Crop switch
- * @param cropStartPosW [IN]  The start horizontal position of cropping
- * @param cropStartPosH [IN]  The start vertical position of cropping
- * @param cropSizeW [IN]      Crop width
- * @param cropSizeH [IN]      Crop height
- * @param batchIndex [IN]     Batch parameter index
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlCreateAIPP
- */
-ACL_FUNC_VISIBILITY aclError aclmdlSetAIPPCropParams(aclmdlAIPP *aippParmsSet, int8_t cropSwitch, int32_t cropStartPosW,
-                                                     int32_t cropStartPosH, int32_t cropSizeW, int32_t cropSizeH,
-                                                     uint64_t batchIndex);
-
-/**
- * @ingroup AscendCL
- * @brief set paddingParams of type aclmdlAIPP
- *
- * @param aippParmsSet [OUT]      Pointer for aclmdlAIPP
- * @param paddingSwitch [IN]      Padding switch
- * @param paddingSizeTop [IN]     Top padding size
- * @param paddingSizeBottom [IN]  Bottom padding size
- * @param paddingSizeLeft [IN]    Left padding size
- * @param paddingSizeRight [IN]   Right padding size
- * @param batchIndex [IN]         Batch parameter index
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlCreateAIPP
- */
-ACL_FUNC_VISIBILITY aclError aclmdlSetAIPPPaddingParams(aclmdlAIPP *aippParmsSet, int8_t paddingSwitch,
-                                                        int32_t paddingSizeTop, int32_t paddingSizeBottom,
-                                                        int32_t paddingSizeLeft, int32_t paddingSizeRight,
-                                                        uint64_t batchIndex);
-
-/**
- * @ingroup AscendCL
- * @brief set DtcPixelMean of type aclmdlAIPP
- *
- * @param aippParmsSet [OUT]      Pointer for aclmdlAIPP
- * @param dtcPixelMeanChn0 [IN]   Mean value of channel 0
- * @param dtcPixelMeanChn1 [IN]   Mean value of channel 1
- * @param dtcPixelMeanChn2 [IN]   Mean value of channel 2
- * @param dtcPixelMeanChn3 [IN]   Mean value of channel 3
- * @param batchIndex [IN]         Batch parameter index
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlCreateAIPP
- */
-ACL_FUNC_VISIBILITY aclError aclmdlSetAIPPDtcPixelMean(aclmdlAIPP *aippParmsSet, int16_t dtcPixelMeanChn0,
-                                                       int16_t dtcPixelMeanChn1, int16_t dtcPixelMeanChn2,
-                                                       int16_t dtcPixelMeanChn3, uint64_t batchIndex);
-
-/**
- * @ingroup AscendCL
- * @brief set DtcPixelMin of type aclmdlAIPP
- *
- * @param aippParmsSet [OUT]    Pointer for aclmdlAIPP
- * @param dtcPixelMinChn0 [IN]  Min value of channel 0
- * @param dtcPixelMinChn1 [IN]  Min value of channel 1
- * @param dtcPixelMinChn2 [IN]  Min value of channel 2
- * @param dtcPixelMinChn3 [IN]  Min value of channel 3
- * @param batchIndex [IN]       Batch parameter index
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlCreateAIPP
- */
-ACL_FUNC_VISIBILITY aclError aclmdlSetAIPPDtcPixelMin(aclmdlAIPP *aippParmsSet, float dtcPixelMinChn0,
-                                                      float dtcPixelMinChn1, float dtcPixelMinChn2,
-                                                      float dtcPixelMinChn3, uint64_t batchIndex);
-
-/**
- * @ingroup AscendCL
- * @brief set PixelVarReci of type aclmdlAIPP
- *
- * @param aippParmsSet [OUT]       Pointer for aclmdlAIPP
- * @param dtcPixelVarReciChn0 [IN] sfr_dtc_pixel_variance_reci_ch0
- * @param dtcPixelVarReciChn1 [IN] sfr_dtc_pixel_variance_reci_ch1
- * @param dtcPixelVarReciChn2 [IN] sfr_dtc_pixel_variance_reci_ch2
- * @param dtcPixelVarReciChn3 [IN] sfr_dtc_pixel_variance_reci_ch3
- * @param batchIndex [IN]          Batch parameter index
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlCreateAIPP
- */
-ACL_FUNC_VISIBILITY aclError aclmdlSetAIPPPixelVarReci(aclmdlAIPP *aippParmsSet, float dtcPixelVarReciChn0,
-                                                       float dtcPixelVarReciChn1, float dtcPixelVarReciChn2,
-                                                       float dtcPixelVarReciChn3, uint64_t batchIndex);
-
-/**
- * @ingroup AscendCL
- * @brief set aipp parameters to model
- *
- * @param modelId [IN]        model id
- * @param dataset [IN]        Pointer of dataset
- * @param index [IN]          index of input for aipp data(ACL_DYNAMIC_AIPP_NODE)
- * @param aippParmsSet [IN]   Pointer for aclmdlAIPP
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlLoadFromFile | aclmdlLoadFromMem | aclmdlLoadFromFileWithMem |
- * aclmdlLoadFromMemWithMem | aclmdlGetInputIndexByName | aclmdlCreateAIPP
- */
-ACL_FUNC_VISIBILITY aclError aclmdlSetInputAIPP(uint32_t modelId, aclmdlDataset *dataset, size_t index,
-                                                const aclmdlAIPP *aippParmsSet);
-
-/**
- * @ingroup AscendCL
- * @brief set aipp parameters to model
- *
- * @param modelId [IN]        model id
- * @param dataset [IN]        Pointer of dataset
- * @param index [IN]          index of input for data which linked dynamic aipp(ACL_DATA_WITH_DYNAMIC_AIPP)
- * @param aippParmsSet [IN]   Pointer for aclmdlAIPP
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlLoadFromFile | aclmdlLoadFromMem | aclmdlLoadFromFileWithMem |
- * aclmdlLoadFromMemWithMem | aclmdlGetInputIndexByName | aclmdlCreateAIPP
- */
-ACL_FUNC_VISIBILITY aclError aclmdlSetAIPPByInputIndex(uint32_t modelId, aclmdlDataset *dataset, size_t index,
-                                                       const aclmdlAIPP *aippParmsSet);
-
-/**
- * @ingroup AscendCL
- * @brief get input aipp type
- *
- * @param modelId [IN]        model id
- * @param index [IN]          index of input
- * @param type [OUT]          aipp type for input.refrer to aclmdlInputAippType(enum)
- * @param dynamicAttachedDataIndex [OUT]     index for dynamic attached data(ACL_DYNAMIC_AIPP_NODE)
- *        valid when type is ACL_DATA_WITH_DYNAMIC_AIPP, invalid value is ACL_INVALID_NODE_INDEX
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclmdlLoadFromFile | aclmdlLoadFromMem | aclmdlLoadFromFileWithMem |
- * aclmdlLoadFromMemWithMem | aclmdlGetInputIndexByName | aclmdlCreateAIPP
- */
-ACL_FUNC_VISIBILITY aclError aclmdlGetAippType(uint32_t modelId, size_t index, aclmdlInputAippType *type,
-                                               size_t *dynamicAttachedDataIndex);
-
-/**
- * @ingroup AscendCL
- * @brief get static aipp parameters from model
- *
- * @param modelId [IN]        model id
- * @param index [IN]          index of tensor
- * @param aippinfo [OUT]      Pointer for static aipp info
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval ACL_ERROR_MODEL_AIPP_NOT_EXIST The tensor of index is not configured with aipp
- * @retval OtherValues Failure
- *
- * @see aclmdlLoadFromFile | aclmdlLoadFromMem | aclmdlLoadFromFileWithMem |
- * aclmdlLoadFromMemWithMem | aclmdlGetInputIndexByName
- */
-ACL_FUNC_VISIBILITY aclError aclmdlGetFirstAippInfo(uint32_t modelId, size_t index, aclAippInfo *aippinfo);
-
-/**
- * @ingroup AscendCL
- * @brief get op description info
- *
- * @param deviceId [IN]       device id
- * @param streamId [IN]       stream id
- * @param taskId [IN]         task id
- * @param opName [OUT]        pointer to op name
- * @param opNameLen [IN]      the length of op name
- * @param inputDesc [OUT]     pointer to input description
- * @param numInputs [OUT]     the number of input tensor
- * @param outputDesc [OUT]    pointer to output description
- * @param numOutputs [OUT]    the number of output tensor
- *
- * @retval ACL_SUCCESS The function is successfully executed
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlCreateAndGetOpDesc(uint32_t deviceId, uint32_t streamId, uint32_t taskId,
-                                                      char *opName, size_t opNameLen, aclTensorDesc **inputDesc,
-                                                      size_t *numInputs, aclTensorDesc **outputDesc,
-                                                      size_t *numOutputs);
-
-/**
- * @ingroup AscendCL
- * @brief init dump
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlInitDump();
-
-/**
- * @ingroup AscendCL
- * @brief set param of dump
- *
- * @param dumpCfgPath [IN]   the path of dump config
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlSetDump(const char *dumpCfgPath);
-
-/**
- * @ingroup AscendCL
- * @brief finalize dump.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclmdlFinalizeDump();
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // INC_EXTERNAL_ACL_ACL_MODEL_H_
diff --git a/inc/external/acl/acl_prof.h b/inc/external/acl/acl_prof.h
deleted file mode 100644
index bfb8a68b..00000000
--- a/inc/external/acl/acl_prof.h
+++ /dev/null
@@ -1,297 +0,0 @@
-/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef INC_EXTERNAL_ACL_PROF_H_
-#define INC_EXTERNAL_ACL_PROF_H_
-
-#include "acl_base.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define ACL_PROF_ACL_API 0x0001
-#define ACL_PROF_TASK_TIME 0x0002
-#define ACL_PROF_AICORE_METRICS 0x0004
-#define ACL_PROF_AICPU_TRACE 0x0008
-
-#define ACL_PROF_MAX_OP_NAME_LEN 257
-#define ACL_PROF_MAX_OP_TYPE_LEN 65
-
-typedef enum {
-  ACL_AICORE_ARITHMATIC_THROUGHPUT = 0,
-  ACL_AICORE_PIPELINE = 1,
-  ACL_AICORE_SYNCHRONIZATION = 2,
-  ACL_AICORE_MEMORY = 3,
-  ACL_AICORE_INTERNAL_MEMORY = 4,
-  ACL_AICORE_STALL = 5,
-  ACL_AICORE_NONE = 0xFF
-} aclprofAicoreMetrics;
-
-typedef struct aclprofConfig aclprofConfig;
-typedef struct aclprofStopConfig aclprofStopConfig;
-typedef struct aclprofAicoreEvents aclprofAicoreEvents;
-typedef struct aclprofSubscribeConfig aclprofSubscribeConfig;
-
-/**
- * @ingroup AscendCL
- * @brief profiling initialize
- *
- * @param  profilerResultPath [IN]  path of profiling result
- * @param  length [IN]              length of profilerResultPath
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclprofFinalize
- */
-ACL_FUNC_VISIBILITY aclError aclprofInit(const char *profilerResultPath, size_t length);
-
-/**
- * @ingroup AscendCL
- * @brief profiling finalize
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclprofInit
- */
-ACL_FUNC_VISIBILITY aclError aclprofFinalize();
-
-/**
- * @ingroup AscendCL
- * @brief Start profiling modules by profilerConfig
- *
- * @param  profilerConfig [IN]  config of profiling
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclprofStop
- */
-ACL_FUNC_VISIBILITY aclError aclprofStart(const aclprofConfig *profilerConfig);
-
-/**
- * @ingroup AscendCL
- * @brief Create data of type aclprofConfig
- *
- * @param  deviceIdList [IN]      list of device id
- * @param  deviceNums [IN]        number of devices
- * @param  aicoreMetrics [IN]     type of aicore metrics
- * @param  aicoreEvents [IN]      pointer to aicore events, only support NULL now
- * @param  dataTypeConfig [IN]    config modules need profiling
- *
- * @retval the aclprofConfig pointer
- *
- * @see aclprofDestroyConfig
- */
-ACL_FUNC_VISIBILITY aclprofConfig *aclprofCreateConfig(uint32_t *deviceIdList, uint32_t deviceNums,
-                                                       aclprofAicoreMetrics aicoreMetrics,
-                                                       aclprofAicoreEvents *aicoreEvents, uint64_t dataTypeConfig);
-
-/**
- * @ingroup AscendCL
- * @brief Destroy data of type aclprofConfig
- *
- * @param  profilerConfig [IN]  config of profiling
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclprofCreateConfig
- */
-ACL_FUNC_VISIBILITY aclError aclprofDestroyConfig(const aclprofConfig *profilerConfig);
-
-/**
- * @ingroup AscendCL
- * @brief stop profiling modules by stopProfilingConfig
- *
- * @param  profilerConfig [IN]  pointer to stop config of profiling
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclprofStart
- */
-ACL_FUNC_VISIBILITY aclError aclprofStop(const aclprofConfig *profilerConfig);
-
-/**
- * @ingroup AscendCL
- * @brief subscribe profiling data of model
- *
- * @param  modelId [IN]              the model id subscribed
- * @param  profSubscribeConfig [IN]  pointer to config of model subscribe
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclprofModelUnSubscribe
- */
-ACL_FUNC_VISIBILITY aclError aclprofModelSubscribe(uint32_t modelId, const aclprofSubscribeConfig *profSubscribeConfig);
-
-/**
- * @ingroup AscendCL
- * @brief unsubscribe profiling data of model
- *
- * @param  modelId [IN]  the model id unsubscribed
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclprofModelSubscribe
- */
-ACL_FUNC_VISIBILITY aclError aclprofModelUnSubscribe(uint32_t modelId);
-
-/**
- * @ingroup AscendCL
- * @brief create subscribe config
- *
- * @param  timeInfoSwitch [IN] switch whether get time info from model
- * @param  aicoreMetrics [IN]  aicore metrics
- * @param  fd [IN]             pointer to write pipe
- *
- * @retval the aclprofSubscribeConfig pointer
- *
- * @see aclprofDestroySubscribeConfig
- */
-ACL_FUNC_VISIBILITY aclprofSubscribeConfig *aclprofCreateSubscribeConfig(int8_t timeInfoSwitch,
-                                                                         aclprofAicoreMetrics aicoreMetrics, void *fd);
-
-/**
- * @ingroup AscendCL
- * @brief destroy subscribe config
- *
- * @param  profSubscribeConfig [IN]  subscribe config
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclprofCreateSubscribeConfig
- */
-ACL_FUNC_VISIBILITY aclError aclprofDestroySubscribeConfig(const aclprofSubscribeConfig *profSubscribeConfig);
-
-/**
- * @ingroup AscendCL
- * @brief create subscribe config
- *
- * @param  opDescSize [OUT]  size of op desc
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclprofGetOpDescSize(size_t *opDescSize);
-
-/**
- * @ingroup AscendCL
- * @brief get op number from subscription data
- *
- * @param  opInfo [IN]     pointer to subscription data
- * @param  opInfoLen [IN]  memory size of subscription data
- * @param  opNumber [OUT]  op number of subscription data
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclprofGetOpNum(const void *opInfo, size_t opInfoLen, uint32_t *opNumber);
-
-/**
- * @ingroup AscendCL
- * @brief get op type from subscription data
- *
- * @param  opInfo [IN]      pointer to subscription data
- * @param  opInfoLen [IN]   memory size of subscription data
- * @param  index [IN]       index of op array in opInfo
- * @param  opType [OUT]     obtained op type string
- * @param  opTypeLen [IN]   obtained length of op type string
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclprofGetOpType(const void *opInfo, size_t opInfoLen, uint32_t index, char *opType,
-                                              size_t opTypeLen);
-
-/**
- * @ingroup AscendCL
- * @brief get op type from subscription data
- *
- * @param  opInfo [IN]      pointer to subscription data
- * @param  opInfoLen [IN]   memory size of subscription data
- * @param  index [IN]       index of op array in opInfo
- * @param  opName [OUT]     obtained op name string
- * @param  opNameLen [IN]   obtained length of op name string
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclprofGetOpName(const void *opInfo, size_t opInfoLen, uint32_t index, char *opName,
-                                              size_t opNameLen);
-
-/**
- * @ingroup AscendCL
- * @brief get start time of specified op from subscription data
- *
- * @param  opInfo [IN]     pointer to subscription data
- * @param  opInfoLen [IN]  memory size of subscription data
- * @param  index [IN]      index of op array in opInfo
- *
- * @retval start time(us) of specified op with timestamp
- * @retval 0 for failed
- */
-ACL_FUNC_VISIBILITY uint64_t aclprofGetOpStart(const void *opInfo, size_t opInfoLen, uint32_t index);
-
-/**
- * @ingroup AscendCL
- * @brief get end time of specified op from subscription data
- *
- * @param  opInfo [IN]     pointer to subscription data
- * @param  opInfoLen [IN]  memory size of subscription data
- * @param  index [IN]      index of op array in opInfo
- *
- * @retval end time(us) of specified op with timestamp
- * @retval 0 for failed
- */
-ACL_FUNC_VISIBILITY uint64_t aclprofGetOpEnd(const void *opInfo, size_t opInfoLen, uint32_t index);
-
-/**
- * @ingroup AscendCL
- * @brief get excution time of specified op from subscription data
- *
- * @param  opInfo [IN]     pointer to subscription data
- * @param  opInfoLen [IN]  memory size of subscription data
- * @param  index [IN]      index of op array in opInfo
- *
- * @retval execution time(us) of specified op with timestamp
- * @retval 0 for failed
- */
-ACL_FUNC_VISIBILITY uint64_t aclprofGetOpDuration(const void *opInfo, size_t opInfoLen, uint32_t index);
-
-/**
- * @ingroup AscendCL
- * @brief get model id from subscription data
- *
- * @param  opInfo [IN]     pointer to subscription data
- * @param  opInfoLen [IN]  memory size of subscription data
- *
- * @retval model id of subscription data
- * @retval 0 for failed
- */
-ACL_FUNC_VISIBILITY size_t aclprofGetModelId(const void *opInfo, size_t opInfoLen, uint32_t index);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // INC_EXTERNAL_ACL_PROF_H_
\ No newline at end of file
diff --git a/inc/external/acl/acl_rt.h b/inc/external/acl/acl_rt.h
deleted file mode 100644
index eb6b4240..00000000
--- a/inc/external/acl/acl_rt.h
+++ /dev/null
@@ -1,932 +0,0 @@
-/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef INC_EXTERNAL_ACL_ACL_RT_H_
-#define INC_EXTERNAL_ACL_ACL_RT_H_
-
-#include <stdint.h>
-#include <stddef.h>
-#include "acl_base.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef enum aclrtRunMode {
-  ACL_DEVICE,
-  ACL_HOST,
-} aclrtRunMode;
-
-typedef enum aclrtTsId {
-  ACL_TS_ID_AICORE = 0,
-  ACL_TS_ID_AIVECTOR = 1,
-  ACL_TS_ID_RESERVED = 2,
-} aclrtTsId;
-
-typedef enum aclrtEventStatus {
-  ACL_EVENT_STATUS_COMPLETE = 0,
-  ACL_EVENT_STATUS_NOT_READY = 1,
-  ACL_EVENT_STATUS_RESERVED = 2,
-} aclrtEventStatus;
-
-typedef enum aclrtCallbackBlockType {
-  ACL_CALLBACK_NO_BLOCK,
-  ACL_CALLBACK_BLOCK,
-} aclrtCallbackBlockType;
-
-typedef enum aclrtMemcpyKind {
-  ACL_MEMCPY_HOST_TO_HOST,
-  ACL_MEMCPY_HOST_TO_DEVICE,
-  ACL_MEMCPY_DEVICE_TO_HOST,
-  ACL_MEMCPY_DEVICE_TO_DEVICE,
-} aclrtMemcpyKind;
-
-typedef enum aclrtMemMallocPolicy {
-  ACL_MEM_MALLOC_HUGE_FIRST,
-  ACL_MEM_MALLOC_HUGE_ONLY,
-  ACL_MEM_MALLOC_NORMAL_ONLY,
-  ACL_MEM_MALLOC_HUGE_FIRST_P2P,
-  ACL_MEM_MALLOC_HUGE_ONLY_P2P,
-  ACL_MEM_MALLOC_NORMAL_ONLY_P2P,
-} aclrtMemMallocPolicy;
-
-typedef enum aclrtMemAttr {
-  ACL_DDR_MEM,
-  ACL_HBM_MEM,
-  ACL_DDR_MEM_HUGE,
-  ACL_DDR_MEM_NORMAL,
-  ACL_HBM_MEM_HUGE,
-  ACL_HBM_MEM_NORMAL,
-  ACL_DDR_MEM_P2P_HUGE,
-  ACL_DDR_MEM_P2P_NORMAL,
-  ACL_HBM_MEM_P2P_HUGE,
-  ACL_HBM_MEM_P2P_NORMAL,
-} aclrtMemAttr;
-
-typedef enum aclrtGroupAttr {
-  ACL_GROUP_AICORE_INT,
-  ACL_GROUP_AIV_INT,
-  ACL_GROUP_AIC_INT,
-  ACL_GROUP_SDMANUM_INT,
-  ACL_GROUP_ASQNUM_INT
-} aclrtGroupAttr;
-
-typedef struct tagRtGroupInfo aclrtGroupInfo;
-
-typedef struct rtExceptionInfo aclrtExceptionInfo;
-
-typedef void (*aclrtCallback)(void *userData);
-
-typedef void (*aclrtExceptionInfoCallback)(aclrtExceptionInfo *exceptionInfo);
-
-/**
- * @ingroup AscendCL
- * @brief Set a callback function to handle exception information
- *
- * @param callback [IN] callback function to handle exception information
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtSetExceptionInfoCallback(aclrtExceptionInfoCallback callback);
-
-/**
- * @ingroup AscendCL
- * @brief Get task id from exception information
- *
- * @param info [IN]   pointer of exception information
- *
- * @retval The task id from exception information
- * @retval 0xFFFFFFFF if info is null
- */
-ACL_FUNC_VISIBILITY uint32_t aclrtGetTaskIdFromExceptionInfo(const aclrtExceptionInfo *info);
-
-/**
- * @ingroup AscendCL
- * @brief Get stream id from exception information
- *
- * @param info [IN]   pointer of exception information
- *
- * @retval The stream id from exception information
- * @retval 0xFFFFFFFF if info is null
- */
-ACL_FUNC_VISIBILITY uint32_t aclrtGetStreamIdFromExceptionInfo(const aclrtExceptionInfo *info);
-
-/**
- * @ingroup AscendCL
- * @brief Get thread id from exception information
- *
- * @param info [IN]   pointer of exception information
- *
- * @retval The thread id of fail task
- * @retval 0xFFFFFFFF if info is null
- */
-ACL_FUNC_VISIBILITY uint32_t aclrtGetThreadIdFromExceptionInfo(const aclrtExceptionInfo *info);
-
-/**
- * @ingroup AscendCL
- * @brief Get device id from exception information
- *
- * @param info [IN]   pointer of exception information
- *
- * @retval The thread id of fail task
- * @retval 0xFFFFFFFF if info is null
- */
-ACL_FUNC_VISIBILITY uint32_t aclrtGetDeviceIdFromExceptionInfo(const aclrtExceptionInfo *info);
-
-/**
- * @ingroup AscendCL
- * @brief The thread that handles the callback function on the Stream
- *
- * @param threadId [IN] thread ID
- * @param stream [IN]   stream handle
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtSubscribeReport(uint64_t threadId, aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief Add a callback function to be executed on the host
- *        to the task queue of the Stream
- *
- * @param fn [IN]   Specify the callback function to be added
- *                  The function prototype of the callback function is:
- *                  typedef void (*aclrtCallback)(void *userData);
- * @param userData [IN]   User data to be passed to the callback function
- * @param blockType [IN]  callback block type
- * @param stream [IN]     stream handle
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtLaunchCallback(aclrtCallback fn, void *userData, aclrtCallbackBlockType blockType,
-                                                 aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief After waiting for a specified time, trigger callback processing
- *
- * @par Function
- *  The thread processing callback specified by
- *  the aclrtSubscribeReport interface
- *
- * @param timeout [IN]   timeout value
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtSubscribeReport
- */
-ACL_FUNC_VISIBILITY aclError aclrtProcessReport(int32_t timeout);
-
-/**
- * @ingroup AscendCL
- * @brief Cancel thread registration,
- *        the callback function on the specified Stream
- *        is no longer processed by the specified thread
- *
- * @param threadId [IN]   thread ID
- * @param stream [IN]     stream handle
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtUnSubscribeReport(uint64_t threadId, aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief create context and associates it with the calling thread
- *
- * @par Function
- * The following use cases are supported:
- * @li If you don't call the aclrtCreateContext interface
- * to explicitly create the context,
- * the system will use the default context, which is implicitly created
- * when the aclrtSetDevice interface is called.
- * @li If multiple contexts are created in a process
- * (there is no limit on the number of contexts),
- * the current thread can only use one of them at the same time.
- * It is recommended to explicitly specify the context of the current thread
- * through the aclrtSetCurrentContext interface to increase.
- * the maintainability of the program.
- *
- * @param  context [OUT]    point to the created context
- * @param  deviceId [IN]    device to create context on
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtSetDevice | aclrtSetCurrentContext
- */
-ACL_FUNC_VISIBILITY aclError aclrtCreateContext(aclrtContext *context, int32_t deviceId);
-
-/**
- * @ingroup AscendCL
- * @brief destroy context instance
- *
- * @par Function
- * Can only destroy context created through aclrtCreateContext interface
- *
- * @param  context [IN]   the context to destroy
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtCreateContext
- */
-ACL_FUNC_VISIBILITY aclError aclrtDestroyContext(aclrtContext context);
-
-/**
- * @ingroup AscendCL
- * @brief set the context of the thread
- *
- * @par Function
- * The following scenarios are supported:
- * @li If the aclrtCreateContext interface is called in a thread to explicitly
- * create a Context (for example: ctx1), the thread's Context can be specified
- * without calling the aclrtSetCurrentContext interface.
- * The system uses ctx1 as the context of thread1 by default.
- * @li If the aclrtCreateContext interface is not explicitly created,
- * the system uses the default context as the context of the thread.
- * At this time, the aclrtDestroyContext interface cannot be used to release
- * the default context.
- * @li If the aclrtSetCurrentContext interface is called multiple times to
- * set the thread's Context, the last one prevails.
- *
- * @par Restriction
- * @li If the cevice corresponding to the context set for the thread
- * has been reset, you cannot set the context as the context of the thread,
- * otherwise a business exception will result.
- * @li It is recommended to use the context created in a thread.
- * If the aclrtCreateContext interface is called in thread A to create a context,
- * and the context is used in thread B,
- * the user must guarantee the execution order of tasks in the same stream
- * under the same context in two threads.
- *
- * @param  context [IN]   the current context of the thread
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtCreateContext | aclrtDestroyContext
- */
-ACL_FUNC_VISIBILITY aclError aclrtSetCurrentContext(aclrtContext context);
-
-/**
- * @ingroup AscendCL
- * @brief get the context of the thread
- *
- * @par Function
- * If the user calls the aclrtSetCurrentContext interface
- * multiple times to set the context of the current thread,
- * then the last set context is obtained
- *
- * @param  context [OUT]   the current context of the thread
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtSetCurrentContext
- */
-ACL_FUNC_VISIBILITY aclError aclrtGetCurrentContext(aclrtContext *context);
-
-/**
- * @ingroup AscendCL
- * @brief Specify the device to use for the operation
- * implicitly create the default context and the default stream
- *
- * @par Function
- * The following use cases are supported:
- * @li Device can be specified in the process or thread.
- * If you call the aclrtSetDevice interface multiple
- * times to specify the same device,
- * you only need to call the aclrtResetDevice interface to reset the device.
- * @li The same device can be specified for operation
- *  in different processes or threads.
- * @li Device is specified in a process,
- * and multiple threads in the process can share this device to explicitly
- * create a Context (aclrtCreateContext interface).
- * @li In multi-device scenarios, you can switch to other devices
- * through the aclrtSetDevice interface in the process.
- *
- * @param  deviceId [IN]  the device id
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtResetDevice |aclrtCreateContext
- */
-ACL_FUNC_VISIBILITY aclError aclrtSetDevice(int32_t deviceId);
-
-/**
- * @ingroup AscendCL
- * @brief Reset the current operating Device and free resources on the device,
- * including the default context, the default stream,
- * and all streams created under the default context,
- * and synchronizes the interface.
- * If the task under the default context or stream has not been completed,
- * the system will wait for the task to complete before releasing it.
- *
- * @par Restriction
- * @li The Context, Stream, and Event that are explicitly created
- * on the device to be reset. Before resetting,
- * it is recommended to follow the following interface calling sequence,
- * otherwise business abnormalities may be caused.
- * @li Interface calling sequence:
- * call aclrtDestroyEvent interface to release Event or
- * call aclrtDestroyStream interface to release explicitly created Stream->
- * call aclrtDestroyContext to release explicitly created Context->
- * call aclrtResetDevice interface
- *
- * @param  deviceId [IN]   the device id
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtResetDevice(int32_t deviceId);
-
-/**
- * @ingroup AscendCL
- * @brief get target device of current thread
- *
- * @param deviceId [OUT]  the device id
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtGetDevice(int32_t *deviceId);
-
-/**
- * @ingroup AscendCL
- * @brief get target side
- *
- * @param runMode [OUT]    the run mode
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtGetRunMode(aclrtRunMode *runMode);
-
-/**
- * @ingroup AscendCL
- * @brief Wait for compute device to finish
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtSynchronizeDevice(void);
-
-/**
- * @ingroup AscendCL
- * @brief Set Scheduling TS
- *
- * @param tsId [IN]   the ts id
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtSetTsDevice(aclrtTsId tsId);
-
-/**
- * @ingroup AscendCL
- * @brief get total device number.
- *
- * @param count [OUT]    the device number
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtGetDeviceCount(uint32_t *count);
-
-/**
- * @ingroup AscendCL
- * @brief create event instance
- *
- * @param event [OUT]   created event
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtCreateEvent(aclrtEvent *event);
-
-/**
- * @ingroup AscendCL
- * @brief destroy event instance
- *
- * @par Function
- *  Only events created through the aclrtCreateEvent interface can be
- *  destroyed, synchronous interfaces. When destroying an event,
- *  the user must ensure that the tasks involved in the aclrtSynchronizeEvent
- *  interface or the aclrtStreamWaitEvent interface are completed before
- *  they are destroyed.
- *
- * @param  event [IN]   event to destroy
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtCreateEvent | aclrtSynchronizeEvent | aclrtStreamWaitEvent
- */
-ACL_FUNC_VISIBILITY aclError aclrtDestroyEvent(aclrtEvent event);
-
-/**
- * @ingroup AscendCL
- * @brief Record an Event in the Stream
- *
- * @param event [IN]    event to record
- * @param stream [IN]   stream handle
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtRecordEvent(aclrtEvent event, aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief Reset an event
- *
- * @par Function
- *  Users need to make sure to wait for the tasks in the Stream
- *  to complete before resetting the Event
- *
- * @param event [IN]    event to reset
- * @param stream [IN]   stream handle
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtResetEvent(aclrtEvent event, aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief Queries an event's status
- *
- * @param  event [IN]    event to query
- * @param  status [OUT]  event status
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtQueryEvent(aclrtEvent event, aclrtEventStatus *status);
-
-/**
- * @ingroup AscendCL
- * @brief Block Host Running, wait event to be complete
- *
- * @param  event [IN]   event to wait
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtSynchronizeEvent(aclrtEvent event);
-
-/**
- * @ingroup AscendCL
- * @brief computes the elapsed time between events.
- *
- * @param ms [OUT]     time between start and end in ms
- * @param start [IN]   starting event
- * @param end [IN]     ending event
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtCreateEvent | aclrtRecordEvent | aclrtSynchronizeStream
- */
-ACL_FUNC_VISIBILITY aclError aclrtEventElapsedTime(float *ms, aclrtEvent start, aclrtEvent end);
-
-/**
- * @ingroup AscendCL
- * @brief alloc memory on device
- *
- * @par Function
- *  alloc for size linear memory on device
- *  and return a pointer to allocated memory by *devPtr
- *
- * @par Restriction
- * @li The memory requested by the aclrtMalloc interface needs to be released
- * through the aclrtFree interface.
- * @li Before calling the media data processing interface,
- * if you need to apply memory on the device to store input or output data,
- * you need to call acldvppMalloc to apply for memory.
- *
- * @param devPtr [OUT]  pointer to pointer to allocated memory on device
- * @param size [IN]     alloc memory size
- * @param policy [IN]   memory alloc policy
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtFree | acldvppMalloc | aclrtMallocCached
- */
-ACL_FUNC_VISIBILITY aclError aclrtMalloc(void **devPtr, size_t size, aclrtMemMallocPolicy policy);
-
-/**
- * @ingroup AscendCL
- * @brief allocate memory on device with cache
- *
- * @par Function
- *  alloc for size linear memory on device
- *  and return a pointer to allocated memory by *devPtr
- *
- * @par Restriction
- * @li The memory requested by the aclrtMallocCached interface needs to be released
- * through the aclrtFree interface.
- *
- * @param devPtr [OUT]  pointer to pointer to allocated memory on device
- * @param size [IN]     alloc memory size
- * @param policy [IN]   memory alloc policy
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtFree | aclrtMalloc
- */
-ACL_FUNC_VISIBILITY aclError aclrtMallocCached(void **devPtr, size_t size, aclrtMemMallocPolicy policy);
-
-/**
- * @ingroup AscendCL
- * @brief flush cache data to ddr
- *
- * @param devPtr [IN]  the pointer that flush data to ddr
- * @param size [IN]    flush size
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtMemFlush(void *devPtr, size_t size);
-
-/**
- * @ingroup AscendCL
- * @brief invalidate cache data
- *
- * @param devPtr [IN]  pointer to invalidate cache data
- * @param size [IN]    invalidate size
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtMemInvalidate(void *devPtr, size_t size);
-
-/**
- * @ingroup AscendCL
- * @brief free device memory
- *
- * @par Function
- *  can only free memory allocated through the aclrtMalloc interface
- *
- * @param  devPtr [IN]  Pointer to memory to be freed
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtMalloc
- */
-ACL_FUNC_VISIBILITY aclError aclrtFree(void *devPtr);
-
-/**
- * @ingroup AscendCL
- * @brief alloc memory on host
- *
- * @par Restriction
- * @li The requested memory cannot be used in the Device
- * and needs to be explicitly copied to the Device.
- * @li The memory requested by the aclrtMallocHost interface
- * needs to be released through the aclrtFreeHost interface.
- *
- * @param  hostPtr [OUT] pointer to pointer to allocated memory on the host
- * @param  size [IN]     alloc memory size
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtFreeHost
- */
-ACL_FUNC_VISIBILITY aclError aclrtMallocHost(void **hostPtr, size_t size);
-
-/**
- * @ingroup AscendCL
- * @brief free host memory
- *
- * @par Function
- *  can only free memory allocated through the aclrtMallocHost interface
- *
- * @param  hostPtr [IN]   free memory pointer
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtMallocHost
- */
-ACL_FUNC_VISIBILITY aclError aclrtFreeHost(void *hostPtr);
-
-/**
- * @ingroup AscendCL
- * @brief synchronous memory replication between host and device
- *
- * @param dst [IN]       destination address pointer
- * @param destMax [IN]   Max length of the destination address memory
- * @param src [IN]       source address pointer
- * @param count [IN]     the length of byte to copy
- * @param kind [IN]      memcpy type
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtMemcpy(void *dst, size_t destMax, const void *src, size_t count,
-                                         aclrtMemcpyKind kind);
-
-/**
- * @ingroup AscendCL
- * @brief Initialize memory and set contents of memory to specified value
- *
- * @par Function
- *  The memory to be initialized is on the Host or device side,
- *  and the system determines whether
- *  it is host or device according to the address
- *
- * @param devPtr [IN]    Starting address of memory
- * @param maxCount [IN]  Max length of destination address memory
- * @param value [IN]     Set value
- * @param count [IN]     The length of memory
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtMemset(void *devPtr, size_t maxCount, int32_t value, size_t count);
-
-/**
- * @ingroup AscendCL
- * @brief  Asynchronous memory replication between Host and Device
- *
- * @par Function
- *  After calling this interface,
- *  be sure to call the aclrtSynchronizeStream interface to ensure that
- *  the task of memory replication has been completed
- *
- * @par Restriction
- * @li For on-chip Device-to-Device memory copy,
- *     both the source and destination addresses must be 64-byte aligned
- *
- * @param dst [IN]     destination address pointer
- * @param destMax [IN] Max length of destination address memory
- * @param src [IN]     source address pointer
- * @param count [IN]   the number of byte to copy
- * @param kind [IN]    memcpy type
- * @param stream [IN]  asynchronized task stream
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtSynchronizeStream
- */
-ACL_FUNC_VISIBILITY aclError aclrtMemcpyAsync(void *dst, size_t destMax, const void *src, size_t count,
-                                              aclrtMemcpyKind kind, aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief Asynchronous initialize memory
- * and set contents of memory to specified value async
- *
- * @par Function
- *  The memory to be initialized is on the Host or device side,
- *  and the system determines whether
- *  it is host or device according to the address
- *
- * @param devPtr [IN]      destination address pointer
- * @param maxCount [IN]    Max length of destination address memory
- * @param value [IN]       set value
- * @param count [IN]       the number of byte to set
- * @param stream [IN]      asynchronized task stream
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtSynchronizeStream
- */
-ACL_FUNC_VISIBILITY aclError aclrtMemsetAsync(void *devPtr, size_t maxCount, int32_t value, size_t count,
-                                              aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief  create stream instance
- *
- * @param  stream [OUT]   the created stream
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtCreateStream(aclrtStream *stream);
-
-/**
- * @ingroup AscendCL
- * @brief destroy stream instance
- *
- * @par Function
- * Can only destroy streams created through the aclrtCreateStream interface
- *
- * @par Restriction
- * Before calling the aclrtDestroyStream interface to destroy
- * the specified Stream, you need to call the aclrtSynchronizeStream interface
- * to ensure that the tasks in the Stream have been completed.
- *
- * @param stream [IN]  the stream to destroy
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtCreateStream | aclrtSynchronizeStream
- */
-ACL_FUNC_VISIBILITY aclError aclrtDestroyStream(aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief block the host until all tasks
- * in the specified stream have completed
- *
- * @param  stream [IN]   the stream to wait
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtSynchronizeStream(aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief Blocks the operation of the specified Stream until
- * the specified Event is completed.
- * Support for multiple streams waiting for the same event.
- *
- * @param  stream [IN]   the wait stream If using thedefault Stream, set NULL
- * @param  event [IN]    the event to wait
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtStreamWaitEvent(aclrtStream stream, aclrtEvent event);
-
-/**
- * @ingroup AscendCL
- * @brief set group
- *
- * @par Function
- *  set the task to the corresponding group
- *
- * @param groupId [IN]   group id
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtGetGroupCount | aclrtGetAllGroupInfo | aclrtGetGroupInfoDetail
- */
-ACL_FUNC_VISIBILITY aclError aclrtSetGroup(int32_t groupId);
-
-/**
- * @ingroup AscendCL
- * @brief get the number of group
- *
- * @par Function
- *  get the number of group. if the number of group is zero,
- *  it means that group is not supported or group is not created.
- *
- * @param count [OUT]   the number of group
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- */
-ACL_FUNC_VISIBILITY aclError aclrtGetGroupCount(uint32_t *count);
-
-/**
- * @ingroup AscendCL
- * @brief create group information
- *
- * @retval null for failed.
- * @retval OtherValues success.
- *
- * @see aclrtDestroyGroupInfo
- */
-ACL_FUNC_VISIBILITY aclrtGroupInfo *aclrtCreateGroupInfo();
-
-/**
- * @ingroup AscendCL
- * @brief destroy group information
- *
- * @param groupInfo [IN]   pointer to group information
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtCreateGroupInfo
- */
-ACL_FUNC_VISIBILITY aclError aclrtDestroyGroupInfo(aclrtGroupInfo *groupInfo);
-
-/**
- * @ingroup AscendCL
- * @brief get all group information
- *
- * @param groupInfo [OUT]   pointer to group information
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtGetGroupCount
- */
-ACL_FUNC_VISIBILITY aclError aclrtGetAllGroupInfo(aclrtGroupInfo *groupInfo);
-
-/**
- * @ingroup AscendCL
- * @brief get detail information of group
- *
- * @param groupInfo [IN]    pointer to group information
- * @param groupId [IN]      group index value
- * @param attr [IN]         group attribute
- * @param attrValue [OUT]   pointer to attribute value
- * @param valueLen [IN]     length of attribute value
- * @param paramRetSize [OUT]   pointer to real length of attribute value
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtGetGroupCount | aclrtGetAllGroupInfo
- */
-ACL_FUNC_VISIBILITY aclError aclrtGetGroupInfoDetail(const aclrtGroupInfo *groupInfo, int32_t groupId,
-                                                     aclrtGroupAttr attr, void *attrValue, size_t valueLen,
-                                                     size_t *paramRetSize);
-
-/**
- * @ingroup AscendCL
- * @brief checking whether current device and peer device support the p2p feature
- *
- * @param canAccessPeer [OUT]   pointer to save the checking result
- * @param deviceId [IN]         current device id
- * @param peerDeviceId [IN]     peer device id
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtDeviceEnablePeerAccess | aclrtDeviceDisablePeerAccess
- */
-ACL_FUNC_VISIBILITY aclError aclrtDeviceCanAccessPeer(int32_t *canAccessPeer, int32_t deviceId, int32_t peerDeviceId);
-
-/**
- * @ingroup AscendCL
- * @brief enable the peer device to support the p2p feature
- *
- * @param peerDeviceId [IN]   the peer device id
- * @param flags [IN]   reserved field, now it must be zero
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtDeviceCanAccessPeer | aclrtDeviceDisablePeerAccess
- */
-ACL_FUNC_VISIBILITY aclError aclrtDeviceEnablePeerAccess(int32_t peerDeviceId, uint32_t flags);
-
-/**
- * @ingroup AscendCL
- * @brief disable the peer device to support the p2p function
- *
- * @param peerDeviceId [IN]   the peer device id
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclrtDeviceCanAccessPeer | aclrtDeviceEnablePeerAccess
- */
-ACL_FUNC_VISIBILITY aclError aclrtDeviceDisablePeerAccess(int32_t peerDeviceId);
-
-/**
- * @ingroup AscendCL
- * @brief Obtain the free memory and total memory of specified attribute.
- * the specified memory include normal memory and huge memory.
- *
- * @param attr [IN]    the memory attribute of specified device
- * @param free [OUT]   the free memory of specified device
- * @param total [OUT]  the total memory of specified device.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclrtGetMemInfo(aclrtMemAttr attr, size_t *free, size_t *total);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // INC_EXTERNAL_ACL_ACL_RT_H_
diff --git a/inc/external/acl/acl_tdt.h b/inc/external/acl/acl_tdt.h
deleted file mode 100644
index c357518d..00000000
--- a/inc/external/acl/acl_tdt.h
+++ /dev/null
@@ -1,276 +0,0 @@
-/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef INC_EXTERNAL_ACL_ACL_TDT_H_
-#define INC_EXTERNAL_ACL_ACL_TDT_H_
-
-#include "acl/acl_base.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-enum acltdtTensorType {
-  ACL_TENSOR_DATA_UNDEFINED = -1,
-  ACL_TENSOR_DATA_TENSOR,
-  ACL_TENSOR_DATA_END_OF_SEQUENCE,
-  ACL_TENSOR_DATA_ABNORMAL
-};
-
-typedef struct acltdtDataItem acltdtDataItem;
-typedef struct acltdtDataset acltdtDataset;
-typedef struct acltdtChannelHandle acltdtChannelHandle;
-
-/**
- * @ingroup AscendCL
- * @brief Get tensor type from item
- *
- * @param dataItem [IN] pointer to the data item
- *
- * @retval Tensor type.
- * @retval ACL_DT_UNDEFINED if dataItem is null
- */
-ACL_FUNC_VISIBILITY acltdtTensorType acltdtGetTensorTypeFromItem(const acltdtDataItem *dataItem);
-
-/**
- * @ingroup AscendCL
- * @brief Get data type from item
- *
- * @param dataItem [IN] pointer to the data item
- *
- * @retval Data type.
- * @retval ACL_DT_UNDEFINED if dataItem is null
- */
-ACL_FUNC_VISIBILITY aclDataType acltdtGetDataTypeFromItem(const acltdtDataItem *dataItem);
-
-/**
- * @ingroup AscendCL
- * @brief Get data address from item
- *
- * @param dataItem [IN] pointer to data item
- *
- * @retval null for failed
- * @retval OtherValues success
- */
-ACL_FUNC_VISIBILITY void *acltdtGetDataAddrFromItem(const acltdtDataItem *dataItem);
-
-/**
- * @ingroup AscendCL
- * @brief Get data size from item
- *
- * @param dataItem [IN] pointer to data item
- *
- * @retval 0 for failed
- * @retval OtherValues success
- */
-ACL_FUNC_VISIBILITY size_t acltdtGetDataSizeFromItem(const acltdtDataItem *dataItem);
-
-/**
- * @ingroup AscendCL
- * @brief Get dim's number from item
- *
- * @param dataItem [IN] pointer to data item
- *
- * @retval 0 for failed
- * @retval OtherValues success
- */
-ACL_FUNC_VISIBILITY size_t acltdtGetDimNumFromItem(const acltdtDataItem *dataItem);
-
-/**
- * @ingroup AscendCL
- * @brief Get dims from item
- *
- * @param  dataItem [IN]      the struct of data item
- * @param  dims [IN|OUT]      pointer to the dims of dataTtem
- * @param  dimNum [IN]        the size of the dims
- *
- * @retval ACL_SUCCESS  The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acltdtGetDimsFromItem(const acltdtDataItem *dataItem, int64_t *dims, size_t dimNum);
-
-/**
- * @ingroup AscendCL
- * @brief Create the struct of data item
- *
- * @param tdtType [IN]  Tdt tensor type
- * @param dims [IN]     pointer of tdtDataItem's dims
- * @param dimNum [IN]   Dim number
- * @param dataType [IN] Data type
- * @param data [IN]     Data pointer
- * @param size [IN]     Data size
- *
- * @retval null for failed
- * @retval OtherValues success
- *
- * @see acltdtDestroyDataItem
- */
-ACL_FUNC_VISIBILITY acltdtDataItem *acltdtCreateDataItem(acltdtTensorType tdtType, const int64_t *dims, size_t dimNum,
-                                                         aclDataType dataType, void *data, size_t size);
-
-/**
- * @ingroup AscendCL
- * @brief Destroy the struct of data item
- *
- * @param dataItem [IN]  pointer to the data item
- *
- * @retval ACL_SUCCESS  The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acltdtCreateDataItem
- */
-ACL_FUNC_VISIBILITY aclError acltdtDestroyDataItem(acltdtDataItem *dataItem);
-
-/**
- * @ingroup AscendCL
- * @brief Create the tdt dataset
- *
- * @retval null for failed
- * @retval OtherValues success
- *
- * @see acltdtDestroyDataset
- */
-ACL_FUNC_VISIBILITY acltdtDataset *acltdtCreateDataset();
-
-/**
- * @ingroup AscendCL
- * @brief Destroy the tdt dataset
- *
- * @param dataset [IN]  pointer to the dataset
- *
- * @retval ACL_SUCCESS  The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acltdtCreateDataset
- */
-ACL_FUNC_VISIBILITY aclError acltdtDestroyDataset(acltdtDataset *dataset);
-
-/**
- * @ingroup AscendCL
- * @brief Get the data item
- *
- * @param dataset [IN] pointer to the dataset
- * @param index [IN]   index of the dataset
- *
- * @retval null for failed
- * @retval OtherValues success
- *
- * @see acltdtAddDataItem
- */
-ACL_FUNC_VISIBILITY acltdtDataItem *acltdtGetDataItem(const acltdtDataset *dataset, size_t index);
-
-/**
- * @ingroup AscendCL
- * @brief Get the data item
- *
- * @param dataset [OUT] pointer to the dataset
- * @param dataItem [IN] pointer to the data item
- *
- * @retval ACL_SUCCESS  The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acltdtGetDataItem
- */
-ACL_FUNC_VISIBILITY aclError acltdtAddDataItem(acltdtDataset *dataset, acltdtDataItem *dataItem);
-
-/**
- * @ingroup AscendCL
- * @brief Get the size of dataset
- *
- * @param dataset [IN]  pointer to the dataset
- *
- * @retval 0 for failed
- * @retval OtherValues success
- */
-ACL_FUNC_VISIBILITY size_t acltdtGetDatasetSize(const acltdtDataset *dataset);
-
-/**
- * @ingroup AscendCL
- * @brief Stop the channel
- *
- * @param handle [IN]  pointer to the channel handle
- *
- * @retval ACL_SUCCESS  The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acltdtCreateChannel | acltdtDestroyChannel
- */
-ACL_FUNC_VISIBILITY aclError acltdtStopChannel(acltdtChannelHandle *handle);
-
-/**
- * @ingroup AscendCL
- * @brief Create the channel
- *
- * @param deviceId [IN]  the device id
- * @param name [IN]      the channel's name
- *
- * @retval null for failed
- * @retval OtherValues success
- *
- * @see acltdtStopChannel | acltdtDestroyChannel
- */
-ACL_FUNC_VISIBILITY acltdtChannelHandle *acltdtCreateChannel(uint32_t deviceId, const char *name);
-
-/**
- * @ingroup AscendCL
- * @brief Destroy the channel
- *
- * @param handle [IN]  pointer to the channel handle
- *
- * @retval ACL_SUCCESS  The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acltdtCreateChannel | acltdtStopChannel
- */
-ACL_FUNC_VISIBILITY aclError acltdtDestroyChannel(acltdtChannelHandle *handle);
-
-/**
- * @ingroup AscendCL
- * @brief Send tensor to device
- *
- * @param handle [IN]  pointer to the channel handle
- * @param dataset [IN] pointer to the dataset
- * @param timeout [IN] to be reserved, now it must be -1
- *
- * @retval ACL_SUCCESS  The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acltdtReceiveTensor
- */
-ACL_FUNC_VISIBILITY aclError acltdtSendTensor(const acltdtChannelHandle *handle, const acltdtDataset *dataset,
-                                              int32_t timeout);
-
-/**
- * @ingroup AscendCL
- * @brief Receive tensor from device
- *
- * @param handle [IN]      pointer to the channel handle
- * @param dataset [OUT]    pointer to the dataset
- * @param timeout [IN]     to be reserved, now it must be -1
- *
- * @retval ACL_SUCCESS  The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acltdtSendTensor
- */
-ACL_FUNC_VISIBILITY aclError acltdtReceiveTensor(const acltdtChannelHandle *handle, acltdtDataset *dataset,
-                                                 int32_t timeout);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // INC_EXTERNAL_ACL_ACL_TDT_H_
diff --git a/inc/external/acl/ops/acl_dvpp.h b/inc/external/acl/ops/acl_dvpp.h
deleted file mode 100644
index 32a21e91..00000000
--- a/inc/external/acl/ops/acl_dvpp.h
+++ /dev/null
@@ -1,2340 +0,0 @@
-/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#if !defined(ENABLE_DVPP_INTERFACE)
-#if defined(_MSC_VER)
-#error message("if you want to use dvpp funtions ,please use the macro definition (ENABLE_DVPP_INTERFACE).")
-#else
-#error "if you want to use dvpp funtions ,please use the macro definition (ENABLE_DVPP_INTERFACE)."
-#endif
-#endif
-
-#ifndef INC_EXTERNAL_ACL_OPS_ACL_DVPP_H_
-#define INC_EXTERNAL_ACL_OPS_ACL_DVPP_H_
-
-#include <stdint.h>
-#include <stddef.h>
-#include "acl/acl.h"
-#include "acl/acl_base.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct acldvppPicDesc acldvppPicDesc;
-typedef struct acldvppBatchPicDesc acldvppBatchPicDesc;
-typedef struct acldvppRoiConfig acldvppRoiConfig;
-typedef struct acldvppResizeConfig acldvppResizeConfig;
-typedef struct acldvppBorderConfig acldvppBorderConfig;
-typedef struct acldvppLutMap acldvppLutMap;
-typedef struct acldvppChannelDesc acldvppChannelDesc;
-typedef struct acldvppJpegeConfig acldvppJpegeConfig;
-typedef struct aclvdecChannelDesc aclvdecChannelDesc;
-typedef struct acldvppStreamDesc acldvppStreamDesc;
-typedef struct aclvdecFrameConfig aclvdecFrameConfig;
-typedef struct aclvencChannelDesc aclvencChannelDesc;
-typedef struct aclvencFrameConfig aclvencFrameConfig;
-typedef struct acldvppHist acldvppHist;
-typedef void (*aclvdecCallback)(acldvppStreamDesc *input, acldvppPicDesc *output, void *userData);
-typedef void (*aclvencCallback)(acldvppPicDesc *input, acldvppStreamDesc *output, void *userdata);
-
-// Supported Pixel Format
-enum acldvppPixelFormat {
-  PIXEL_FORMAT_YUV_400 = 0,                      // 0
-  PIXEL_FORMAT_YUV_SEMIPLANAR_420 = 1,           // 1
-  PIXEL_FORMAT_YVU_SEMIPLANAR_420 = 2,           // 2
-  PIXEL_FORMAT_YUV_SEMIPLANAR_422 = 3,           // 3
-  PIXEL_FORMAT_YVU_SEMIPLANAR_422 = 4,           // 4
-  PIXEL_FORMAT_YUV_SEMIPLANAR_444 = 5,           // 5
-  PIXEL_FORMAT_YVU_SEMIPLANAR_444 = 6,           // 6
-  PIXEL_FORMAT_YUYV_PACKED_422 = 7,              // 7
-  PIXEL_FORMAT_UYVY_PACKED_422 = 8,              // 8
-  PIXEL_FORMAT_YVYU_PACKED_422 = 9,              // 9
-  PIXEL_FORMAT_VYUY_PACKED_422 = 10,             // 10
-  PIXEL_FORMAT_YUV_PACKED_444 = 11,              // 11
-  PIXEL_FORMAT_RGB_888 = 12,                     // 12
-  PIXEL_FORMAT_BGR_888 = 13,                     // 13
-  PIXEL_FORMAT_ARGB_8888 = 14,                   // 14
-  PIXEL_FORMAT_ABGR_8888 = 15,                   // 15
-  PIXEL_FORMAT_RGBA_8888 = 16,                   // 16
-  PIXEL_FORMAT_BGRA_8888 = 17,                   // 17
-  PIXEL_FORMAT_YUV_SEMI_PLANNER_420_10BIT = 18,  // 18
-  PIXEL_FORMAT_YVU_SEMI_PLANNER_420_10BIT = 19,  // 19
-  PIXEL_FORMAT_YVU_PLANAR_420 = 20,              // 20
-  PIXEL_FORMAT_YVU_PLANAR_422,
-  PIXEL_FORMAT_YVU_PLANAR_444,
-  PIXEL_FORMAT_RGB_444 = 23,
-  PIXEL_FORMAT_BGR_444,
-  PIXEL_FORMAT_ARGB_4444,
-  PIXEL_FORMAT_ABGR_4444,
-  PIXEL_FORMAT_RGBA_4444,
-  PIXEL_FORMAT_BGRA_4444,
-  PIXEL_FORMAT_RGB_555,
-  PIXEL_FORMAT_BGR_555,
-  PIXEL_FORMAT_RGB_565,
-  PIXEL_FORMAT_BGR_565,
-  PIXEL_FORMAT_ARGB_1555,
-  PIXEL_FORMAT_ABGR_1555,
-  PIXEL_FORMAT_RGBA_1555,
-  PIXEL_FORMAT_BGRA_1555,
-  PIXEL_FORMAT_ARGB_8565,
-  PIXEL_FORMAT_ABGR_8565,
-  PIXEL_FORMAT_RGBA_8565,
-  PIXEL_FORMAT_BGRA_8565,
-  PIXEL_FORMAT_RGB_BAYER_8BPP = 50,
-  PIXEL_FORMAT_RGB_BAYER_10BPP,
-  PIXEL_FORMAT_RGB_BAYER_12BPP,
-  PIXEL_FORMAT_RGB_BAYER_14BPP,
-  PIXEL_FORMAT_RGB_BAYER_16BPP,
-  PIXEL_FORMAT_BGR_888_PLANAR = 70,
-  PIXEL_FORMAT_HSV_888_PACKAGE,
-  PIXEL_FORMAT_HSV_888_PLANAR,
-  PIXEL_FORMAT_LAB_888_PACKAGE,
-  PIXEL_FORMAT_LAB_888_PLANAR,
-  PIXEL_FORMAT_S8C1,
-  PIXEL_FORMAT_S8C2_PACKAGE,
-  PIXEL_FORMAT_S8C2_PLANAR,
-  PIXEL_FORMAT_S16C1,
-  PIXEL_FORMAT_U8C1,
-  PIXEL_FORMAT_U16C1,
-  PIXEL_FORMAT_S32C1,
-  PIXEL_FORMAT_U32C1,
-  PIXEL_FORMAT_U64C1,
-  PIXEL_FORMAT_S64C1,
-  PIXEL_FORMAT_YUV_SEMIPLANAR_440 = 1000,
-  PIXEL_FORMAT_YVU_SEMIPLANAR_440,
-  PIXEL_FORMAT_FLOAT32,
-  PIXEL_FORMAT_BUTT,
-  PIXEL_FORMAT_UNKNOWN = 10000
-};
-
-// Stream Format
-enum acldvppStreamFormat { H265_MAIN_LEVEL = 0, H264_BASELINE_LEVEL, H264_MAIN_LEVEL, H264_HIGH_LEVEL };
-
-// Supported Channel Mode
-enum acldvppChannelMode { DVPP_CHNMODE_VPC = 1, DVPP_CHNMODE_JPEGD = 2, DVPP_CHNMODE_JPEGE = 4 };
-
-// Supported Border Type
-enum acldvppBorderType { BORDER_CONSTANT = 0, BORDER_REPLICATE, BORDER_REFLECT, BORDER_REFLECT_101 };
-
-/**
- * @ingroup AscendCL
- * @brief alloc device memory for dvpp.
- *
- * @par Function
- * @li It's mainly used for allocating memory to device media data processing.
- * The requested memory meets the data processing requirements.
- * After calling this interface to request memory,
- * you must release the memory using the acldvppFree interface.
- * @li When calling the acldvppMalloc interface to apply for memory,
- * the size entered by the user is aligned upwards to 32 integer multiples,
- * and an additional 32 bytes are applied.
- *
- * @par Restriction
- * If the user uses the acldvppMalloc interface to apply for a large block of
- * memory and divide and manage the memory by himself,
- * when applying for memory, the user needs to align up to 32 integer
- * times + 32 bytes (ALIGN_UP [len] +32 words) according to
- * the actual data size of each picture Section) to manage memory.
- *
- * @param devPtr [OUT]    memory pointer.
- * @param size [IN]       memory size.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppFree
- */
-ACL_FUNC_VISIBILITY aclError acldvppMalloc(void **devPtr, size_t size);
-
-/**
- * @ingroup AscendCL
- * @brief free device memory for dvpp.
- *
- * @par Function
- * Free the memory requested through the acldvppMalloc interface
- * @param devPtr [IN]      memory pointer to free.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppMalloc
- */
-ACL_FUNC_VISIBILITY aclError acldvppFree(void *devPtr);
-
-/**
- * @ingroup AscendCL
- * @brief create DvppChannelDesc.
- *
- * @par Function
- * Create a channel for image data processing.
- * The same channel can be reused
- * and is no longer available after destruction
- *
- * @retval null for failed.
- * @retval OtherValues success.
- */
-ACL_FUNC_VISIBILITY acldvppChannelDesc *acldvppCreateChannelDesc();
-
-/**
- * @ingroup AscendCL
- * @brief destroy dvppChannelDesc.
- *
- * @par Function
- * Can only destroy channels created by the acldvppCreateChannel interface
- * @param channelDesc [IN]     the channel description.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateChannelDesc | acldvppDestroyChannel
- */
-ACL_FUNC_VISIBILITY aclError acldvppDestroyChannelDesc(acldvppChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get dvpp channel Id.
- *
- * @par Restriction
- * Interface calling sequence:
- * acldvppCreateChannelDesc --> acldvppCreateChannel -->
- * acldvppGetChannelDescChannelId
- *
- * @param channelDesc [IN]     the channel description.
- *
- * @retval channel id.
- *
- * @see acldvppCreateChannelDesc | acldvppCreateChannel
- */
-ACL_FUNC_VISIBILITY uint64_t acldvppGetChannelDescChannelId(const acldvppChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Create dvpp picture description.
- *
- * @retval null for failed.
- * @retval OtherValues success.
- */
-ACL_FUNC_VISIBILITY acldvppPicDesc *acldvppCreatePicDesc();
-
-/**
- * @ingroup AscendCL
- * @brief Destroy dvpp picture description.
- *
- * @par Function
- * Can only destroy picture description information created
- * through acldvppCreatePicDesc interface.
- * @param picDesc [IN]     dvpp picture description.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreatePicDesc
- */
-ACL_FUNC_VISIBILITY aclError acldvppDestroyPicDesc(acldvppPicDesc *picDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Set dvpp picture description's data.
- *
- * @param picDesc [OUT]   dvpp picture description.
- * @param dataDev [IN]    dvpp picture dataDev.Must be the memory
- *                        requested using the acldvppMalloc interface.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppMalloc
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetPicDescData(acldvppPicDesc *picDesc, void *dataDev);
-
-/**
- * @ingroup AscendCL
- * @brief Set dvpp picture description's size.
- *
- * @param picDesc [OUT]      dvpp picture description.
- * @param size dvpp [IN]     picture size.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetPicDescSize(acldvppPicDesc *picDesc, uint32_t size);
-
-/**
- * @ingroup AscendCL
- * @brief Set dvpp picture description's format.
- *
- * @param picDesc [OUT]    dvpp picture description.
- * @param format [IN]      dvpp picture format.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetPicDescFormat(acldvppPicDesc *picDesc, acldvppPixelFormat format);
-
-/**
- * @ingroup AscendCL
- * @brief Set dvpp picture description's width.
- *
- * @param picDesc [OUT]   dvpp picture description.
- * @param width [IN]      dvpp picture width.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetPicDescWidth(acldvppPicDesc *picDesc, uint32_t width);
-
-/**
- * @ingroup AscendCL
- * @brief Set dvpp picture description's height.
- *
- * @param picDesc [OUT]  dvpp picture description.
- * @param height [IN]    dvpp picture height.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetPicDescHeight(acldvppPicDesc *picDesc, uint32_t height);
-
-/**
- * @ingroup AscendCL
- * @brief Set dvpp picture description's widthStride.
- *
- * @par Restriction
- * Width alignment requirements:
- * @li The minimum stride is 32 and the maximum is 4096 * 4
- * (that is, an image in argb format with a width of 4096);
- * @li For 8K scaling, widthStride is required to be aligned to 2;
- * @li For non 8K scaling, the calculation formula for widthStride
- * is different for different image formats:
- *   @li yuv400sp, yuv420sp, yuv422sp, yuv444sp: input image width aligned to 16
- *   @li yuv422packed: input image width * 2 and then align to 16
- *   @li yuv444packed, rgb888: input image width alignment * 3, alignment to 16
- *   @li xrgb8888: input image width * 4, align to 16
- *   @li HFBC:input image width
- *
- * @param picDesc [OUT]      dvpp picture description.
- * @param widthStride [IN]   dvpp picture widthStride.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetPicDescWidthStride(acldvppPicDesc *picDesc, uint32_t widthStride);
-
-/**
- * @ingroup AscendCL
- * @brief Set dvpp picture description's heightStride.
- *
- * @par Restriction
- * Height alignment requirements:
- * @li The height of the input image is aligned to 2.
- * High stride minimum 6 and maximum 4096.
- *
- * @param picDesc [OUT]        dvpp picture description.
- * @param heightStride [IN]    dvpp picture heightStride.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetPicDescHeightStride(acldvppPicDesc *picDesc, uint32_t heightStride);
-
-/**
- * @ingroup AscendCL
- * @brief Set dvpp picture description's retcode.
- *
- * @param picDesc [OUT]    dvpp picture description.
- * @param retCode [IN]     dvpp picture retcode.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetPicDescRetCode(acldvppPicDesc *picDesc, uint32_t retCode);
-
-/**
- * @ingroup AscendCL
- * @brief Get picture data.
- *
- * @param picDesc [IN]    dvpp picture description.
- *
- * @retval picture data addr.
- * @retval default nullptr.
- */
-ACL_FUNC_VISIBILITY void *acldvppGetPicDescData(const acldvppPicDesc *picDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get picture data size.
- *
- * @param picDesc [IN]    dvpp picture description.
- *
- * @retval picture data size.
- * @retval default 0.
- */
-ACL_FUNC_VISIBILITY uint32_t acldvppGetPicDescSize(const acldvppPicDesc *picDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get dvpp picture desc's format.
- *
- * @param picDesc [IN]    dvpp picture description.
- *
- * @retval format
- * @retval default PIXEL_FORMAT_YUV_400.
- */
-ACL_FUNC_VISIBILITY acldvppPixelFormat acldvppGetPicDescFormat(const acldvppPicDesc *picDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get dvpp picture desc's width.
- *
- * @param picDesc [IN]    dvpp picture description.
- *
- * @retval width.
- * @retval default 0.
- */
-ACL_FUNC_VISIBILITY uint32_t acldvppGetPicDescWidth(const acldvppPicDesc *picDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get dvpp picture desc's height.
- *
- * @param picDesc [IN]    dvpp picture description.
- *
- * @retval height.
- * @retval default 0.
- */
-ACL_FUNC_VISIBILITY uint32_t acldvppGetPicDescHeight(const acldvppPicDesc *picDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get dvpp picture desc's widthStride.
- *
- * @par Restriction
- * Width alignment requirements:
- * @li The minimum stride is 32 and the maximum is 4096 * 4
- * (that is, an image in argb format with a width of 4096);
- * @li For 8K scaling, widthStride is required to be aligned to 2;
- * @li For non 8K scaling, the calculation formula for widthStride
- * is different for different image formats:
- *   @li yuv400sp, yuv420sp, yuv422sp, yuv444sp: input image width aligned to 16
- *   @li yuv422packed: input image width * 2 and then align to 16
- *   @li yuv444packed, rgb888: input image width alignment * 3, alignment to 16
- *   @li xrgb8888: input image width * 4, align to 16
- *   @li HFBC:input image width
- *
- * @param picDesc [IN]    dvpp picture description.
- *
- * @retval stride width.
- * @retval default 0.
- */
-ACL_FUNC_VISIBILITY uint32_t acldvppGetPicDescWidthStride(const acldvppPicDesc *picDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get dvpp picture desc's heightStride.
- *
- * @par Restriction
- * Height alignment requirements:
- * @li The height of the input image is aligned to 2.
- * High stride minimum 6 and maximum 4096.
- *
- * @param picDesc [IN]    dvpp picture description.
- *
- * @retval stride height.
- * @retval default 0.
- */
-ACL_FUNC_VISIBILITY uint32_t acldvppGetPicDescHeightStride(const acldvppPicDesc *picDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get dvpp picture desc's retcode.
- *
- * @param picDesc [IN]    dvpp picture description.
- *
- * @retval ret code.
- * @retval default 0.
- */
-ACL_FUNC_VISIBILITY uint32_t acldvppGetPicDescRetCode(const acldvppPicDesc *picDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Create dvpp roi config.
- *
- * @param left [IN]    the left offset, must be even
- * @param right [IN]   the right offset, must be odd
- * @param top [IN]     the top offset, must be even
- * @param bottom [IN]  the bottom offset, must be odd
- *
- * @retval null for failed.
- * @retval other success
- */
-ACL_FUNC_VISIBILITY acldvppRoiConfig *acldvppCreateRoiConfig(uint32_t left, uint32_t right, uint32_t top,
-                                                             uint32_t bottom);
-
-/**
- * @ingroup AscendCL
- * @brief Destroy dvpp roi config.
- *
- * @par Function
- * Destroys data created through the acldvppCreateRoiConfig interface
- * @param roiConfig [IN]    dvpp roi config.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateRoiConfig
- */
-ACL_FUNC_VISIBILITY aclError acldvppDestroyRoiConfig(acldvppRoiConfig *roiConfig);
-
-/**
- * @ingroup AscendCL
- * @brief Set left of RoiConfig.
- *
- * @param config [OUT]  RoiConfig
- * @param left [IN]     left offset
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetRoiConfigLeft(acldvppRoiConfig *config, uint32_t left);
-
-/**
- * @ingroup AscendCL
- * @brief Set right of RoiConfig.
- *
- * @param config [OUT]  RoiConfig
- * @param right [IN]    right offset
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetRoiConfigRight(acldvppRoiConfig *config, uint32_t right);
-
-/**
- * @ingroup AscendCL
- * @brief Set top of RoiConfig.
- *
- * @param config [OUT]  RoiConfig
- * @param top [IN]      top offset
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetRoiConfigTop(acldvppRoiConfig *config, uint32_t top);
-
-/**
- * @ingroup AscendCL
- * @brief Set bottom of RoiConfig.
- *
- * @param config [OUT]   RoiConfig
- * @param bottom [IN]    bottom offset
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetRoiConfigBottom(acldvppRoiConfig *config, uint32_t bottom);
-
-/**
- * @ingroup AscendCL
- * @brief Set RoiConfig.
- *
- * @param config [OUT]    RoiConfig
- * @param left [IN]       left offset
- * @param right [IN]      right offset
- * @param top [IN]        top offset
- * @param bottom [IN]     bottom offset
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetRoiConfig(acldvppRoiConfig *config, uint32_t left, uint32_t right, uint32_t top,
-                                                 uint32_t bottom);
-
-/**
- * @ingroup AscendCL
- * @brief Create dvpp resize config.
- * The specified scaling algorithm is not supported.
- * The default scaling algorithm is "nearest neighbor interpolation".
- *
- * @retval null for failed.
- * @retval other success.
- */
-ACL_FUNC_VISIBILITY acldvppResizeConfig *acldvppCreateResizeConfig();
-
-/**
- * @ingroup AscendCL
- * @brief Destroy dvpp resize config.
- *
- * @par Function
- * Destroys the scaling configuration data created by
- * the acldvppCreateResizeConfig interface
- *
- * @param resizeConfig [IN]    resize config.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateResizeConfig
- */
-ACL_FUNC_VISIBILITY aclError acldvppDestroyResizeConfig(acldvppResizeConfig *resizeConfig);
-
-/**
- * @ingroup AscendCL
- * @brief Create jpege config.
- *
- * @retval null for failed.
- * @retval other success.
- */
-ACL_FUNC_VISIBILITY acldvppJpegeConfig *acldvppCreateJpegeConfig();
-
-/**
- * @ingroup AscendCL
- * @brief Destroy jpege config.
- *
- * @par Function
- * Destroys the encoding configuration data created by
- * the acldvppCreateJpegeConfig interface
- * @param jpegeConfig [IN] config pointer to destroy.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateJpegeConfig
- */
-ACL_FUNC_VISIBILITY aclError acldvppDestroyJpegeConfig(acldvppJpegeConfig *jpegeConfig);
-
-/**
- * @ingroup AscendCL
- * @brief Set jpege config's level.
- *
- * @param jpegeConfig [OUT]    Call the acldvppCreateJpegeConfig
- *                             interface to create acldvppJpegeConfig data
- * @param level [IN]   Encoding quality range [0, 100],
- *                     where level 0 encoding quality is similar to level 100,
- *                     and the smaller the value in [1, 100],
- *                     the worse the quality of the output picture.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetJpegeConfigLevel(acldvppJpegeConfig *jpegeConfig, uint32_t level);
-
-/**
- * @ingroup AscendCL
- * @brief Get jpege config's level.
- *
- * @param jpegeConfig [IN]    jpege config.
- *
- * @retval compression level.
- * @retval default 0.
- */
-ACL_FUNC_VISIBILITY uint32_t acldvppGetJpegeConfigLevel(const acldvppJpegeConfig *jpegeConfig);
-
-/**
- * @ingroup AscendCL
- * @brief create vdecChannelDesc.Channel description information
- * when creating a video data processing channel.
- *
- * @retval null for failed.
- * @retval other success
- */
-ACL_FUNC_VISIBILITY aclvdecChannelDesc *aclvdecCreateChannelDesc();
-
-/**
- * @ingroup AscendCL
- * @brief destroy vdecChannelDesc.
- *
- * @par Function
- * Can only destroy aclvdecChannelDesc type created
- * through aclvdecCreateChannelDesc interface
- * @param channelDesc [IN]    channel description.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
-
- * @see aclvdecCreateChannelDesc
- */
-ACL_FUNC_VISIBILITY aclError aclvdecDestroyChannelDesc(aclvdecChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Set vdec channel description's channel id.
- *
- * @param channelDesc [OUT]  vdec channel description.
- * @param channelId [IN]     decoding channel id: 0~15.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclvdecSetChannelDescChannelId(aclvdecChannelDesc *channelDesc, uint32_t channelId);
-
-/**
- * @ingroup AscendCL
- * @brief Set vdec channel description's thread id.
- *
- * @param channelDesc [OUT]    vdec channel description.
- * @param threadId [IN]        thread id.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclvdecSetChannelDescThreadId(aclvdecChannelDesc *channelDesc, uint64_t threadId);
-
-/**
- * @ingroup AscendCL
- * @brief Set vdec channel description's callback function.
- *
- * @param channelDesc [OUT]  vdec channel description.
- * @param callback [IN]      function callback.Function prototype:
- * void (* aclvdecCallback)
- * (acldvppStreamDesc * input, acldvppPicDesc * output, void* userdata)
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclvdecCallback
- */
-ACL_FUNC_VISIBILITY aclError aclvdecSetChannelDescCallback(aclvdecChannelDesc *channelDesc, aclvdecCallback callback);
-
-/**
- * @ingroup AscendCL
- * @brief Set vdec channel description's video encoding type.
- *
- * @param channelDesc [OUT]  vdec channel description.
- * @param enType [IN]        video encoding type.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclvdecSetChannelDescEnType(aclvdecChannelDesc *channelDesc, acldvppStreamFormat enType);
-
-/**
- * @ingroup AscendCL
- * @brief Set vdec channel description's out picture format.
- *
- * @param channelDesc [OUT]     vdec channel description.
- * @param outPicFormat [IN]     out picture format (acldvppPixelFormat).
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclvdecSetChannelDescOutPicFormat(aclvdecChannelDesc *channelDesc,
-                                                               acldvppPixelFormat outPicFormat);
-
-/**
- * @ingroup AscendCL
- * @brief Set vdec channel description's out picture width.
- *
- * @param channelDesc [OUT]    vdec channel description.
- * @param outPicWidth [IN]     out picture width.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclvdecSetChannelDescOutPicWidth(aclvdecChannelDesc *channelDesc, uint32_t outPicWidth);
-
-/**
- * @ingroup AscendCL
- * @brief Set vdec channel description's out picture height.
- *
- * @param channelDesc [OUT]     vdec channel description.
- * @param outPicHeight [IN]     out picture height.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclvdecSetChannelDescOutPicHeight(aclvdecChannelDesc *channelDesc, uint32_t outPicHeight);
-
-/**
- * @ingroup AscendCL
- * @brief Set vdec channel description's reference frame num.
- *
- * @param channelDesc [OUT]    vdec channel description.
- * @param refFrameNum [IN]     reference frame num.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclvdecSetChannelDescRefFrameNum(aclvdecChannelDesc *channelDesc, uint32_t refFrameNum);
-
-/**
- * @ingroup AscendCL
- * @brief Set vdec channel description's bit depth.
- *
- * @param channelDesc [OUT]  vdec channel description.
- * @param bitDepth [IN]      bit depth.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclvdecSetChannelDescBitDepth(aclvdecChannelDesc *channelDesc, uint32_t bitDepth);
-
-/**
- * @ingroup AscendCL
- * @brief Get vdec channel description's channel id.
- *
- * @param channelDesc [IN]     vdec channel description.
- *
- * @retval decoding channel id: 0~15.
- * @retval default 0.
- */
-ACL_FUNC_VISIBILITY uint32_t aclvdecGetChannelDescChannelId(const aclvdecChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get vdec channel description's thread id.
- *
- * @param channelDesc [IN]     vdec channel description.
- *
- * @retval thread id.
- * @retval default 0.
- */
-ACL_FUNC_VISIBILITY uint64_t aclvdecGetChannelDescThreadId(const aclvdecChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get vdec channel description's callback function.
- *
- * @param channelDesc [IN]    vdec channel description.
- *
- * @retval function callback.Function prototype:
- * void (* aclvdecCallback)
- * (acldvppStreamDesc * input, acldvppPicDesc * output, void* userdata)
- * @retval default null.
- *
- * @see aclvdecCallback
- */
-ACL_FUNC_VISIBILITY aclvdecCallback aclvdecGetChannelDescCallback(const aclvdecChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get vdec channel description's video encoding type.
- *
- * @param channelDesc [IN]    vdec channel description.
- *
- * @retval video encoding type.
- * @retval default H265_MAIN_LEVEL.
- */
-ACL_FUNC_VISIBILITY acldvppStreamFormat aclvdecGetChannelDescEnType(const aclvdecChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get vdec channel description's out picture format.
- *
- * @param channelDesc [IN]    vdec channel description.
- *
- * @retval out picture format.
- * @retval default DVPP_OUTPUT_YUV420SP_UV.
- */
-ACL_FUNC_VISIBILITY acldvppPixelFormat aclvdecGetChannelDescOutPicFormat(const aclvdecChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get vdec channel description's out picture width.
- *
- * @param channelDesc [IN]    vdec channel description.
- *
- * @retval out picture width.
- * @retval default 0.
- */
-ACL_FUNC_VISIBILITY uint32_t aclvdecGetChannelDescOutPicWidth(const aclvdecChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get vdec channel description's out picture height.
- *
- * @param channelDesc [IN]    vdec channel description.
- *
- * @retval out picture height (for vdec malloc memory).
- * @retval default 0.
- */
-ACL_FUNC_VISIBILITY uint32_t aclvdecGetChannelDescOutPicHeight(const aclvdecChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get vdec channel description's bit depth.
- *
- * @param channelDesc [IN]    vdec channel description.
- *
- * @retval bit depth.
- * @retval default 0.
- */
-ACL_FUNC_VISIBILITY uint32_t aclvdecGetChannelDescBitDepth(const aclvdecChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get vdec channel description's reference frame num.
- *
- * @param channelDesc [IN]    vdec channel description.
- *
- * @retval reference frame num.
- * @retval default 0.
- */
-ACL_FUNC_VISIBILITY uint32_t aclvdecGetChannelDescRefFrameNum(const aclvdecChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief create vencChannelDesc.
- *
- * @retval null for failed, other success
- */
-ACL_FUNC_VISIBILITY aclvencChannelDesc *aclvencCreateChannelDesc();
-
-/**
- * @ingroup AscendCL
- * @brief destroy vencChannelDesc.
- *
- * @param channelDesc [IN] channel desc.
- *
- * @retval ACL_SUCCESS:success, other:failed
- */
-ACL_FUNC_VISIBILITY aclError aclvencDestroyChannelDesc(aclvencChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Set decoding thread id for venc channel desc.
- *
- * @param channelDesc [OUT] venc channel desc
- * @param threadId [IN] thread id
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError aclvencSetChannelDescThreadId(aclvencChannelDesc *channelDesc, uint64_t threadId);
-
-/**
- * @ingroup AscendCL
- * @brief Set func callback for venc channel desc.
- *
- * @param channelDesc [OUT] venc channel desc
- * @param callback [IN]     func callback
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError aclvencSetChannelDescCallback(aclvencChannelDesc *channelDesc, aclvencCallback callback);
-
-/**
- * @ingroup AscendCL
- * @brief Set video encoding type for venc channel desc.
- *
- * @param channelDesc [OUT] venc channel desc
- * @param enType [IN]       video encoding type
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError aclvencSetChannelDescEnType(aclvencChannelDesc *channelDesc, acldvppStreamFormat enType);
-
-/**
- * @ingroup AscendCL
- * @brief Set pic format for venc channel desc.
- *
- * @param channelDesc [OUT] venc channel desc
- * @param picFormat [IN]    pic format
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError aclvencSetChannelDescPicFormat(aclvencChannelDesc *channelDesc,
-                                                            acldvppPixelFormat picFormat);
-
-/**
- * @ingroup AscendCL
- * @brief Set out pic width for venc channel desc.
- *
- * @param channelDesc [OUT] venc channel desc
- * @param picWidth [IN]     pic width
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError aclvencSetChannelDescPicWidth(aclvencChannelDesc *channelDesc, uint32_t picWidth);
-
-/**
- * @ingroup AscendCL
- * @brief Set pic height for venc channel desc.
- *
- * @param channelDesc [OUT] venc channel desc
- * @param picHeight [IN]    pic height
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError aclvencSetChannelDescPicHeight(aclvencChannelDesc *channelDesc, uint32_t picHeight);
-
-/**
- * @ingroup AscendCL
- * @brief Set key frame interval for venc channel desc.
- *
- * @param channelDesc [OUT]     venc channel desc
- * @param keyFrameInterval [IN] Interval of key frame
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError aclvencSetChannelDescKeyFrameInterval(aclvencChannelDesc *channelDesc,
-                                                                   uint32_t keyFrameInterval);
-
-/**
- * @ingroup AscendCL
- * @brief Set output buffer address for venc channel desc.
- *
- * @param channelDesc [OUT] venc channel desc
- * @param bufAddr [IN]      output buffer address
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError aclvencSetChannelDescBufAddr(aclvencChannelDesc *channelDesc, void *bufAddr);
-
-/**
- * @ingroup AscendCL
- * @brief Set output buffer size for venc channel desc.
- *
- * @param channelDesc [OUT] venc channel desc
- * @param bufSize [IN]      output buffer size
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError aclvencSetChannelDescBufSize(aclvencChannelDesc *channelDesc, uint32_t bufSize);
-
-/**
- * @ingroup AscendCL
- * @brief Set rc model for venc channel desc.
- *
- * @param channelDesc [OUT] venc channel desc
- * @param rcMode [IN]       venc rc mode(VBR=1, CBR=2)
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError aclvencSetChannelDescRcMode(aclvencChannelDesc *channelDesc, uint32_t rcMode);
-
-/**
- * @ingroup AscendCL
- * @brief Set source rate for venc channel desc.
- *
- * @param channelDesc [OUT] venc channel desc
- * @param srcRate [IN] source rate
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError aclvencSetChannelDescSrcRate(aclvencChannelDesc *channelDesc, uint32_t srcRate);
-
-/**
- * @ingroup AscendCL
- * @brief Set max bit rate for venc channel desc.
- *
- * @param channelDesc [OUT] venc channel desc
- * @param maxBitRate [IN]   max bit rate
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError aclvencSetChannelDescMaxBitRate(aclvencChannelDesc *channelDesc, uint32_t maxBitRate);
-
-/**
- * @ingroup AscendCL
- * @brief Get output buffer address for venc channel desc.
- *
- * @param channelDesc[IN] venc channel desc
- *
- * @retval output buffer address
- */
-ACL_FUNC_VISIBILITY void *aclvencGetChannelDescBufAddr(const aclvencChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get output buffer size for venc channel desc.
- *
- * @param channelDesc [IN] venc channel desc
- *
- * @retval output buffer size
- */
-ACL_FUNC_VISIBILITY uint32_t aclvencGetChannelDescBufSize(const aclvencChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get decoding channel id for venc channel desc.
- *
- * @param channelDesc [IN] venc channel desc
- *
- * @retval decoding channel id: 0~15, default 0
- */
-ACL_FUNC_VISIBILITY uint32_t aclvencGetChannelDescChannelId(const aclvencChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get decoding thread id for venc channel desc.
- *
- * @param channelDesc [IN] venc channel desc
- *
- * @retval thread id, default 0
- */
-ACL_FUNC_VISIBILITY uint64_t aclvencGetChannelDescThreadId(const aclvencChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get func callback for venc channel desc.
- *
- * @param channelDesc [IN] venc channel desc
- *
- * @retval func callback, default null
- */
-ACL_FUNC_VISIBILITY aclvencCallback aclvencGetChannelDescCallback(const aclvencChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get video encoding type for venc channel desc.
- *
- * @param channelDesc [IN] venc channel desc
- *
- * @retval video encoding type, default H265_MAIN_LEVEL
- */
-ACL_FUNC_VISIBILITY acldvppStreamFormat aclvencGetChannelDescEnType(const aclvencChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get pic format for venc channel desc.
- *
- * @param channelDesc [IN] venc channel desc
- *
- * @retval pic format
- */
-ACL_FUNC_VISIBILITY acldvppPixelFormat aclvencGetChannelDescPicFormat(const aclvencChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get pic width for venc channel desc.
- *
- * @param channelDesc [IN] venc channel desc
- *
- * @retval pic width, default 0
- */
-ACL_FUNC_VISIBILITY uint32_t aclvencGetChannelDescPicWidth(const aclvencChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get pic height for venc channel desc.
- *
- * @param channelDesc [IN] venc channel desc
- *
- * @retval pic height, default 0
- */
-ACL_FUNC_VISIBILITY uint32_t aclvencGetChannelDescPicHeight(const aclvencChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get interval of key frame for venc channel desc.
- *
- * @param channelDesc [IN] venc channel desc
- *
- * @retval interval of key frame, default 0
- */
-ACL_FUNC_VISIBILITY uint32_t aclvencGetChannelDescKeyFrameInterval(const aclvencChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- *
- * @brief Get rc mode for venc channel desc.
- *
- * @param channelDesc [IN] venc channel desc
- *
- * @retval rc mode, default 0
- */
-ACL_FUNC_VISIBILITY uint32_t aclvencGetChannelDescRcMode(const aclvencChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- *
- * @brief Get source rate for venc channel desc.
- *
- * @param channelDesc [IN] venc channel desc
- *
- * @retval source rate, default 0
- */
-ACL_FUNC_VISIBILITY uint32_t aclvencGetChannelDescSrcRate(const aclvencChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- *
- * @brief Get max bit rate for venc channel desc.
- *
- * @param channelDesc [IN] venc channel desc
- *
- * @retval max bit rate, default 0
- */
-ACL_FUNC_VISIBILITY uint32_t aclvencGetChannelDescMaxBitRate(const aclvencChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief get forced restart of I-frame interval from config
- *
- * @param config [IN] venc frame config
- *
- * @retval 0: Not forced; 1: Forced restart of I-frame -1: error
- */
-ACL_FUNC_VISIBILITY uint8_t aclvencGetFrameConfigForceIFrame(const aclvencFrameConfig *config);
-
-/**
- * @ingroup AscendCL
- * @brief get forced restart of I-frame interval from config
- *
- * @param config [IN] venc frame config
- *
- * @retval Whether it is the end frame: 0: no; 1: end frame
- */
-ACL_FUNC_VISIBILITY uint8_t aclvencGetFrameConfigEos(const aclvencFrameConfig *config);
-
-/**
- * @ingroup AscendCL
- * @brief set single frame encoding configuration parameters
- *
- * @param config [OUT]    venc frame config
- * @param forceFrame [IN] forced restart of I-frame interval: 0: Not forced; 1: Forced restart of I-frame
- *
- * @retval ACL_SUCCESS for ok, others for fail
- */
-ACL_FUNC_VISIBILITY aclError aclvencSetFrameConfigForceIFrame(aclvencFrameConfig *config, uint8_t forceIFrame);
-
-/**
- * @ingroup AscendCL
- * @brief set single frame encoding configuration parameters
- *
- * @param config [OUT] venc frame config
- * @param eos [IN]     Whether it is the end frame: 0: no; 1: end frame
- *
- * @retval ACL_SUCCESS for ok, others for fail
- */
-ACL_FUNC_VISIBILITY aclError aclvencSetFrameConfigEos(aclvencFrameConfig *config, uint8_t eos);
-
-/**
- * @ingroup AscendCL
- * @brief dvpp venc destroy frame config
- *
- * @param config [IN] venc frame config
- *
- * @retval ACL_SUCCESS for ok, others for fail
- */
-ACL_FUNC_VISIBILITY aclError aclvencDestroyFrameConfig(aclvencFrameConfig *config);
-
-/**
- * @ingroup AscendCL
- * @brief Create dvpp venc frame config.
- *
- * @retval null for failed, other aclvencFrameConfig ptr
- */
-ACL_FUNC_VISIBILITY aclvencFrameConfig *aclvencCreateFrameConfig();
-
-/**
- * @ingroup AscendCL
- * @brief Create dvpp venc channel.
- *
- * @param channelDesc [IN|OUT] venc channel desc
- *
- * @retval ACL_SUCCESS for ok, others for fail
- */
-ACL_FUNC_VISIBILITY aclError aclvencCreateChannel(aclvencChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Destroy dvpp venc channel.
- *
- * @param channelDesc [IN] venc channel desc
- *
- * @retval ACL_SUCCESS for ok, others for fail
- */
-ACL_FUNC_VISIBILITY aclError aclvencDestroyChannel(aclvencChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief dvpp venc launch send frame task.
- *
- * @param channelDesc [IN] venc channel desc
- * @param input [IN]       input picture desc
- * @param reserve [IN]     reserve parameter
- * @param config [IN]      dvpp frame config
- * @param userdata [IN]    user callback function
- *
- * @retval ACL_SUCCESS for ok, others for fail
- */
-ACL_FUNC_VISIBILITY aclError aclvencSendFrame(aclvencChannelDesc *channelDesc, acldvppPicDesc *input, void *reserve,
-                                              aclvencFrameConfig *config, void *userdata);
-
-/**
- * @ingroup AscendCL
- * @brief Create dvpp stream description.
- *
- * @retval null for failed.
- * @retval other success.
- */
-ACL_FUNC_VISIBILITY acldvppStreamDesc *acldvppCreateStreamDesc();
-
-/**
- * @ingroup AscendCL
- * @brief Destroy dvpp stream description.
- *
- * @par Function
- * Can only destroy acldvppStreamDesc type created through
- * acldvppCreateStreamDesc interface.
- *
- * @param streamDesc [IN]     dvpp stream description.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateStreamDesc
- */
-ACL_FUNC_VISIBILITY aclError acldvppDestroyStreamDesc(acldvppStreamDesc *streamDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Set stream description's data addr.
- *
- * @param streamDesc [OUT]    dvpp stream description.
- * @param dataDev [IN]        data addr.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetStreamDescData(acldvppStreamDesc *streamDesc, void *dataDev);
-
-/**
- * @ingroup AscendCL
- * @brief Set stream description's data size.
- *
- * @param streamDesc [OUT]     dvpp stream description.
- * @param size [IN]            data size.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetStreamDescSize(acldvppStreamDesc *streamDesc, uint32_t size);
-
-/**
- * @ingroup AscendCL
- * @brief Set stream description's format.
- *
- * @param streamDesc [OUT]    dvpp stream description.
- * @param format [IN]         stream format.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetStreamDescFormat(acldvppStreamDesc *streamDesc, acldvppStreamFormat format);
-
-/**
- * @ingroup AscendCL
- * @brief Set stream description's timestamp.
- *
- * @param streamDesc [OUT]  dvpp stream description.
- * @param timestamp [IN]    current timestamp.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetStreamDescTimestamp(acldvppStreamDesc *streamDesc, uint64_t timestamp);
-
-/**
- * @ingroup AscendCL
- * @brief Set stream description's ret code.
- *
- * @param streamDesc [OUT]    dvpp stream description.
- * @param retCode [IN]        result code.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetStreamDescRetCode(acldvppStreamDesc *streamDesc, uint32_t retCode);
-
-/**
- * @ingroup AscendCL
- * @brief Set stream description's eos.
- *
- * @param streamDesc [OUT]    dvpp stream description.
- * @param eos [IN]            end flag of sequence.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetStreamDescEos(acldvppStreamDesc *streamDesc, uint8_t eos);
-
-/**
- * @ingroup AscendCL
- * @brief Get stream description's data addr.
- *
- * @param streamDesc [IN]     dvpp stream description.
- *
- * @retval data addr.
- * @retval deault nullptr.
- */
-ACL_FUNC_VISIBILITY void *acldvppGetStreamDescData(const acldvppStreamDesc *streamDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get stream description's data size.
- *
- * @param streamDesc [IN]    dvpp stream description.
- *
- * @retval data size.
- * @retval default 0.
- */
-ACL_FUNC_VISIBILITY uint32_t acldvppGetStreamDescSize(const acldvppStreamDesc *streamDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get stream description's format.
- *
- * @param streamDesc [IN]    dvpp stream description.
- *
- * @retval stream format.
- * @retval default ACL_DVPP_STREAM_H264.
- */
-ACL_FUNC_VISIBILITY acldvppStreamFormat acldvppGetStreamDescFormat(const acldvppStreamDesc *streamDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get stream description's timestamp.
- *
- * @param streamDesc [IN]    dvpp stream description.
- *
- * @retval current timestamp.
- * @retval default 0.
- */
-ACL_FUNC_VISIBILITY uint64_t acldvppGetStreamDescTimestamp(const acldvppStreamDesc *streamDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get stream description's retCode.
- *
- * @param streamDesc [IN]    dvpp stream description.
- *
- * @retval result code.
- * @retval default 0.
- */
-ACL_FUNC_VISIBILITY uint32_t acldvppGetStreamDescRetCode(const acldvppStreamDesc *streamDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Get stream description's eos.
- *
- * @param streamDesc [IN]    dvpp stream description.
- *
- * @retval end flag of sequence.
- * @retval default 0(false).
- */
-ACL_FUNC_VISIBILITY uint8_t acldvppGetStreamDescEos(const acldvppStreamDesc *streamDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Create vdec frame config.
- *
- * @retval null for failed.
- * @retval other success.
- */
-ACL_FUNC_VISIBILITY aclvdecFrameConfig *aclvdecCreateFrameConfig();
-
-/**
- * @ingroup AscendCL
- * @brief Destroy vdec frame config.
- *
- * @par Function
- * Can only destroy aclvdecFrameConfig type created through
- *  aclvdecCreateFrameConfig interface
- *
- * @param vdecFrameConfig [IN]     vdec frame config.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclvdecCreateFrameConfig
- */
-ACL_FUNC_VISIBILITY aclError aclvdecDestroyFrameConfig(aclvdecFrameConfig *vdecFrameConfig);
-
-/**
- * @ingroup AscendCL
- * @brief Get image width and height of jpeg.
- *
- * @param data [IN]          image data in host memory
- * @param size [IN]          the size of image data
- * @param width [OUT]        the width of image from image header
- * @param height [OUT]       the height of image from image header
- * @param components [OUT]   the components of image from image header
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppJpegGetImageInfo(const void *data, uint32_t size, uint32_t *width, uint32_t *height,
-                                                     int32_t *components);
-
-/**
- * @ingroup AscendCL
- * @brief Predict encode size of jpeg image.
- *
- * @param inputDesc [IN]     dvpp image desc
- * @param config [IN]        jpeg encode config
- * @param size [OUT]         the size predicted of image
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppJpegPredictEncSize(const acldvppPicDesc *inputDesc,
-                                                       const acldvppJpegeConfig *config, uint32_t *size);
-
-/**
- * @ingroup AscendCL
- * @brief Predict decode size of jpeg image.
- *
- * @param data [IN]                 origin image data in host memory
- * @param dataSize [IN]             the size of origin image data
- * @param outputPixelFormat [IN]    the pixel format jpeg decode
- * @param decSize [OUT]             the size predicted for decode image
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppJpegPredictDecSize(const void *data, uint32_t dataSize,
-                                                       acldvppPixelFormat outputPixelFormat, uint32_t *decSize);
-
-/**
- * @ingroup AscendCL
- * @brief Get image width and height of png.
- *
- * @param data [IN]          image data in host memory
- * @param size [IN]          the size of image data
- * @param width [OUT]        the width of image from image header
- * @param height [OUT]       the height of image from image header
- * @param components [OUT]   the components of image from image header
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppPngGetImageInfo(const void *data, uint32_t dataSize, uint32_t *width,
-                                                    uint32_t *height, int32_t *components);
-
-/**
- * @ingroup AscendCL
- * @brief Predict decode size of png image.
- *
- * @param data [IN]                 origin image data in host memory
- * @param dataSize [IN]             the size of origin image data
- * @param outputPixelFormat [IN]    the pixel format jpeg decode
- * @param decSize [OUT]             the size predicted for decode image
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppPngPredictDecSize(const void *data, uint32_t dataSize,
-                                                      acldvppPixelFormat outputPixelFormat, uint32_t *decSize);
-
-/**
- * @ingroup AscendCL
- * @brief Create dvpp channel, the same channel can be reused
- * and is no longer available after destruction.
- *
- * @param channelDesc [IN|OUT]    the channel destruction
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateChannelDesc
- */
-ACL_FUNC_VISIBILITY aclError acldvppCreateChannel(acldvppChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Destroy dvpp channel.
- *
- * @par Restriction
- * Can only destroy channel created through the acldvppCreateChannel interface
- *
- * @param channelDesc [IN]   the channel destruction
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateChannel
- */
-ACL_FUNC_VISIBILITY aclError acldvppDestroyChannel(acldvppChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief dvpp vpc resize.
- *
- * @par Restriction
- * Width alignment requirements:
- * @li The minimum stride is 32 and the maximum is 4096 * 4
- * (that is, an image in argb format with a width of 4096);
- * @li For 8K scaling, widthStride is required to be aligned to 2;
- * @li For non 8K scaling, the calculation formula for widthStride
- * is different for different image formats:
- *   @li yuv400sp, yuv420sp, yuv422sp, yuv444sp: input image width aligned to 16
- *   @li yuv422packed: input image width * 2 and then align to 16
- *   @li yuv444packed, rgb888: input image width alignment * 3, alignment to 16
- *   @li xrgb8888: input image width * 4, align to 16
- *   @li HFBC:input image width
- * Height alignment requirements:
- * @li The height of the input image is aligned to 2.
- * High stride minimum 6 and maximum 4096.
- *
- * @param channelDesc [IN]  the channel destruction
- * @param inputDesc [IN]    resize input picture destruction
- * @param outputDesc [IN|OUT]  resize output picture destruction
- * @param resizeConfig [IN] resize config
- * @param stream [IN]       resize task stream
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateChannel | acldvppCreatePicDesc
- * | acldvppCreateResizeConfig
- */
-ACL_FUNC_VISIBILITY aclError acldvppVpcResizeAsync(acldvppChannelDesc *channelDesc, acldvppPicDesc *inputDesc,
-                                                   acldvppPicDesc *outputDesc, acldvppResizeConfig *resizeConfig,
-                                                   aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief dvpp vpc crop.
- *
- * @par Function
- * crop the input picture according to the specified area,
- * and then store the  picture in the output memory as the output picture
- *
- * @par Restriction
- * Width alignment requirements:
- * @li The minimum stride is 32 and the maximum is 4096 * 4
- * (that is, an image in argb format with a width of 4096);
- * @li For 8K scaling, widthStride is required to be aligned to 2;
- * @li For non 8K scaling, the calculation formula for widthStride
- * is different for different image formats:
- *   @li yuv400sp, yuv420sp, yuv422sp, yuv444sp: input image width aligned to 16
- *   @li yuv422packed: input image width * 2 and then align to 16
- *   @li yuv444packed, rgb888: input image width alignment * 3, alignment to 16
- *   @li xrgb8888: input image width * 4, align to 16
- *   @li HFBC:input image width
- * Height alignment requirements:
- * @li The height of the input image is aligned to 2.
- * High stride minimum 6 and maximum 4096.
- *
- * @param channelDesc [IN]  the channel destruction
- * @param inputDesc [IN]    crop input picture destruction
- * @param outputDesc [IN|OUT]  crop output picture destruction
- * @param cropArea [IN]     crop area config
- * @param stream [IN]       crop task stream
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppVpcCropAsync(acldvppChannelDesc *channelDesc, acldvppPicDesc *inputDesc,
-                                                 acldvppPicDesc *outputDesc, acldvppRoiConfig *cropArea,
-                                                 aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief dvpp vpc batch crop.
- *
- * @par Function
- * crop the input batch picture according to the specified area
- * as the output batch pictures
- *
- * @param channelDesc [IN]         the channel destruction
- * @param srcBatchPicDescs [IN]    crop input batch picture destruction
- * @param roiNums [IN]    roi config numbers
- * @param size [IN]       roiNum size
- * @param dstBatchPicDescs [IN|OUT]    crop output batch picture destruction
- * @param cropAreas [IN]    crop area configs
- * @param stream [IN]       crop batch task stream
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateChannel | acldvppCreateBatchPicDesc | acldvppCreateRoiConfig
- */
-ACL_FUNC_VISIBILITY aclError acldvppVpcBatchCropAsync(acldvppChannelDesc *channelDesc,
-                                                      acldvppBatchPicDesc *srcBatchPicDescs, uint32_t *roiNums,
-                                                      uint32_t size, acldvppBatchPicDesc *dstBatchPicDescs,
-                                                      acldvppRoiConfig *cropAreas[], aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief dvpp vpc crop and paste.
- *
- * @par Function
- * crop the input picture according to the specified area,
- * and paste the picture to the specified position of the target picture
- * as the output picture
- *
- * @param channelDesc [IN]   thechannel destruction
- * @param inputDesc [IN]     crop and paste input picture destruction
- * @param outputDesc [IN|OUT]   crop and paste output picture destruction
- * @param cropArea [IN]      crop area config
- * @param pasteArea [IN]     paste area config
- * @param stream [IN]        crop and paste task stream
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateChannel | acldvppCreatePicDesc | acldvppCreateRoiConfig
- */
-ACL_FUNC_VISIBILITY aclError acldvppVpcCropAndPasteAsync(acldvppChannelDesc *channelDesc, acldvppPicDesc *inputDesc,
-                                                         acldvppPicDesc *outputDesc, acldvppRoiConfig *cropArea,
-                                                         acldvppRoiConfig *pasteArea, aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief dvpp vpc batch crop and paste.
- *
- * @par Function
- * crop the input batch picture according to the specified area,
- * and paste the pictures to the specified position of the target pictures
- * as the output batch pictures
- *
- * @param channelDesc [IN]       the channel destruction
- * @param srcBatchPicDescs [IN]  crop input batch picture destruction
- * @param roiNums [IN]     roi config numbers
- * @param size [IN]        roiNum size
- * @param dstBatchPicDescs [IN|OUT]    crop output batch picture destruction
- * @param cropAreas [IN]   crop area configs
- * @param pasteAreas [IN]  paste area configs
- * @param stream [IN]      crop batch task stream
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateChannel | acldvppCreateBatchPicDesc | acldvppCreateRoiConfig
- */
-ACL_FUNC_VISIBILITY aclError acldvppVpcBatchCropAndPasteAsync(acldvppChannelDesc *channelDesc,
-                                                              acldvppBatchPicDesc *srcBatchPicDescs, uint32_t *roiNums,
-                                                              uint32_t size, acldvppBatchPicDesc *dstBatchPicDescs,
-                                                              acldvppRoiConfig *cropAreas[],
-                                                              acldvppRoiConfig *pasteAreas[], aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief dvpp vpc jpeg decode.
- *
- * @par Function
- * For different source picture formats, after decoding,
- * output pictures in the following format:
- * @li jpeg(444) -> YUV444SP:V is front U is back,
- * YUV420 SP V is front U is back, YUV420SP U is front V is back;
- * @li jpeg(422) -> YUV422SP:V is in front U is behind,
- * YUV420SP V is in front U is behind, YUV420SP U is in front V is behind;
- * @li jpeg(420) -> YUV420SP:
- * V is front U is back, YUV420SP U is front V is back;
- * @li jpeg(400) -> YUV420SP:UV data is filled with 0 x 80.
- *
- * @param channelDesc [IN]  the channel destruction
- * @param data [IN]         decode input picture destruction's data
- * @param size [IN]         decode input picture destruction's size
- * @param outputDesc [IN|OUT]  decode output picture destruction
- * @param stream [IN]       decode task stream
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateChannel | acldvppCreatePicDesc
- */
-ACL_FUNC_VISIBILITY aclError acldvppJpegDecodeAsync(acldvppChannelDesc *channelDesc, const void *data, uint32_t size,
-                                                    acldvppPicDesc *outputDesc, aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief dvpp vpc jpeg encode.
- *
- * @param channelDesc [IN]  the channel destruction
- * @param inputDesc [IN]    encode input picture destruction
- * @param data [OUT]        encode output picture destruction's data
- * @param size [IN|OUT]     encode output picture destruction's size
- * @param config [IN]       jpeg encode config
- * @param stream [IN]       encode task stream
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateChannel | acldvppCreateJpegeConfig
- */
-ACL_FUNC_VISIBILITY aclError acldvppJpegEncodeAsync(acldvppChannelDesc *channelDesc, acldvppPicDesc *inputDesc,
-                                                    const void *data, uint32_t *size, acldvppJpegeConfig *config,
-                                                    aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief dvpp vpc png decode.
- *
- * @param channelDesc [IN]    the channel destruction
- * @param data [IN]           decode input picture destruction's data
- * @param size [IN]           decode input picture destruction's size
- * @param outputDesc [IN|OUT]    decode output picture destruction
- * @param stream [IN]         decode task stream
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateChannel | acldvppCreatePicDesc
- */
-ACL_FUNC_VISIBILITY aclError acldvppPngDecodeAsync(acldvppChannelDesc *channelDesc, const void *data, uint32_t size,
-                                                   acldvppPicDesc *outputDesc, aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief Create vdec channel.
- *
- * @par Function
- * Create a channel for video data processing,
- * the same channel can be reused,
- * and is no longer available after destruction
- *
- * @param channelDesc [IN|OUT]    the channel destruction
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclvdecCreateChannelDesc
- */
-ACL_FUNC_VISIBILITY aclError aclvdecCreateChannel(aclvdecChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Destroy vdec channel.
- *
- * @par Function
- * Can only destroy channels created by the aclvdecCreateChannel interface
- *
- * @param channelDesc [IN]    the channel destruction
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclvdecCreateChannel
- */
-ACL_FUNC_VISIBILITY aclError aclvdecDestroyChannel(aclvdecChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief dvpp vdec send frame.
- *
- * @par Function
- * Pass the input memory to be decoded
- * and the decoded output memory to the decoder for decoding
- *
- * @param channelDesc [IN] vdec channel destruction
- * @param input [IN]       input stream destruction
- * @param output [IN|OUT]  output picture destruction
- * @param config [IN]      vdec frame config
- * @param userData [IN]    user data for callback function
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclvdecCreateChannel | acldvppCreateStreamDesc | acldvppCreatePicDesc
- */
-ACL_FUNC_VISIBILITY aclError aclvdecSendFrame(aclvdecChannelDesc *channelDesc, acldvppStreamDesc *input,
-                                              acldvppPicDesc *output, aclvdecFrameConfig *config, void *userData);
-
-/**
- * @ingroup AscendCL
- * @brief dvpp vdec send skipped frame.
- *
- * @par Function
- * Pass video frame to decoder
- *
- * @param channelDesc [IN] vdec channel destruction
- * @param input [IN]       input stream destruction
- * @param config [IN]      vdec frame config
- * @param userData [IN]    user data for callback function
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see aclvdecCreateChannel | acldvppCreateStreamDesc | acldvppCreatePicDesc | aclvdecSendFrame
- */
-ACL_FUNC_VISIBILITY aclError aclvdecSendSkippedFrame(aclvdecChannelDesc *channelDesc, acldvppStreamDesc *input,
-                                                     aclvdecFrameConfig *config, void *userData);
-
-/**
- * @ingroup AscendCL
- * @brief dvpp vpc convert color.
- *
- * @par Restriction
- * @li outputDesc:Width height stride, No changes are allowed. Just configure 0
- * @par Function
- * Convert color gamut
- *
- * @param channelDesc [IN] the channel destruction
- * @param inputDesc [IN]   convert color input picture destruction
- * @param outputDesc [IN|OUT] convert color output picture destruction
- * @param stream [IN]      convert color task stream
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateChannel | acldvppCreatePicDesc
- */
-ACL_FUNC_VISIBILITY aclError acldvppVpcConvertColorAsync(acldvppChannelDesc *channelDesc, acldvppPicDesc *inputDesc,
-                                                         acldvppPicDesc *outputDesc, aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief dvpp vpc pyramid down.
- *
- * @par Restriction
- * @li outputDesc:format only supported YUV400
- * @par Function
- * Image pyramid down
- *
- * @param channelDesc [IN] the channel destruction
- * @param inputDesc [IN]   pyr down input picture destruction
- * @param outputDesc [IN|OUT] pyr down output picture destruction
- * @param reserve [IN]     reserved param , must be nullptr
- * @param stream [IN]      pyr down task stream
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateChannel | acldvppCreatePicDesc
- */
-ACL_FUNC_VISIBILITY aclError acldvppVpcPyrDownAsync(acldvppChannelDesc *channelDesc, acldvppPicDesc *inputDesc,
-                                                    acldvppPicDesc *outputDesc, void *reserve, aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief Set dvpp channel mode.
- *
- * @param channelDesc [OUT] the channel destruction
- * @param mode [IN]         channel mode
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetChannelDescMode(acldvppChannelDesc *channelDesc, uint32_t mode);
-
-/**
- * @ingroup AscendCL
- * @brief Set resize config interpolation.
- *
- * @param resizeConfig [OUT] the resize config
- * @param interpolation [IN] interpolation
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetResizeConfigInterpolation(acldvppResizeConfig *resizeConfig,
-                                                                 uint32_t interpolation);
-
-/**
- * @ingroup AscendCL
- * @brief Get resize config interpolation.
- *
- * @param resizeConfig [IN] the resize config
- *
- * @retval Interpolation of resize config.
- */
-ACL_FUNC_VISIBILITY uint32_t acldvppGetResizeConfigInterpolation(const acldvppResizeConfig *resizeConfig);
-
-/**
- * @ingroup AscendCL
- * @brief Set vdec channel out mode.
- *
- * @param channelDesc [OUT] the channel destruction
- * @param outMode [IN] channel out mode
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError aclvdecSetChannelDescOutMode(aclvdecChannelDesc *channelDesc, uint32_t outMode);
-
-/**
- * @ingroup AscendCL
- * @brief Get vdec channel out mode.
- *
- * @param channelDesc [IN] the channel destruction
- *
- * @retval Out mode of channel destruction
- * @retval default 0
- */
-ACL_FUNC_VISIBILITY uint32_t aclvdecGetChannelDescOutMode(const aclvdecChannelDesc *channelDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Create dvpp batch picture description.
- *
- * @param batchSize [IN]    batch size
- *
- * @retval null for failed.
- * @retval OtherValues success.
- */
-ACL_FUNC_VISIBILITY acldvppBatchPicDesc *acldvppCreateBatchPicDesc(uint32_t batchSize);
-
-/**
- * @ingroup AscendCL
- * @brief Get dvpp picture description.
- *
- * @param batchPicDesc [IN] dvpp batch picture description.
- * @param index [IN]        index of batch
- *
- * @retval null for failed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateBatchPicDesc
- */
-ACL_FUNC_VISIBILITY acldvppPicDesc *acldvppGetPicDesc(acldvppBatchPicDesc *batchPicDesc, uint32_t index);
-
-/**
- * @ingroup AscendCL
- * @brief Destroy dvpp batch picture description.
- *
- * @par Function
- * Can only destroy batch picture description information created
- * through acldvppCreateBatchPicDesc interface.
- *
- * @param batchPicDesc [IN]     dvpp batch picture description.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateBatchPicDesc
- */
-ACL_FUNC_VISIBILITY aclError acldvppDestroyBatchPicDesc(acldvppBatchPicDesc *batchPicDesc);
-
-/**
- * @ingroup AscendCL
- * @brief Create dvpp lut map.
- *
- * @retval null for failed.
- * @retval OtherValues success.
- */
-ACL_FUNC_VISIBILITY acldvppLutMap *acldvppCreateLutMap();
-
-/**
- * @ingroup AscendCL
- * @brief Destroy lut map.
- *
- * @param lutMap [IN]    lut map
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppDestroyLutMap(acldvppLutMap *lutMap);
-
-/**
- * @ingroup AscendCL
- * @brief Get lut map dims.
- *
- * @param lutMap [IN]    lut map
- *
- * @retval 0 for failed.
- * @retval OtherValues success.
- */
-ACL_FUNC_VISIBILITY uint32_t acldvppGetLutMapDims(const acldvppLutMap *lutMap);
-
-/**
- * @ingroup AscendCL
- * @brief Get lut map data.
- *
- * @param lutMap [IN]   lut map
- * @param dim [IN]      input dim of map
- * @param data [OUT]    the dim of lut map's data
- * @param len [OUT]     the dim of lut map's length
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppGetLutMapData(const acldvppLutMap *lutMap, uint32_t dim, uint8_t **data,
-                                                  uint32_t *len);
-/**
- * @ingroup AscendCL
- * @brief Vpc equalize hist.
- *
- * @param channelDesc [IN] channel desc
- * @param inputDesc [IN]   input desc
- * @param outputDesc [IN|OUT] output desc
- * @param lutMap [IN]      lut map param
- * @param stream [IN]      runtime stream
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateChannel|acldvppCreatePicDesc|acldvppCreateLutMap
- */
-ACL_FUNC_VISIBILITY aclError acldvppVpcEqualizeHistAsync(const acldvppChannelDesc *channelDesc,
-                                                         const acldvppPicDesc *inputDesc, acldvppPicDesc *outputDesc,
-                                                         const acldvppLutMap *lutMap, aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief Create dvpp border config.
- *
- * @retval null for failed.
- * @retval OtherValues success.
- */
-ACL_FUNC_VISIBILITY acldvppBorderConfig *acldvppCreateBorderConfig();
-
-/**
- * @ingroup AscendCL
- * @brief Set value of border config.
- *
- * @param borderConfig [OUT] border config
- * @param index [IN]         index of value array
- * @param value [IN]         value
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetBorderConfigValue(acldvppBorderConfig *borderConfig, uint32_t index,
-                                                         double value);
-
-/**
- * @ingroup AscendCL
- * @brief Set border type of border config.
- *
- * @param borderConfig [OUT] border config
- * @param borderType [IN]    border type
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetBorderConfigBorderType(acldvppBorderConfig *borderConfig,
-                                                              acldvppBorderType borderType);
-
-/**
- * @ingroup AscendCL
- * @brief Set top of border config.
- *
- * @param borderConfig [OUT] border config
- * @param top [IN]           top of border
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetBorderConfigTop(acldvppBorderConfig *borderConfig, uint32_t top);
-
-/**
- * @ingroup AscendCL
- * @brief Set bottom of border config.
- *
- * @param borderConfig [OUT] border config
- * @param bottom [IN]        bottom of border
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetBorderConfigBottom(acldvppBorderConfig *borderConfig, uint32_t bottom);
-
-/**
- * @ingroup AscendCL
- * @brief Set left of border config.
- *
- * @param borderConfig [OUT] border config
- * @param left [IN]          left of border
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetBorderConfigLeft(acldvppBorderConfig *borderConfig, uint32_t left);
-
-/**
- * @ingroup AscendCL
- * @brief Set right of border config.
- *
- * @param borderConfig [OUT] border config
- * @param right [IN]         right of border
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppSetBorderConfigRight(acldvppBorderConfig *borderConfig, uint32_t right);
-
-/**
- * @ingroup AscendCL
- * @brief Get value of border config.
- *
- * @param borderConfig [IN] border config
- * @param index[IN] index of value array
- *
- * @retval invalid value is < 0, normal Value is >= 0
- */
-ACL_FUNC_VISIBILITY double acldvppGetBorderConfigValue(const acldvppBorderConfig *borderConfig, uint32_t index);
-
-/**
- * @ingroup AscendCL
- * @brief Get border type of border config.
- *
- * @param borderConfig [IN] border config
- * @retval border type of border config
- */
-ACL_FUNC_VISIBILITY acldvppBorderType acldvppGetBorderConfigBorderType(const acldvppBorderConfig *borderConfig);
-
-/**
- * @ingroup AscendCL
- * @brief Get right of border config.
- *
- * @param borderConfig [IN] border config
- *
- * @retval default 0, top value of border config
- */
-ACL_FUNC_VISIBILITY uint32_t acldvppGetBorderConfigTop(const acldvppBorderConfig *borderConfig);
-
-/**
- * @ingroup AscendCL
- * @brief Get Bottom of border config.
- *
- * @param borderConfig [IN] border config
- *
- * @retval default 0, top value of border config
- */
-ACL_FUNC_VISIBILITY uint32_t acldvppGetBorderConfigBottom(const acldvppBorderConfig *borderConfig);
-
-/**
- * @ingroup AscendCL
- * @brief Get left of border config.
- *
- * @param borderConfig [IN] border config
- *
- * @retval default 0, top value of border config
- */
-ACL_FUNC_VISIBILITY uint32_t acldvppGetBorderConfigLeft(const acldvppBorderConfig *borderConfig);
-
-/**
- * @ingroup AscendCL
- * @brief Get right of border config.
- *
- * @param borderConfig [IN] border config
- *
- * @retval default 0, right value of border config
- */
-ACL_FUNC_VISIBILITY uint32_t acldvppGetBorderConfigRight(const acldvppBorderConfig *borderConfig);
-
-/**
- * @ingroup AscendCL
- * @brief Destroy border config.
- *
- * @param borderConfig [IN] border config
- *
- * @retval ACL_SUCCESS for success, other for failure
- */
-ACL_FUNC_VISIBILITY aclError acldvppDestroyBorderConfig(acldvppBorderConfig *borderConfig);
-
-/**
- * @ingroup AscendCL
- * @brief Vpc make border.
- *
- * @param channelDesc [IN]  channel desc
- * @param inputDesc [IN]    input desc
- * @param outputDesc [IN|OUT]  output desc
- * @param borderConfig [IN] border config param
- * @param stream [IN]       runtime stream
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateChannel|acldvppCreatePicDesc|acldvppCreateBorderConfig
- */
-ACL_FUNC_VISIBILITY aclError acldvppVpcMakeBorderAsync(const acldvppChannelDesc *channelDesc,
-                                                       const acldvppPicDesc *inputDesc, acldvppPicDesc *outputDesc,
-                                                       const acldvppBorderConfig *borderConfig, aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief Dvpp vpc calc hist.
- *
- * @param channelDesc [IN] the channel destruction
- * @param srcPicDesc [IN]  pyr down input picture destruction
- * @param hist [IN|OUT]    pyr down output picture destruction
- * @param reserve [IN]     reserved param, must be nullptr
- * @param stream [IN]      task stream
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateChannel | acldvppCreatePicDesc | acldvppCreateHist
- */
-ACL_FUNC_VISIBILITY aclError acldvppVpcCalcHistAsync(acldvppChannelDesc *channelDesc, acldvppPicDesc *srcPicDesc,
-                                                     acldvppHist *hist, void *reserve, aclrtStream stream);
-
-/**
- * @ingroup AscendCL
- * @brief Create vpc hist description.
- *
- * @retval null for failed.
- * @retval OtherValues success.
- */
-ACL_FUNC_VISIBILITY acldvppHist *acldvppCreateHist();
-
-/**
- * @ingroup AscendCL
- * @brief Destroy vpc hist description.
- *
- * @par Function
- * Can only destroy hist description information created
- * through acldvppCreateHist interface.
- *
- * @param hist [IN] vpc hist description.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateHist
- */
-ACL_FUNC_VISIBILITY aclError acldvppDestroyHist(acldvppHist *hist);
-
-/**
- * @ingroup AscendCL
- * @brief Get dims of vpc hist description.
- *
- * @param hist [IN] vpc hist description.
- *
- * @retval dims of vpc hist description.
- *
- * @see acldvppCreateHist | acldvppVpcCalcHistAsync
- */
-ACL_FUNC_VISIBILITY uint32_t acldvppGetHistDims(acldvppHist *hist);
-
-/**
- * @ingroup AscendCL
- * @brief Get data from vpc hist description by dim.
- *
- * @param hist [IN]  vpc hist description.
- * @param dim [IN]   which dim to get data.
- * @param data [OUT] address of output hist data.
- * @param len [OUT]  len of output hist data.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateHist | acldvppVpcCalcHistAsync
- */
-ACL_FUNC_VISIBILITY aclError acldvppGetHistData(acldvppHist *hist, uint32_t dim, uint32_t **data, uint16_t *len);
-
-/**
- * @ingroup AscendCL
- * @brief Get dvpp calc hist process return code.
- *
- * @param hist [IN] vpc hist description.
- *
- * @retval Dvpp calc hist process return code.
- *
- * @see acldvppCreateHist | acldvppVpcCalcHistAsync
- */
-ACL_FUNC_VISIBILITY uint32_t acldvppGetHistRetCode(acldvppHist *hist);
-
-/**
- * @ingroup AscendCL
- * @brief Set vpc hist description to 0.
- *
- * @par Function
- * Can only clear hist description information created
- * through acldvppCreateHist interface.
- *
- * @param hist [IN] vpc hist description.
- *
- * @retval ACL_SUCCESS The function is successfully executed.
- * @retval OtherValues Failure
- *
- * @see acldvppCreateHist
- */
-ACL_FUNC_VISIBILITY aclError acldvppClearHist(acldvppHist *hist);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // INC_EXTERNAL_ACL_OPS_ACL_DVPP_H_
diff --git a/inc/external/ge/ge_api_types.h b/inc/external/ge/ge_api_types.h
index 374a816a..250252f9 100644
--- a/inc/external/ge/ge_api_types.h
+++ b/inc/external/ge/ge_api_types.h
@@ -61,6 +61,11 @@ const char *const OPTION_EXEC_HCCL_FLAG = "ge.exec.hcclFlag";
 const char *const OPTION_EXEC_ATOMIC_FLAG = "ge.exec.enable_atomic";
 const char *const OPTION_EXEC_DISABLE_REUSED_MEMORY = "ge.exec.disableReuseMemory";
 const char *const OPTION_EXEC_ENABLE_TAILING_OPTIMIZATION = "ge.exec.isTailingOptimization";
+// Dynamic input flag. ge.exec.dynamicInput=1, means enable dynaimc input,
+// ge.exec.dynamicGraphExecuteMode, dynamic_execute[default]
+const char *const OPTION_EXEC_DYNAMIC_INPUT = "ge.exec.dynamicInput";
+const char *const OPTION_EXEC_DYNAMIC_EXECUTE_MODE = "ge.exec.dynamicGraphExecuteMode";
+const char *const OPTION_EXEC_DATA_INPUTS_SHAPE_RANGE = "ge.exec.dataInputsShapeRange";
 
 // Option key: memory init
 const char *const GRAPH_MEMORY_MAX_SIZE = "ge.graphMemoryMaxSize";
@@ -291,8 +296,17 @@ const std::string OP_DEBUG_LEVEL = "ge.opDebugLevel";
 // Configure model bank path
 const std::string MDL_BANK_PATH_FLAG = "ge.mdl_bank_path";
 
+// Configure display_model_info flag
+const std::string DISPLAY_MODEL_INFO = "ge.display_model_info";
+
 // Configure op bank path
 const std::string OP_BANK_PATH_FLAG = "ge.op_bank_path";
+const std::string OP_BANK_UPDATE_FLAG = "ge.op_bank_update";
+
+// Configure for fix hcombroadcast format.
+// when config model multi, broadcast format should be fixed
+// 0: data multi; 1: model multi;
+const std::string HCOM_MULTI_MODE = "ge.hcomMultiMode";
 
 // Graph run mode
 enum GraphRunMode { PREDICTION = 0, TRAIN };
@@ -366,9 +380,11 @@ static const char *const OP_COMPILER_CACHE_DIR = ge::OP_COMPILER_CACHE_DIR;
 static const char *const OP_COMPILER_CACHE_MODE = ge::OP_COMPILER_CACHE_MODE;
 static const char *const MDL_BANK_PATH = ge::MDL_BANK_PATH_FLAG.c_str();
 static const char *const OP_BANK_PATH = ge::OP_BANK_PATH_FLAG.c_str();
+static const char *const OP_BANK_UPDATE = ge::OP_BANK_UPDATE_FLAG.c_str();
 static const char *const OP_DEBUG_LEVEL = ge::OP_DEBUG_LEVEL.c_str();
 
 // for interface: aclgrphBuildModel
+#ifdef __GNUC__
 const std::set<std::string> ir_builder_suppported_options = {INPUT_FORMAT,
                                                              INPUT_SHAPE,
                                                              OP_NAME_MAP,
@@ -388,22 +404,13 @@ const std::set<std::string> ir_builder_suppported_options = {INPUT_FORMAT,
                                                              OP_COMPILER_CACHE_DIR,
                                                              OP_COMPILER_CACHE_MODE,
                                                              MDL_BANK_PATH,
-                                                             OP_BANK_PATH};
+                                                             OP_BANK_PATH,
+                                                             OP_BANK_UPDATE};
 
 // for interface: aclgrphParse
-const std::set<std::string> ir_parser_suppported_options = {INPUT_FORMAT,
-                                                            INPUT_SHAPE,
-                                                            OP_NAME_MAP,
-                                                            IS_DYNAMIC_INPUT,
-                                                            INPUT_FP16_NODES,
-                                                            IS_INPUT_ADJUST_HW_LAYOUT,
-                                                            IS_OUTPUT_ADJUST_HW_LAYOUT,
-                                                            OUTPUT,
-                                                            OUTPUT_TYPE,
-                                                            OUT_NODES,
-                                                            COMPRESS_WEIGHT_CONF,
-                                                            ENABLE_SCOPE_FUSION_PASSES,
-                                                            LOG_LEVEL};
+const std::set<std::string> ir_parser_suppported_options = {
+  INPUT_FP16_NODES, IS_INPUT_ADJUST_HW_LAYOUT, IS_OUTPUT_ADJUST_HW_LAYOUT, OUTPUT,
+  OUT_NODES,        ENABLE_SCOPE_FUSION_PASSES};
 
 // for interface: aclgrphBuildInitialize
 const std::set<std::string> global_options = {CORE_TYPE,
@@ -424,6 +431,7 @@ const std::set<std::string> global_options = {CORE_TYPE,
                                               DEBUG_DIR,
                                               OP_COMPILER_CACHE_DIR,
                                               OP_COMPILER_CACHE_MODE};
+#endif
 }  // namespace ir_option
 }  // namespace ge
 
diff --git a/inc/external/ge/ge_ir_build.h b/inc/external/ge/ge_ir_build.h
index 778ec21d..afaf42ac 100644
--- a/inc/external/ge/ge_ir_build.h
+++ b/inc/external/ge/ge_ir_build.h
@@ -24,9 +24,9 @@
 #include "graph/ge_error_codes.h"
 
 namespace {
-#define IR_MAJOR_VERSION (int(1))
-#define IR_MINOR_VERSION (int(0))
-#define IR_PATCH_VERSION (int(0))
+const int IR_MAJOR_VERSION = 1;
+const int IR_MINOR_VERSION = 0;
+const int IR_PATCH_VERSION = 0;
 }  // namespace
 
 namespace ge {
@@ -102,24 +102,29 @@ graphStatus aclgrphGetIRVersion(int *major_version, int *minor_version, int *pat
 
 /**
  * @ingroup AscendCL
- * @brief infer shape and data type
+ * @brief dump graph
  *
  * @param graph[IN] the graph ready to build
+ * @param file[IN] file path
+ * @param file[IN] file path string len
  * @retval GRAPH_SUCCESS The function is successfully executed.
  * @retval OtherValues Failure
  */
-graphStatus aclgrphInferShapeAndType(ge::Graph &graph);
+graphStatus aclgrphDumpGraph(const ge::Graph &graph, const char *file, const size_t len);
 
 /**
  * @ingroup AscendCL
- * @brief dump graph
+ * @brief create single op graph
  *
- * @param graph[IN] the graph ready to build
- * @param file[IN] file path
- * @param file[IN] file path string len
+ * @param op_type[IN] the op_type
+ * @param inputs[IN] the inputdesc
+ * @param outputs[IN] the outputdesc
+ * @param graph[OUT] the graph
  * @retval GRAPH_SUCCESS The function is successfully executed.
  * @retval OtherValues Failure
  */
-graphStatus aclgrphDumpGraph(const ge::Graph &graph, const char *file, const size_t len);
+graphStatus aclgrphGenerateForOp(const AscendString &op_type, const std::vector<TensorDesc> &inputs,
+                                 const std::vector<TensorDesc> &outputs, Graph &graph);
+
 };      // namespace ge
 #endif  // INC_EXTERNAL_GE_IR_BUILD_H_
diff --git a/inc/external/ge/ge_prof.h b/inc/external/ge/ge_prof.h
deleted file mode 100644
index 658cea76..00000000
--- a/inc/external/ge/ge_prof.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef INC_EXTERNAL_GE_GE_PROF_H_
-#define INC_EXTERNAL_GE_GE_PROF_H_
-
-#include <map>
-#include <string>
-#include <vector>
-
-#include "ge/ge_api_error_codes.h"
-
-namespace ge {
-enum ProfDataTypeConfig {
-  kProfTaskTime = 0x0002,
-  kProfAiCoreMetrics = 0x0004,
-  kProfAicpuTrace = 0x0008,
-  kProfTrainingTrace = 0x0800,
-  kProfHcclTrace = 0x1000
-};
-
-enum ProfilingAicoreMetrics {
-  kAicoreArithmaticThroughput = 0,
-  kAicorePipeline = 1,
-  kAicoreSynchronization = 2,
-  kAicoreMemory = 3,
-  kAicoreInternalMemory = 4,
-  kAicoreStall = 5
-};
-
-typedef struct ProfAicoreEvents ProfAicoreEvents;
-typedef struct aclgrphProfConfig aclgrphProfConfig;
-
-///
-/// @ingroup AscendCL
-/// @brief Initialize the profiling and set profiling configuration path
-/// @param [in] profiler_path: configuration path of profiling
-/// @param [in] length: length of configuration path
-/// @return Status result of function
-///
-Status aclgrphProfInit(const char *profiler_path, uint32_t length);
-
-///
-/// @ingroup AscendCL
-/// @brief Finalize profiling
-/// @return Status result of function
-///
-Status aclgrphProfFinalize();
-
-///
-/// @ingroup AscendCL
-/// @brief Create data of type aclgrphProfConfig
-/// @param [in] deviceid_list: device id list
-/// @param [in] device_nums: device numbers
-/// @param [in] aicore_metrics: type of aicore metrics
-/// @param [in] aicore_events: pointer to aicore events be reserved, only support NULL now
-/// @param [in] data_type_config: modules need profiling
-/// @return Status result of function
-///
-aclgrphProfConfig *aclgrphProfCreateConfig(uint32_t *deviceid_list, uint32_t device_nums,
-                                           ProfilingAicoreMetrics aicore_metrics, ProfAicoreEvents *aicore_events,
-                                           uint64_t data_type_config);
-
-///
-/// @ingroup AscendCL
-/// @brief  Destroy data of type aclgrphProfConfig
-/// @param [in] profiler_config: config of profiling
-/// @return Status result of function
-///
-Status aclgrphProfDestroyConfig(aclgrphProfConfig *profiler_config);
-
-///
-/// @ingroup AscendCL
-/// @brief Start profiling of modules which is configured by profiler config
-/// @param [in] profiler_config: config of profiling
-/// @return Status result of function
-///
-Status aclgrphProfStart(aclgrphProfConfig *profiler_config);
-
-///
-/// @ingroup AscendCL
-/// @brief Stop profiling of modules which is configured by profiler config
-/// @param [in] profiler_config: config of profiling
-/// @return Status result of function
-///
-Status aclgrphProfStop(aclgrphProfConfig *profiler_config);
-}  // namespace ge
-
-#endif  // INC_EXTERNAL_GE_GE_PROF_H_
diff --git a/inc/external/hccl/hccl.h b/inc/external/hccl/hccl.h
deleted file mode 100644
index 46d934e6..00000000
--- a/inc/external/hccl/hccl.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file hccl.h
- * @brief HCCL API
- */
-
-#ifndef HCCL_H_
-#define HCCL_H_
-
-#include <hccl/hccl_types.h>
-#include <acl/acl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-/**
- * @brief Initialize HCCL.
- *
- * @param clusterInfo A string identifying the cluster info file path, include file name.
- * @param rank A integer identifying the identify for the rank.
- * @param comm A pointer identifying the initialized communication resource.
- * @return HcclResult
- * @see HcclCommDestroy()
- */
-extern HcclResult HcclCommInitClusterInfo(const char *clusterInfo, uint32_t rank, HcclComm *comm);
-
-/**
- * @brief Get hccl root info.
- *
- * @param rootInfo A pointer identifying the hccl root info.
- * @return HcclResult
- */
-extern HcclResult HcclGetRootInfo(HcclRootInfo *rootInfo);
-
-/**
- * @brief Initialize HCCL with root info.
- *
- * @param nRanks A integer identifying the rank size of the cluster.
- * @param rootInfo A struct identifying the hccl root info.
- * @param rank A integer identifying the identify for the rank.
- * @param comm A pointer identifying the initialized communication resource.
- * @return HcclResult
- * @see HcclCommDestroy()
- */
-extern HcclResult HcclCommInitRootInfo(uint32_t nRanks, const HcclRootInfo *rootInfo, uint32_t rank, HcclComm *comm);
-
-/**
- * @brief AllReduce operator.
- *
- * @param sendBuf A pointer identifying the input data address of the operator.
- * @param recvBuf A pointer identifying the output data address of the operator.
- * @param count An integer(u64) identifying the number of the output data.
- * @param dataType The data type of the operator, must be one of the following types: int8, int16, int32, float16,
- * float32.
- * @param op The reduction type of the operator, must be one of the following types: sum, min, max, prod.
- * @param comm A pointer identifying the communication resource based on.
- * @param stream A pointer identifying the stream information.
- * @return HcclResult
- */
-extern HcclResult HcclAllReduce(void *sendBuf, void *recvBuf, uint64_t count, HcclDataType dataType, HcclReduceOp op,
-                                HcclComm comm, aclrtStream stream);
-
-/**
- * @brief Broadcast operator.
- *
- * @param buf A pointer identifying the data address of the operator.
- * @param count An integer(u64) identifying the number of the data.
- * @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32.
- * @param root An integer(u32) identifying the the root rank in the operator.
- * @param comm A pointer identifying the communication resource based on
- * @param stream A pointer identifying the stream information.
- * @return HcclResult
- */
-extern HcclResult HcclBroadcast(void *buf, uint64_t count, HcclDataType dataType, uint32_t root, HcclComm comm,
-                                aclrtStream stream);
-
-/**
- * @brief ReduceScatter operator.
- *
- * @param sendBuf A pointer identifying the input data address of the operator.
- * @param recvBuf A pointer identifying the output data address of the operator.
- * @param recvCount An integer(u64) identifying the number of the output data.
- * @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32.
- * @param op The reduction type of the operator, must be one of the following types: sum, min, max, prod.
- * @param comm A pointer identifying the communication resource based on.
- * @param stream A pointer identifying the stream information.
- * @return HcclResult
- */
-extern HcclResult HcclReduceScatter(void *sendBuf, void *recvBuf, uint64_t recvCount, HcclDataType dataType,
-                                    HcclReduceOp op, HcclComm comm, aclrtStream stream);
-
-/**
- * @brief AllGather operator.
- *
- * @param sendBuf A pointer identifying the input data address of the operator.
- * @param recvBuf A pointer identifying the output data address of the operator.
- * @param sendCount An integer(u64) identifying the number of the input data.
- * @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32.
- * @param comm A pointer identifying the communication resource based on.
- * @param stream A pointer identifying the stream information.
- * @return HcclResult
- */
-extern HcclResult HcclAllGather(void *sendBuf, void *recvBuf, uint64_t sendCount, HcclDataType dataType, HcclComm comm,
-                                aclrtStream stream);
-
-/**
- * @brief Destroy HCCL comm
- *
- * @param comm A pointer identifying the communication resource targetting
- * @return HcclResult
- * @see HcclCommInitClusterInfo()
- */
-extern HcclResult HcclCommDestroy(HcclComm comm);
-
-#ifdef __cplusplus
-}
-#endif  // __cplusplus
-#endif  // HCCL_H_
diff --git a/inc/external/hccl/hccl_types.h b/inc/external/hccl/hccl_types.h
deleted file mode 100644
index 0e832396..00000000
--- a/inc/external/hccl/hccl_types.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file hccl_types.h
- * @brief HCCL data type definition
- *
- */
-
-#ifndef HCCL_TYPES_H_
-#define HCCL_TYPES_H_
-
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-/**
- * @brief HCCL functions return value definition
- */
-typedef enum {
-  HCCL_SUCCESS = 0,              /**< success */
-  HCCL_E_PARA = 1,               /**< parameter error */
-  HCCL_E_PTR = 2,                /**< empty pointer */
-  HCCL_E_MEMORY = 3,             /**< memory error */
-  HCCL_E_INTERNAL = 4,           /**< internal error */
-  HCCL_E_NOT_SUPPORT = 5,        /**< not support feature */
-  HCCL_E_NOT_FOUND = 6,          /**< not found specific resource */
-  HCCL_E_UNAVAIL = 7,            /**< resource unavailable */
-  HCCL_E_SYSCALL = 8,            /**< call system interface error */
-  HCCL_E_TIMEOUT = 9,            /**< timeout */
-  HCCL_E_OPEN_FILE_FAILURE = 10, /**< open file fail */
-  HCCL_E_TCP_CONNECT = 11,       /**< tcp connect fail */
-  HCCL_E_ROCE_CONNECT = 12,      /**< roce connect fail */
-  HCCL_E_TCP_TRANSFER = 13,      /**< tcp transfer fail */
-  HCCL_E_ROCE_TRANSFER = 14,     /**< roce transfer fail */
-  HCCL_E_RUNTIME = 15,           /**< call runtime api fail */
-  HCCL_E_DRV = 16,               /**< call driver api fail */
-  HCCL_E_PROFILING = 17,         /**< call profiling api fail */
-  HCCL_E_CCE = 18,               /**< call cce api fail */
-  HCCL_E_NETWORK = 19,           /**< call network api fail */
-  HCCL_E_RESERVED                /**< reserved */
-} HcclResult;
-
-/**
- * @brief handle to HCCL communicator
- */
-typedef void *HcclComm;
-
-/**
- * @brief HCCL Reduction opperation
- */
-typedef enum {
-  HCCL_REDUCE_SUM = 0,  /**< sum */
-  HCCL_REDUCE_PROD = 1, /**< prod */
-  HCCL_REDUCE_MAX = 2,  /**< max */
-  HCCL_REDUCE_MIN = 3,  /**< min */
-  HCCL_REDUCE_RESERVED  /**< reserved */
-} HcclReduceOp;
-
-/**
- * @brief HCCL data type
- */
-typedef enum {
-  HCCL_DATA_TYPE_INT8 = 0,   /**< int8 */
-  HCCL_DATA_TYPE_INT16 = 1,  /**< int16 */
-  HCCL_DATA_TYPE_INT32 = 2,  /**< int32 */
-  HCCL_DATA_TYPE_FP16 = 3,   /**< fp16 */
-  HCCL_DATA_TYPE_FP32 = 4,   /**< fp32 */
-  HCCL_DATA_TYPE_INT64 = 5,  /**< int64 */
-  HCCL_DATA_TYPE_UINT64 = 6, /**< uint64 */
-  HCCL_DATA_TYPE_RESERVED    /**< reserved */
-} HcclDataType;
-
-const uint32_t HCCL_ROOT_INFO_BYTES = 4108;  // 4108: root info length
-
-/**
- * @brief HCCL root info
- */
-typedef struct HcclRootInfoDef {
-  char internal[HCCL_ROOT_INFO_BYTES];
-} HcclRootInfo;
-
-#ifdef __cplusplus
-}
-#endif  // __cplusplus
-#endif  // HCCL_TYPES_H_
diff --git a/inc/framework/common/debug/log.h b/inc/framework/common/debug/log.h
index 249271a6..72dba126 100644
--- a/inc/framework/common/debug/log.h
+++ b/inc/framework/common/debug/log.h
@@ -28,7 +28,7 @@
 #include "ge/ge_api_error_codes.h"
 
 #if !defined(__ANDROID__) && !defined(ANDROID)
-#define DOMI_LOGE(...) GE_LOG_ERROR(GE_MODULE_NAME, ge::FAILED, __VA_ARGS__)
+#define DOMI_LOGE(fmt, ...) GE_LOG_ERROR(GE_MODULE_NAME, ge::FAILED, fmt, ##__VA_ARGS__)
 #else
 #include <android/log.h>
 #if defined(BUILD_VERSION_PERF)
@@ -258,7 +258,7 @@
 #define GE_ERRORLOG_AND_ERRORMSG(_status, errormsg)                                    \
   {                                                                                    \
     GELOGE(_status, "%s", errormsg);                                                   \
-    ErrorManager::GetInstance().ATCReportErrMessage("E10043", {"reason"}, {errormsg}); \
+    ErrorManager::GetInstance().ATCReportErrMessage("E19021", {"reason"}, {errormsg}); \
   }
 
 #define GE_CHK_LOG_AND_ERRORMSG(expr, _status, errormsg)                                 \
@@ -266,7 +266,7 @@
     bool b = (expr);                                                                     \
     if (!b) {                                                                            \
       GELOGE(_status, "%s", errormsg);                                                   \
-      ErrorManager::GetInstance().ATCReportErrMessage("E10043", {"reason"}, {errormsg}); \
+      ErrorManager::GetInstance().ATCReportErrMessage("E19021", {"reason"}, {errormsg}); \
       return _status;                                                                    \
     }                                                                                    \
   } while (0)
diff --git a/inc/framework/common/fmk_error_codes.h b/inc/framework/common/fmk_error_codes.h
index ec1f26d0..358fca04 100644
--- a/inc/framework/common/fmk_error_codes.h
+++ b/inc/framework/common/fmk_error_codes.h
@@ -23,10 +23,6 @@
 #include "framework/common/fmk_types.h"
 #include "register/register_error_codes.h"
 
-#define MODID_OMG 1          // OMG module ID
-#define MODID_OME 2          // OME module ID
-#define MODID_CALIBRATION 3  // Calibration module ID
-
 // Each module uses the following four macros to define error codes:
 #define DECLARE_ERRORNO_OMG(name, value) DECLARE_ERRORNO(SYSID_FWK, MODID_OMG, name, value)
 #define DECLARE_ERRORNO_OME(name, value) DECLARE_ERRORNO(SYSID_FWK, MODID_OME, name, value)
@@ -37,6 +33,10 @@
 // Interface for Obtaining Error Code Description
 #define GET_ERRORNO_STR(value) domi::StatusFactory::Instance()->GetErrDesc(value)
 
+const int MODID_OMG = 1;          // OMG module ID
+const int MODID_OME = 2;          // OME module ID
+const int MODID_CALIBRATION = 3;  // Calibration module ID
+
 namespace domi {
 class StatusFactory {
  public:
diff --git a/inc/framework/common/ge_types.h b/inc/framework/common/ge_types.h
index fb1f0be1..f7e6d679 100644
--- a/inc/framework/common/ge_types.h
+++ b/inc/framework/common/ge_types.h
@@ -37,7 +37,15 @@ enum FrameworkType {
   MINDSPORE = 1,
   TENSORFLOW = 3,
   ANDROID_NN,
-  FRAMEWORK_RESERVED,
+  ONNX,
+};
+
+const std::map<std::string, std::string> kFwkTypeToStr = {
+    {"0", "Caffe"},
+    {"1", "MindSpore"},
+    {"3", "TensorFlow"},
+    {"4", "Android_NN"},
+    {"5", "Onnx"}
 };
 
 enum OpEngineType {
@@ -70,14 +78,15 @@ struct DataBuffer {
 /// @brief External input data
 ///
 struct InputData {
-  uint32_t index;                 // Index of input data
-  uint32_t timestamp;             // Data creation time
-  uint32_t timeout;               // Processing timeout
-  uint32_t model_id;              // Model ID required for data processing
-  uint64_t request_id = 0;        // Request ID
-  std::vector<DataBuffer> blobs;  // Actual input data, currently only supports one input
-  bool is_dynamic_batch = false;  // Whether is dynamic batch size scene, default:false
-  std::string batch_label;        // Gear used for current inference in dynamic batch scene
+  uint32_t index;                            // Index of input data
+  uint32_t timestamp;                        // Data creation time
+  uint32_t timeout;                          // Processing timeout
+  uint32_t model_id;                         // Model ID required for data processing
+  uint64_t request_id = 0;                   // Request ID
+  std::vector<DataBuffer> blobs;             // Actual input data, currently only supports one input
+  bool is_dynamic_batch = false;             // Whether is dynamic batch size scene, default:false
+  std::string batch_label;                   // Gear used for current inference in dynamic batch scene
+  std::vector<std::vector<int64_t>> shapes;  // Input shapes
 };
 
 /// Output result structure definition
@@ -245,6 +254,8 @@ struct TaskDescInfo {
   uint32_t block_dim;
   uint32_t task_id;
   uint32_t stream_id;
+  std::string shape_type;
+  int64_t cur_iter_num;
 };
 
 // Profiling info of graph
@@ -258,6 +269,8 @@ struct ComputeGraphDescInfo {
   std::vector<Format> output_format;
   std::vector<std::vector<int64_t>> output_shape;
   std::vector<DataType> output_data_type;
+  uint32_t task_id;
+  uint32_t stream_id;
 };
 
 struct OpDescInfo {
diff --git a/inc/framework/common/helper/model_helper.h b/inc/framework/common/helper/model_helper.h
index 949d8b4c..4a169dda 100644
--- a/inc/framework/common/helper/model_helper.h
+++ b/inc/framework/common/helper/model_helper.h
@@ -25,6 +25,7 @@
 #include "common/types.h"
 #include "graph/model.h"
 #include "model/ge_model.h"
+#include "model/ge_root_model.h"
 
 namespace ge {
 class ModelHelper {
@@ -32,17 +33,22 @@ class ModelHelper {
   ModelHelper() = default;
   ~ModelHelper();
 
-  Status SaveToOmModel(const GeModelPtr &ge_model, const SaveParam &save_param,
-                       const std::string &output_file, ge::ModelBufferData &model);
-  Status SaveOriginalGraphToOmModel(const ge::Graph& graph, const std::string& output_file);
+  Status SaveToOmModel(const GeModelPtr &ge_model, const SaveParam &save_param, const std::string &output_file,
+                       ge::ModelBufferData &model);
+  Status SaveToOmRootModel(const GeRootModelPtr &ge_root_model, const SaveParam &save_param, const string &output_file,
+                           ModelBufferData &model, bool is_unknown_shape);
+  Status SaveOriginalGraphToOmModel(const ge::Graph &graph, const std::string &output_file);
   Status LoadModel(const ge::ModelData &model_data);
-  Status GetModelBufferData(ge::ModelBufferData& model);
+  Status LoadRootModel(const ge::ModelData &model_data);
+  Status GetModelBufferData(ge::ModelBufferData &model);
 
-  const ModelFileHeader* GetFileHeader() const { return file_header_; }
+  const ModelFileHeader *GetFileHeader() const { return file_header_; }
 
   GeModelPtr GetGeModel();
+  GeRootModelPtr GetGeRootModel();
   void SetSaveMode(bool val) { is_offline_ = val; }
   bool GetSaveMode(void) const { return is_offline_; }
+  bool GetModelType() const { return is_unknown_shape_model_; };
 
   Status GetBaseNameFromFileName(const std::string &file_name, std::string &base_name);
   Status GetModelNameFromMergedGraphName(const std::string &graph_name, std::string &model_name);
@@ -50,24 +56,47 @@ class ModelHelper {
  private:
   bool is_assign_model_ = false;
   bool is_offline_ = true;
-  ModelFileHeader* file_header_ = nullptr;
+  bool is_unknown_shape_model_ = false;
+  ModelFileHeader *file_header_ = nullptr;
   // Encrypted model need delete temp model and unencrypted model need not delete model
   uint8_t *model_addr_tmp_ = nullptr;
   uint32_t model_len_tmp_ = 0;
   GeModelPtr model_;
+  GeRootModelPtr root_model_;
 
-  ModelHelper(const ModelHelper&);
-  ModelHelper& operator=(const ModelHelper&);
-  Status GenerateGeModel(OmFileLoadHelper& om_load_helper);
-  Status LoadModelData(OmFileLoadHelper& om_load_helper);
-  void SetModelToGeModel(ge::Model& model);
-  Status LoadWeights(OmFileLoadHelper& om_load_helper);
-  Status LoadTask(OmFileLoadHelper& om_load_helper);
-  Status LoadTBEKernelStore(OmFileLoadHelper& om_load_helper);
-  Status LoadCustAICPUKernelStore(OmFileLoadHelper& om_load_helper);
+  ModelHelper(const ModelHelper &);
+  ModelHelper &operator=(const ModelHelper &);
+  Status GenerateGeModel(OmFileLoadHelper &om_load_helper);
+  Status GenerateGeRootModel(OmFileLoadHelper &om_load_helper);
+  Status LoadModelData(OmFileLoadHelper &om_load_helper);
+  void SetModelToGeModel(GeModelPtr &ge_model, Model &model);
+  Status LoadModelData(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, size_t mode_index);
+  Status LoadWeights(OmFileLoadHelper &om_load_helper);
+  Status LoadWeights(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, size_t mode_index);
+  Status LoadTask(OmFileLoadHelper &om_load_helper);
+  Status LoadTask(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, size_t mode_index);
+  Status LoadTBEKernelStore(OmFileLoadHelper &om_load_helper);
+  Status LoadTBEKernelStore(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, size_t mode_index);
+  Status LoadCustAICPUKernelStore(OmFileLoadHelper &om_load_helper);
+  Status LoadCustAICPUKernelStore(OmFileLoadHelper &om_load_helper, GeModelPtr &cur_model, size_t mode_index);
   Status ReleaseLocalModelData() noexcept;
-  Status SaveModelPartition(std::shared_ptr<OmFileSaveHelper>& om_file_save_helper,
-                            ModelPartitionType type, const uint8_t* data, size_t size);
+  Status SaveModelPartition(std::shared_ptr<OmFileSaveHelper> &om_file_save_helper, ModelPartitionType type,
+                            const uint8_t *data, size_t size, size_t model_index);
+  Status SaveModelDef(shared_ptr<OmFileSaveHelper> &om_file_save_helper, const GeModelPtr &ge_model,
+                      Buffer &model_buffer, size_t model_index = 0);
+  Status SaveSizeToModelDef(const GeModelPtr &ge_model);
+  Status SaveModelWeights(shared_ptr<OmFileSaveHelper> &om_file_save_helper, const GeModelPtr &ge_model,
+                          size_t model_index = 0);
+  Status SaveModelTbeKernel(shared_ptr<OmFileSaveHelper> &om_file_save_helper, const GeModelPtr &ge_model,
+                            size_t model_index = 0);
+  Status SaveModelCustAICPU(shared_ptr<OmFileSaveHelper> &om_file_save_helper, const GeModelPtr &ge_model,
+                            size_t model_index = 0);
+  Status SaveModelTaskDef(shared_ptr<OmFileSaveHelper> &om_file_save_helper, const GeModelPtr &ge_model,
+                          Buffer &task_buffer, size_t model_index = 0);
+  Status SaveModelHeader(shared_ptr<OmFileSaveHelper> &om_file_save_helper, const GeModelPtr &ge_model,
+                         size_t model_num = 1);
+  Status SaveAllModelPartiton(shared_ptr<OmFileSaveHelper> &om_file_save_helper, const GeModelPtr &ge_model,
+                              Buffer &model_buffer, Buffer &task_buffer, size_t model_index = 0);
 };
 }  // namespace ge
 #endif  // INC_FRAMEWORK_COMMON_HELPER_MODEL_HELPER_H_
diff --git a/inc/framework/common/helper/om_file_helper.h b/inc/framework/common/helper/om_file_helper.h
index 4ca54b50..98ad55d7 100644
--- a/inc/framework/common/helper/om_file_helper.h
+++ b/inc/framework/common/helper/om_file_helper.h
@@ -32,14 +32,14 @@ using std::vector;
 namespace ge {
 struct ModelPartition {
   ModelPartitionType type;
-  uint8_t* data = 0;
+  uint8_t *data = 0;
   uint32_t size = 0;
 };
 
 struct OmFileContext {
   std::vector<ModelPartition> partition_datas_;
   std::vector<char> partition_table_;
-  uint32_t model_data_len_;
+  uint32_t model_data_len_ = 0;
 };
 
 struct SaveParam {
@@ -57,15 +57,23 @@ class OmFileLoadHelper {
 
   Status Init(uint8_t *model_data, const uint32_t model_data_size);
 
+  Status Init(uint8_t *model_data, const uint32_t model_data_size, uint32_t model_num);
+
   Status GetModelPartition(ModelPartitionType type, ModelPartition &partition);
 
+  Status GetModelPartition(ModelPartitionType type, ModelPartition &partition, size_t model_index);
+
   OmFileContext context_;
 
+  vector<OmFileContext> model_contexts_;
+
  private:
   Status CheckModelValid(const ge::ModelData &model) const;
 
   Status LoadModelPartitionTable(uint8_t *model_data, const uint32_t model_data_size);
 
+  Status LoadModelPartitionTable(uint8_t *model_data, const uint32_t model_data_size, uint32_t model_num);
+
   bool is_inited_{false};
 };
 
@@ -79,15 +87,23 @@ class OmFileSaveHelper {
 
   Status AddPartition(ModelPartition &partition);
 
+  Status AddPartition(ModelPartition &partition, size_t cur_index);
+
   const std::vector<ModelPartition> &GetModelPartitions() const;
 
-  Status SaveModel(const SaveParam &save_param, const char *target_file,
-                   ge::ModelBufferData& model, bool is_offline = true);
+  Status SaveModel(const SaveParam &save_param, const char *target_file, ge::ModelBufferData &model,
+                   bool is_offline = true);
 
   Status SaveModelToFile(const char *output_file, ge::ModelBufferData &model, bool is_offline = true);
 
+  vector<OmFileContext> model_contexts_;
+
   ModelFileHeader model_header_;
   OmFileContext context_;
+
+  ModelPartitionTable *GetPartitionTable(size_t cur_ctx_index);
+
+  Status SaveRootModel(const SaveParam &save_param, const char *output_file, ModelBufferData &model, bool is_offline);
 };
 }  // namespace ge
 #endif  // INC_FRAMEWORK_COMMON_HELPER_OM_FILE_HELPER_H_
diff --git a/inc/framework/common/op/ge_op_utils.h b/inc/framework/common/op/ge_op_utils.h
index 4718b180..aa50c8a1 100644
--- a/inc/framework/common/op/ge_op_utils.h
+++ b/inc/framework/common/op/ge_op_utils.h
@@ -17,7 +17,6 @@
 #ifndef INC_FRAMEWORK_COMMON_OP_GE_OP_UTILS_H_
 #define INC_FRAMEWORK_COMMON_OP_GE_OP_UTILS_H_
 
-#include <cce/dnn.h>
 #include <memory>
 #include <vector>
 
@@ -32,7 +31,6 @@
 #include "proto/insert_op.pb.h"
 
 namespace ge {
-using namespace cce;
 using domi::Status;
 
 // Add Sub Mul
@@ -76,18 +74,7 @@ class OpUtils {
   static inline bool CheckEnumValid(int32_t check_value, int32_t min_enum_value, int32_t max_enum_value) {
     return check_value < min_enum_value ? false : (check_value >= max_enum_value ? false : true);
   }
-  ///
-  /// @ingroup domi_omg
-  /// @brief Convert the dimension of array according to different format
-  /// @param [in] src_format src_shape format
-  /// @param [in] src Dimension array to be converted
-  /// @param [in] dst_format Target format after conversion
-  /// @param [out] dst Dimension array after conversion
-  /// @return SUCCESS success
-  /// @return FAILED fail
-  ///
-  static bool ConvertDim(ccTensorFormat_t src_format, const std::vector<int64_t> &src, ccTensorFormat_t dst_format,
-                         std::vector<int64_t> &dst);
+
   ///
   /// @ingroup domi_omg
   /// @brief Determine whether to manually calculate the tensor size based on the values of format and dim
@@ -97,73 +84,6 @@ class OpUtils {
   /// @return false skip
   ///
   static bool IsComputDimsSize(const int32_t format, const uint32_t real_dim_cnt);
-  ///
-  /// @ingroup domi_ome
-  /// @brief Initialize the tensor description, which is used for input and output.
-  /// @param [in] model_tensor Tensor information defined by the offline model
-  /// @param [out] cc_tensor Tensor definition used by CC
-  /// @return SUCCESS success
-  /// @return FAILED fail
-  ///
-  static Status InitTensorDescriptor(const ge::GeTensorDesc &model_tensor, ccTensorDescriptor_t &cc_tensor);
-  ///
-  /// @ingroup domi_ome
-  /// @brief Initialize the tensor description, which is used for input and output.
-  /// @param [in] model_tensor Tensor information defined by the offline model
-  /// @param [in] dst_data_type data_type of the target cc_tensor
-  /// @param [out] cc_tensor Tensor definition used by CC
-  /// @return SUCCESS success
-  /// @return FAILED fail
-  ///
-  static Status InitTensorDescriptor(const ge::GeTensorDesc &model_tensor, int32_t dst_data_type,
-                                     ccTensorDescriptor_t &cc_tensor);
-  ///
-  /// @ingroup domi_ome
-  /// @brief Initialize the tensor description for bias.
-  /// @param [in] model_tensor Tensor information defined by the offline model
-  /// @param [out]  cc_tensor Tensor definition used by CC
-  /// @return SUCCESS success
-  /// @return FAILED fail
-  ///
-  ///
-  static Status InitTensorDescriptor(const ge::GeTensor &model_tensor, ccTensorDescriptor_t &cc_tensor);
-  ///
-  /// @ingroup domi_ome
-  /// @brief Initialize the tensor description for bias.
-  /// @param [in] model_tensor Tensor information defined by the offline model
-  /// @param [in] dst_data_type data_type of the target cc_tensor
-  /// @param [out] cc_tensor Tensor definition used by CC
-  /// @return SUCCESS success
-  /// @return FAILED fail
-  ///
-  static Status InitTensorDescriptor(const ge::GeTensor &model_tensor, int32_t dst_data_type,
-                                     ccTensorDescriptor_t &cc_tensor);
-
-  static Status InitTensorDescriptor(int32_t format, int32_t data_type, const std::vector<int64_t> &dim,
-                                     ccTensorDescriptor_t &cc_tensor, uint32_t real_dim_cnt = 4);
-  ///
-  /// @ingroup domi_ome
-  /// @brief Destroys a tensor
-  /// @param [inout] cc_tensor Tensor definition used by CC
-  ///
-  static void DestroyTensorDescriptor(ccTensorDescriptor_t &cc_tensor) noexcept;
-
-  ///
-  /// @ingroup domi_ome
-  /// @brief Destroys a tensor
-  /// @param [inout] cc_filter cc_filter Definition of the filter used by CC
-  ///
-  static void DestroyFilterDescriptor(ccFilterDescriptor_t &cc_filter);
-
-  ///
-  /// @ingroup domi_ome
-  /// @brief Initializing Filter Description
-  /// @param [in] model_filter Filter information defined in the offline model
-  /// @param [out] cc_filter Definition of the filter used by CC
-  /// @return SUCCESS success
-  /// @return FAILED fail
-  ///
-  static Status InitFilterDescriptor(const ge::GeTensor &model_filter, ccFilterDescriptor_t &cc_filter);
 
   ///
   /// @brief Extract AIPP parameters from AttrDefMap and splice them
@@ -209,16 +129,7 @@ class OpUtils {
   /// @param [out] output Data pointer after conversion. The format is HWCK
   ///
   static void TransDataKCHW2HWCK(const void *input, int64_t K, int64_t C, int64_t H, int64_t W, void *output);
-  ///
-  /// @ingroup domi_omg
-  /// @brief Initialize the input and output description of the data node which is applied to filter weight in the
-  /// training network
-  /// @param [in] model_tensor input and output tensor information
-  /// @param [out] cc_tensor Tensor in CCE format after conversion
-  ///
-  static Status InitFilterTensorDescriptor(const ge::GeTensorDesc &model_tensor, ccFilterDescriptor_t &cc_tensor);
-
-  static void SetTensorDescriptorAllOffsetQuantizeInfo(const GeTensorDesc &tensor, ccTensorDescriptor_t cc_tensor);
+  
   static vector<ConstGeTensorPtr> GetWeights(const ge::Node &node);
   static vector<ConstGeTensorPtr> GetWeights(ge::ConstNodePtr node);
   static vector<GeTensorPtr> MutableWeights(const ge::Node &node);
@@ -228,69 +139,7 @@ class OpUtils {
   static Status GetShapeDataFromConstTensor(const ConstGeTensorPtr &tensor, DataType type, std::vector<int64_t> &dims);
 
  private:
-  friend class CceTensorDescriptor;
   static uint32_t GetRealDimCnt(const GeTensorDesc &tensor_desc);
 };
-
-class CceTensorDescriptor;
-
-using CceTensorDescriptorPtr = std::shared_ptr<CceTensorDescriptor>;
-
-class CceTensorDescriptor {
- public:
-  explicit CceTensorDescriptor(ccTensorDescriptor_t cc_tensor);
-  CceTensorDescriptor(const CceTensorDescriptor &) = delete;
-  CceTensorDescriptor &operator=(const CceTensorDescriptor &) = delete;
-
-  ~CceTensorDescriptor();
-
-  ccTensorDescriptor_t GetPtr() { return cc_tensor_; }
-
-  ///
-  /// @brief      Initializes the tensor based on shape information.
-  /// @param[in]  format  data permutation format
-  /// @param[in]  data_type Data Type
-  /// @param[in]  dim dim information
-  /// @return     return code
-  ///
-  Status InitTensor(int32_t format, int32_t data_type, const std::vector<int64_t> &dims);
-
-  Status InitTensor(int32_t format, int32_t data_type, const ge::GeShape &shape);
-
-  ///
-  /// @brief      get format of tensor
-  /// @param[out] format format of the tensor
-  /// @return     return code
-  ///
-  Status GetFormat(ccTensorFormat_t *format);
-
-  ///
-  /// @brief      Obtains the size of the tensor.
-  /// @param[out] size size of Tensor
-  /// @return     return code
-  ///
-  Status GetTensorSizeInBytes(uint32_t *size);
-
-  ///
-  /// @brief transform tensor between 4d(NCHW) and 5d(NC1HWC0)
-  /// @param [in] xDesc   descriptor of input tensor
-  /// @param [in] x   point to input data in host memory
-  /// @param [in] dataTypeTransmode   mode of data type transform
-  /// @param [in] yDesc   descriptor of output tensor
-  /// @param [in|out] y   point to output data in host memory
-  /// @param [in] ySizeInBytes   size of outputData
-  /// @return return code
-  ///
-  static Status TransTensor(const ccTensorDescriptor_t xDesc, const void *x, const CceTensorDescriptorPtr &yDesc,
-                            void *y, uint32_t ySizeInBytes);
-
-  ///
-  /// @brief      CceTensorDescriptor Static Constructor
-  /// @return     CceTensorDescriptor smart pointer
-  ///
-  static CceTensorDescriptorPtr Create();
-
-  ccTensorDescriptor_t cc_tensor_ = nullptr;
-};
 }  // namespace ge
 #endif  // INC_FRAMEWORK_COMMON_OP_GE_OP_UTILS_H_
diff --git a/inc/framework/common/op/op_parser_util.h b/inc/framework/common/op/op_parser_util.h
index 49b4350a..43254ca9 100644
--- a/inc/framework/common/op/op_parser_util.h
+++ b/inc/framework/common/op/op_parser_util.h
@@ -17,7 +17,6 @@
 #ifndef INC_FRAMEWORK_COMMON_OP_OP_PARSER_UTIL_H_
 #define INC_FRAMEWORK_COMMON_OP_OP_PARSER_UTIL_H_
 
-#include <cce/dnn.h>
 #include <limits.h>
 #include <math.h>
 #include <stdint.h>
@@ -31,10 +30,7 @@ const uint32_t NORMAL_OUTPUT_NUM = 1;
 const uint32_t NORMAL_WORKSPACE_NUM = 0;
 const int32_t NORMAL_1D_DIM_NUM = 1;
 const int32_t NORMAL_SCALE_DIM_NUM = 0;
-const int NORMAL_TENSOR_FORMAT = static_cast<const int>(cce::CC_TENSOR_NC1HWC0);
 const int NORMAL_TENSOR_SIZE = 4;
-const int NORMAL_DEVICE_DATA_TYPE = static_cast<const int>(cce::CC_DATA_HALF);
-const int DEFAULT_POOLING_MODE = static_cast<const int>(cce::CC_POOLING_MAX);
 const uint32_t DEFAULT_REAL_DIM_CNT = 4;
 
 // const
@@ -183,7 +179,6 @@ const int32_t SSD_DETECTIONOUTPUT_BACKGROUND_LABEL_ID_DEFAULT_VALUE = 0;
 const float SSD_DETECTIONOUTPUT_NMS_THRESHOLD_DEFAULT_VALUE = 0.3;
 const int32_t SSD_DETECTIONOUTPUT_TOP_K_DEFAULT_VALUE = 200;
 const float SSD_DETECTIONOUTPUT_ETA_DEFAULT_VALUE = 1.0;
-const int SSD_DETECTIONOUTPUT_CODE_TYPE_DEFAULT_VALUE = static_cast<const int>(cce::CC_BOX_CENTER_SIZE);
 const int32_t SSD_DETECTIONOUTPUT_KEEP_TOP_K_DEFAULT_VALUE = 200;
 const bool SSD_DETECTIONOUTPUT_VARIANCE_ENCODED_IN_TARGET_DEFAULT_VALUE = false;
 const float SSD_DETECTIONOUTPUT_CONFIDENCE_THRESHOLD_DEFAULT_VALUE = 0.1;
@@ -200,7 +195,6 @@ const float REFINEDET_DETECTIONOUTPUT_NMS_THRESHOLD_DEFAULT_VALUE = 0.3;
 const int32_t REFINEDET_DETECTIONOUTPUT_TOP_K_DEFAULT_VALUE = 200;
 const float REFINEDET_DETECTIONOUTPUT_ETA_DEFAULT_VALUE = 1.0;
 const bool REFINEDET_DETECTIONOUTPUT_VARIANCE_ENCODED_IN_TARGET_DEFAULT_VALUE = false;
-const int REFINEDET_DETECTIONOUTPUT_CODE_TYPE_DEFAULT_VALUE = static_cast<const int>(cce::CC_BOX_CENTER_SIZE);
 const int32_t REFINEDET_DETECTIONOUTPUT_KEEP_TOP_K_DEFAULT_VALUE = 200;
 const float REFINEDET_DETECTIONOUTPUT_CONFIDENCE_THRESHOLD_DEFAULT_VALUE = 0.1;
 const float REFINEDET_DETECTIONOUTPUT_OBJECTNESS_SCORE_DEFAULT_VALUE = 0;
diff --git a/inc/framework/common/profiling/ge_profiling.h b/inc/framework/common/profiling/ge_profiling.h
new file mode 100644
index 00000000..83699754
--- /dev/null
+++ b/inc/framework/common/profiling/ge_profiling.h
@@ -0,0 +1,46 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef INC_FRAMEWORK_COMMON_GE_PROFILING_H_
+#define INC_FRAMEWORK_COMMON_GE_PROFILING_H_
+
+#include "ge/ge_api_error_codes.h"
+#include "toolchain/prof_callback.h"
+
+const int MAX_DEV_NUM = 64;
+
+enum ProfCommandHandleType {
+  kProfCommandhandleInit = 0,
+  kProfCommandhandleStart,
+  kProfCommandhandleStop,
+  kProfCommandhandleFinalize,
+  kProfCommandhandleModelSubscribe,
+  kProfCommandhandleModelUnsubscribe
+};
+
+struct ProfCommandHandleData {
+  uint64_t profSwitch;
+  uint32_t devNums;  // length of device id list
+  uint32_t devIdList[MAX_DEV_NUM];
+  uint32_t modelId;
+};
+
+ge::Status RegProfCtrlCallback(MsprofCtrlCallback func);
+ge::Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func);
+ge::Status RegProfReporterCallback(MsprofReporterCallback func);
+ge::Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len);
+
+#endif  // INC_FRAMEWORK_COMMON_GE_PROFILING_H_
diff --git a/inc/framework/common/profiling/ge_runner_profiling.h b/inc/framework/common/profiling/ge_runner_profiling.h
new file mode 100644
index 00000000..d2eff767
--- /dev/null
+++ b/inc/framework/common/profiling/ge_runner_profiling.h
@@ -0,0 +1,24 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef INC_FRAMEWORK_COMMON_GE_RUNNER_PROFILING_H_
+#define INC_FRAMEWORK_COMMON_GE_RUNNER_PROFILING_H_
+
+#include "profiling/ge_profiling.h"
+
+bool IsInitialize();
+
+#endif  // INC_FRAMEWORK_COMMON_GE_RUNNER_PROFILING_H_
diff --git a/inc/framework/common/taskdown_common.h b/inc/framework/common/taskdown_common.h
new file mode 100644
index 00000000..090e7e26
--- /dev/null
+++ b/inc/framework/common/taskdown_common.h
@@ -0,0 +1,71 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef INC_FRAMEWORK_COMMON_TASKDOWN_COMMON_H_
+#define INC_FRAMEWORK_COMMON_TASKDOWN_COMMON_H_
+
+#include "runtime/rt.h"
+
+namespace ge {
+
+const int CC_FUSION_OP_MAX = 32;
+
+typedef enum tagCcStatus {
+  CC_STATUS_SUCCESS = 0,         /**< succ */
+  CC_STATUS_NOT_INITIALIZED = 1, /**< not init */
+  CC_STATUS_ALLOC_FAILED = 2,    /**< alloc mem failed */
+  CC_STATUS_BAD_PARAM = 3,       /**< para check failed */
+  CC_STATUS_INTERNAL_ERROR = 4,  /**< internal error */
+  CC_STATUS_KERNEL_ERROR = 5,    /**< kernel error */
+  CC_STATUS_RUNTIME_ERROR = 6,   /**< runtime error */
+  CC_STATUS_NOT_SUPPORTED = 7,   /**< unsupport error */
+  CC_STATUS_INVALID_VALUE = 7,   /**< invalid value error for blas*/
+  CC_STATUS_RESERVED             /**< just for check */
+} ccStatus_t;
+
+typedef enum tagccKernelType {
+  CCE_AI_CORE = 0, /* cce aicore */
+  CCE_AI_CPU = 1,  /* cce aicpu */
+  TE = 2,          /* te operator*/
+  CUSTOMIZED = 3,  /* customized operator */
+  TE_AI_CORE = 4,  /* te aicore operator*/
+  TE_AI_CPU = 5,   /* te aicpu operator */
+  AI_CPU = 6,      /* aicpu */
+  CUST_AI_CPU = 7, /* custom aicpu*/
+  INVALID = 8,     /* unknown kernel type */
+} ccKernelType;
+
+typedef struct tagOpContext {
+  ccKernelType kernelType;
+  uint32_t opId;
+  uint32_t kernelFuncId;
+  uint32_t opIndex;
+  uint32_t opCount;
+  uint32_t opIndex2[CC_FUSION_OP_MAX];
+  bool isFlowtable;
+  uint16_t *argsOffset;
+  uint32_t argsCount;
+  uint64_t genDataBaseAddr;
+  uint64_t genDataBaseSize;
+  uint64_t genWeightBaseAddr;
+  uint64_t genWeightBaseSize;
+  uint64_t genVariableBaseAddr;
+  uint64_t genVariableBaseSize;
+  uint64_t l2ctrlSize;
+} ccOpContext;
+}  // namespace ge
+
+#endif  // INC_FRAMEWORK_COMMON_TASKDOWN_COMMON_H_
diff --git a/inc/framework/common/types.h b/inc/framework/common/types.h
index 441d0757..4d4c54d1 100644
--- a/inc/framework/common/types.h
+++ b/inc/framework/common/types.h
@@ -529,7 +529,10 @@ REGISTER_OPTYPE_DECLARE(HVDWAIT, "HorovodWait");
 // aicpu op for online_infer dynamic_dims
 REGISTER_OPTYPE_DECLARE(GETDYNAMICDIMS, "GetDynamicDims");
 
-enum InputMode { INPUT = 0, CONST_INPUT};
+// profiling training trace node
+REGISTER_OPTYPE_DECLARE(PROFILINGTRAININGTRACE, "ProfilingTrainingTrace");
+
+enum InputMode { INPUT = 0, CONST_INPUT };
 
 // Definition of the processing status enum of the process module
 enum ModelProcessState {
@@ -605,7 +608,7 @@ static constexpr uint32_t MODEL_FILE_CHECKSUM_LENGTH = 64;
 ///
 /// @brief length of the reserved field in the model file header
 ///
-static constexpr uint32_t MODEL_FILE_RESERVED_LENGTH = 79;
+static constexpr uint32_t MODEL_FILE_RESERVED_LENGTH = 75;
 
 ///
 /// @ingroup domi_omg
@@ -843,9 +846,10 @@ struct ModelFileHeader {
   uint32_t ops = 0;                                       // Computing power (Kops)
   uint8_t userdefineinfo[USER_DEFINE_INFO_LENGTH] = {0};  // User-defined information. The value contains 32 characters
   uint32_t om_ir_version = 0;
+  uint32_t model_num = 0;
   uint8_t platform_version[PLATFORM_VERSION_LEN] = {0};
   uint8_t platform_type = {0};
-  uint8_t reserved[MODEL_FILE_RESERVED_LENGTH] = {0};  // Reserved field 79
+  uint8_t reserved[MODEL_FILE_RESERVED_LENGTH] = {0};  // Reserved field 75
 };
 
 static constexpr uint8_t TARGET_TYPE_LTTE_8BIT = 0;
@@ -1093,6 +1097,7 @@ struct BasicInfo {
   uint32_t total_size;       // total memory size
 };
 #pragma pack()  // Cancels single-byte alignment
+enum class MemorySizeCalcType { NORMAL = 0, ALWAYS_EMPTY };
 }  // namespace ge
 
 namespace domi {
diff --git a/inc/framework/executor/ge_executor.h b/inc/framework/executor/ge_executor.h
index 5a73126f..3136e172 100644
--- a/inc/framework/executor/ge_executor.h
+++ b/inc/framework/executor/ge_executor.h
@@ -30,8 +30,6 @@
 #include "runtime/base.h"
 
 namespace ge {
-class ModelListenerAdapter;
-
 class SingleOp;
 class DynamicSingleOp;
 
@@ -55,14 +53,8 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeExecutor {
   ge::Status Initialize();
   ge::Status Finalize();
 
-  // Load model
-  ge::Status LoadModelOffline(uint32_t &model_id, const std::string &path, const std::string &key, int32_t priority,
-                              std::shared_ptr<ge::ModelListener> listener);
-
   ge::Status UnloadModel(uint32_t modelId);
 
-  ge::Status RunModel(const ge::RunModelData &input_data, ge::RunModelData &output_data);
-
   // Get input and output descriptor
   ge::Status GetModelDescInfo(uint32_t model_id, std::vector<ge::TensorDesc> &input_desc,
                               std::vector<ge::TensorDesc> &output_desc, bool new_model_desc = false);
@@ -165,12 +157,6 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeExecutor {
 
   ge::Status GetAippType(uint32_t model_id, uint32_t index, InputAippType &type, size_t &aipp_index);
 
-  ge::Status GetModelDescInfoForZeroCopy(uint32_t model_id, std::vector<ge::TensorDesc> &input_desc,
-                                         std::vector<ge::TensorDesc> &output_desc);
-
-  ge::Status LoadModel(uint32_t &model_id, const ge::ModelData &model_data,
-                       std::shared_ptr<ge::ModelListener> listener);
-
   ge::Status CommandHandle(const ge::Command &command);
 
   ge::Status SetDump(const DumpConfig &dump_config);
@@ -297,8 +283,6 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeExecutor {
  private:
   static bool isInit_;
 };
-
-ge::Status ModelInfoParser(const ge::ModelData &model, ge::ModelInfo &model_info);
 }  // namespace ge
 
 #endif  // INC_FRAMEWORK_EXECUTOR_GE_EXECUTOR_H_
diff --git a/inc/framework/generator/ge_generator.h b/inc/framework/generator/ge_generator.h
index c446b983..e0904965 100644
--- a/inc/framework/generator/ge_generator.h
+++ b/inc/framework/generator/ge_generator.h
@@ -74,11 +74,22 @@ class GeGenerator {
   /// @param [in] op_desc: the OP description.
   /// @param [in] inputs: input tensors.
   /// @param [in] outputs: output tensors.
-  /// @param [in] engine_type: specific engine.
-  /// @param [out] model_buff: model buff of single op.
+  /// @param [in] engine_type: engine type.
+  /// @param [out] model_buff: model buff of op.
   /// @return SUCCESS or FAILED
   Status BuildSingleOpModel(OpDescPtr &op_desc, const vector<GeTensor> &inputs, const vector<GeTensor> &outputs,
                             OpEngineType engine_type, ModelBufferData &model_buff);
+  ///
+  /// @ingroup ge
+  /// @brief: Build single Op into model buff.
+  /// @param [in] op_desc: the OP description.
+  /// @param [in] inputs: input tensors.
+  /// @param [in] outputs: output tensors.
+  /// @param [in] graph_name: graph name.
+  /// @param [out] graph: graph of single op.
+  /// @return SUCCESS or FAILED
+  Status BuildSingleOpGraph(OpDescPtr &op_desc, const vector<GeTensor> &inputs, const vector<GeTensor> &outputs,
+                            std::string graph_name, Graph &graph);
 
  private:
   Status GenerateModel(const Graph &graph, const string &file_name_prefix, const vector<GeTensor> &inputs,
diff --git a/inc/framework/omg/omg.h b/inc/framework/omg/omg.h
index e7ca05f7..62332b8d 100644
--- a/inc/framework/omg/omg.h
+++ b/inc/framework/omg/omg.h
@@ -73,7 +73,7 @@ Status ParseGraph(ge::Graph &graph, const std::map<string, string> &atc_params,
  * @param [key] encrypted key
  * @return Status result code
  */
-Status ConvertOmModelToJson(const char *model_file, const char *json_file);
+Status ConvertOm(const char *model_file, const char *json_file, bool is_covert_to_json);
 
 Status ConvertPbtxtToJson(const char *model_file, const char *json_file);
 /**
@@ -103,6 +103,8 @@ void GetOutputNodesNameAndIndex(std::vector<std::pair<ge::NodePtr, int32_t>> &ou
 void UpdateOmgCtxWithParserCtx();
 
 void UpdateParserCtxWithOmgCtx();
+
+void PrintModelInfo(ge::proto::ModelDef *model_def);
 }  // namespace ge
 
 namespace domi {
diff --git a/inc/framework/omg/omg_inner_types.h b/inc/framework/omg/omg_inner_types.h
index 454890aa..1049b6b5 100644
--- a/inc/framework/omg/omg_inner_types.h
+++ b/inc/framework/omg/omg_inner_types.h
@@ -26,6 +26,7 @@
 #include <vector>
 #include "framework/common/fmk_error_codes.h"
 #include "register/register_fmk_types.h"
+#include "graph/node.h"
 
 using domi::DOMI_TENSOR_ND;
 using domi::DOMI_TENSOR_RESERVED;
@@ -46,7 +47,8 @@ enum RunMode {
   GEN_OM_MODEL = 0,    // generate offline model file
   MODEL_TO_JSON = 1,   // convert to JSON file
   ONLY_PRE_CHECK = 3,  // only for pre-check
-  PBTXT_TO_JSON = 5    // pbtxt to json
+  PBTXT_TO_JSON = 5,   // pbtxt to json
+  DISPLAY_OM_INFO = 6  // display model info
 };
 
 ///
@@ -119,6 +121,8 @@ struct OmgContext {
   std::vector<std::vector<int64_t>> user_real_input_dims;
   std::vector<int64_t> cur_dynamic_dims;
   bool need_multi_batch = false;
+  std::vector<NodePtr> data_nodes;
+  std::vector<NodePtr> getnext_nosink_nodes;
 };
 }  // namespace ge
 
diff --git a/inc/framework/omg/parser/model_parser.h b/inc/framework/omg/parser/model_parser.h
index 20bfcef4..9eda685d 100644
--- a/inc/framework/omg/parser/model_parser.h
+++ b/inc/framework/omg/parser/model_parser.h
@@ -36,7 +36,7 @@ using Status = domi::Status;
 
 namespace domi {
 using GetGraphCallback = std::function<std::unique_ptr<google::protobuf::Message>(
-    const google::protobuf::Message *root_proto, const std::string &graph)>;
+  const google::protobuf::Message *root_proto, const std::string &graph)>;
 class ModelParser {
  public:
   ModelParser() {}
@@ -44,19 +44,20 @@ class ModelParser {
   virtual ~ModelParser() {}
 
   /**
-  * @ingroup domi_omg
-  * @brief Analyze network model data
-  * @param [in] file  Network model file path
-  * @param [in|out]  graph Save the network information after analysis
-  * @return SUCCESS
-  * @return Others failed
-  */
+   * @ingroup domi_omg
+   * @brief Analyze network model data
+   * @param [in] file  Network model file path
+   * @param [in|out]  graph Save the network information after analysis
+   * @return SUCCESS
+   * @return Others failed
+   */
   virtual Status Parse(const char *file, ge::Graph &graph) = 0;
 
   /**
    * @ingroup domi_omg
    * @brief Parse relevant data from memory and save it to graph
    * @param [in] input Model file memory data
+   * @param [in] input Model file memory size
    * @param [in|out] graph A graph for saving the model information after analysis
    * @return SUCCESS
    * @return FAILED
@@ -65,35 +66,46 @@ class ModelParser {
   virtual Status ParseFromMemory(const char *data, uint32_t size, ge::ComputeGraphPtr &graph) = 0;
 
   /**
-  * @ingroup domi_omg
-  * @brief Analyze network model data
-  * @param [in] proto  network model
-  * @param [in|out]  graph Save the network information after analysis
-  * @return SUCCESS
-  * @return Others failed
-  */
+   * @ingroup domi_omg
+   * @brief Parse relevant data from memory and save it to graph
+   * @param [in] input Model file memory data
+   * @param [in] input Model file memory size
+   * @param [in|out] graph A graph for saving the model information after analysis
+   * @return SUCCESS
+   * @return FAILED
+   * @author
+   */
+  virtual Status ParseFromMemory(const char *data, uint32_t size, ge::Graph &graph) = 0;
+
+  /**
+   * @ingroup domi_omg
+   * @brief Analyze network model data
+   * @param [in] proto  network model
+   * @param [in|out]  graph Save the network information after analysis
+   * @return SUCCESS
+   * @return Others failed
+   */
   virtual Status ParseProto(const google::protobuf::Message *proto, ge::ComputeGraphPtr &graph) = 0;
 
   /**
-  * @ingroup domi_omg
-  * @brief Analyze callback model data in subgraph
-  * @param [in] proto network model
-  * @param [in] callback callback of subgraph
-  * @param [in|out] graph Save the network information after analysis
-  * @return SUCCESS
-  * @return Others failed
-  */
-  virtual Status ParseProtoWithSubgraph(const google::protobuf::Message *proto,
-                                        GetGraphCallback callback,
+   * @ingroup domi_omg
+   * @brief Analyze callback model data in subgraph
+   * @param [in] proto network model
+   * @param [in] callback callback of subgraph
+   * @param [in|out] graph Save the network information after analysis
+   * @return SUCCESS
+   * @return Others failed
+   */
+  virtual Status ParseProtoWithSubgraph(const google::protobuf::Message *proto, GetGraphCallback callback,
                                         ge::ComputeGraphPtr &graph) = 0;
   /**
-  * @ingroup domi_omg
-  * @brief Convert model files to JSON format
-  * @param [in] model_file  Model file path to be converted
-  * @param [out] json_file Converted JSON file path
-  * @return SUCCESS
-  * @return Others failed
-  */
+   * @ingroup domi_omg
+   * @brief Convert model files to JSON format
+   * @param [in] model_file  Model file path to be converted
+   * @param [out] json_file Converted JSON file path
+   * @return SUCCESS
+   * @return Others failed
+   */
   virtual Status ToJson(const char *model_file, const char *json_file) { return domi::SUCCESS; }
 
   /*
diff --git a/inc/framework/omg/parser/parser_inner_ctx.h b/inc/framework/omg/parser/parser_inner_ctx.h
index f24e2639..5d91bd46 100644
--- a/inc/framework/omg/parser/parser_inner_ctx.h
+++ b/inc/framework/omg/parser/parser_inner_ctx.h
@@ -59,7 +59,7 @@ struct ParserContext {
   bool train_flag = false;
   domi::domiTensorFormat_t format = domi::DOMI_TENSOR_ND;
   domi::FrameworkType type = domi::FRAMEWORK_RESERVED;
-  RunMode run_mode = ONLY_PRE_CHECK;
+  RunMode run_mode = GEN_OM_MODEL;
   // save caffe custom proto path, used by caffe parse
   std::string custom_proto_path;
   // save caffe proto path, used by caffe parse
diff --git a/metadef b/metadef
index 4176fab0..fcd0833c 160000
--- a/metadef
+++ b/metadef
@@ -1 +1 @@
-Subproject commit 4176fab0cb2fd4f8794061916878983afb75c8da
+Subproject commit fcd0833cffcd201701f71d17db0c696c1bb01715
diff --git a/parser b/parser
index 9e392045..1601d66b 160000
--- a/parser
+++ b/parser
@@ -1 +1 @@
-Subproject commit 9e392045c26a57913b512d0686e1285650b62abe
+Subproject commit 1601d66b6187c83cbf38e762beb5538ce2c7c573
diff --git a/tests/depends/cce/CMakeLists.txt b/tests/depends/cce/CMakeLists.txt
index 85e69e6d..7550c63f 100644
--- a/tests/depends/cce/CMakeLists.txt
+++ b/tests/depends/cce/CMakeLists.txt
@@ -46,6 +46,7 @@ set(SRCS
     "${GE_CODE_DIR}/metadef/graph/anchor.cc"
     "${GE_CODE_DIR}/metadef/graph/ge_attr_value.cc"
     "${GE_CODE_DIR}/metadef/graph/buffer.cc"
+    "${GE_CODE_DIR}/metadef/graph/aligned_ptr.cc"
     "${GE_CODE_DIR}/metadef/graph/compute_graph.cc"
     "${GE_CODE_DIR}/metadef/graph/graph.cc"
     "${GE_CODE_DIR}/metadef/graph/model.cc"
diff --git a/tests/depends/error_manager/src/error_manager_stub.cc b/tests/depends/error_manager/src/error_manager_stub.cc
index 4f6b6b3d..a57b2457 100644
--- a/tests/depends/error_manager/src/error_manager_stub.cc
+++ b/tests/depends/error_manager/src/error_manager_stub.cc
@@ -58,7 +58,7 @@
   /// @param [in] value: vector parameter value
   ///
   void ErrorManager::ATCReportErrMessage(std::string error_code, const std::vector<std::string> &key,
-                           const std::vector<std::string> &value) { 
+                                         const std::vector<std::string> &value) { 
   }
 
   ///
@@ -66,13 +66,8 @@
   /// @param [in] msg: failed message map, key is error code, value is op_name
   /// @return int 0(success) -1(fail)
   ///
-  int ErrorManager::ReportMstuneCompileFailedMsg(const std::map<std::string, std::string> &msg) { return 0; }
-
-  ///
-  /// @brief save graph compile failed message from thread local map to global map
-  /// @param [in] graph_name: graph name
-  ///
-  void ErrorManager::SaveMstuneCompileFailedMsg(const std::string &graph_name) {}
+  int ErrorManager::ReportMstuneCompileFailedMsg(const std::string &root_graph_name,
+		                                 const std::map<std::string, std::string> &msg) { return 0; }
 
   ///
   /// @brief get graph compile failed message in mstune case
diff --git a/tests/depends/hccl/src/hccl_stub.cc b/tests/depends/hccl/src/hccl_stub.cc
index 1cc8fdb3..b9b9d4f6 100644
--- a/tests/depends/hccl/src/hccl_stub.cc
+++ b/tests/depends/hccl/src/hccl_stub.cc
@@ -19,26 +19,26 @@
 #include "hccl/hcom.h"
 
 HcclResult hcom_all_gather(const char *tag, void *input_count_ptr, void *output_ptr, u64 input_count,
-                             HcclDataType data_type, const char *group, rtStream_t stream) {
+                           HcclDataType data_type, const char *group, rtStream_t stream) {
   return HCCL_SUCCESS;
 }
 
 HcclResult hcom_broadcast(const char *tag, void *ptr, u64 count, HcclDataType data_type, u32 root,
-                            const char *group, rtStream_t stream) {
+                          const char *group, rtStream_t stream) {
   return HCCL_SUCCESS;
 }
 
 HcclResult hcom_all_reduce(const char *tag, void *input_ptr, void *output_ptr, u64 count, HcclDataType data_type,
-                             HcclReduceOp op, const char *group, rtStream_t stream) {
+                           HcclReduceOp op, const char *group, rtStream_t stream) {
   return HCCL_SUCCESS;
 }
 
 HcclResult hcom_get_split_strategy(const char *group, const struct model_feature *feature, u32 max_segment_num,
-                                     u32 *segment_num, u32 *segment_idx) {
+                                   u32 *segment_num, u32 *segment_idx) {
   return HCCL_SUCCESS;
 }
 
 HcclResult hcom_reduce_scatter(const char *tag, void *input_ptr, void *output_ptr, u64 count,
-                                 HcclDataType data_type, HcclReduceOp op, const char *group, rtStream_t stream) {
+                               HcclDataType data_type, HcclReduceOp op, const char *group, rtStream_t stream) {
   return HCCL_SUCCESS;
 }
diff --git a/tests/depends/mmpa/src/mmpa_stub.cc b/tests/depends/mmpa/src/mmpa_stub.cc
index 17a0c8e4..de09c52c 100644
--- a/tests/depends/mmpa/src/mmpa_stub.cc
+++ b/tests/depends/mmpa/src/mmpa_stub.cc
@@ -272,3 +272,8 @@ VOID *mmDlsym(VOID *handle, const CHAR *funcName)
 {
   return NULL;
 }
+
+INT32 mmGetPid()
+{
+  return (INT32)getpid();
+}
diff --git a/tests/depends/runtime/src/runtime_stub.cc b/tests/depends/runtime/src/runtime_stub.cc
index 2ab6684d..9b45e7e2 100644
--- a/tests/depends/runtime/src/runtime_stub.cc
+++ b/tests/depends/runtime/src/runtime_stub.cc
@@ -325,7 +325,7 @@ rtError_t rtSetTaskFailCallback(rtTaskFailCallback callback)
 }
 
 rtError_t rtMallocHostSharedMemory(rtMallocHostSharedMemoryIn *in,
-		                   rtMallocHostSharedMemoryOut *out)
+		                               rtMallocHostSharedMemoryOut *out)
 {
   out->ptr = new uint8_t[in->size];
   out->devPtr = new uint8_t[in->size];
@@ -384,3 +384,8 @@ rtError_t rtModelExit(rtModel_t model, rtStream_t stream)
 {
  return RT_ERROR_NONE;
 }
+
+rtError_t rtGetTaskIdAndStreamID(uint32_t *taskId, uint32_t *streamId)
+{
+ return RT_ERROR_NONE;
+}
diff --git a/tests/st/CMakeLists.txt b/tests/st/CMakeLists.txt
deleted file mode 100644
index 56babec1..00000000
--- a/tests/st/CMakeLists.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright 2019-2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-cmake_minimum_required(VERSION 3.0)
-set(CMAKE_CXX_STANDARD 11)
-project(ge_st CXX C)
-
-set(CMAKE_CXX_FLAGS "-O1 -fPIC -Wl,-unresolved-symbols=ignore-in-shared-libs")
-
-
-file(GLOB_RECURSE RES50_TRAIN_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
-        "resnet50/resnet50_train.cc"
-        "resnet50/common.cc"
-)
-
-include_directories(${GE_SOURCE_DIR}/inc)
-include_directories(${GE_SOURCE_DIR}/inc/graph)
-include_directories(${GE_SOURCE_DIR}/inc/framework)
-include_directories(${GE_SOURCE_DIR}/inc/external)
-include_directories(${GE_SOURCE_DIR}/inc/external/ge)
-include_directories(${GE_SOURCE_DIR}/inc/external/graph)
-include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc)
-include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc/ops)
-include_directories(/usr/local/HiAI/opp/op_proto/built-in/inc)
-
-add_executable(st_resnet50_train ${RES50_TRAIN_SRCS})
-target_link_libraries(st_resnet50_train
-        ${PROTOBUF_LIBRARY}
-        ge_client_train ge_memory
-)
\ No newline at end of file
diff --git a/tests/st/resnet50/common.cc b/tests/st/resnet50/common.cc
deleted file mode 100644
index 674ef926..00000000
--- a/tests/st/resnet50/common.cc
+++ /dev/null
@@ -1,768 +0,0 @@
-/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <math.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <iostream>
-#include <vector>
-
-#include "common.h"
-#include "model.h"
-
-#define MAX_HEAD_SIZE 50
-
-using namespace std;
-using namespace ge;
-
-void update_op_format(Operator ops, Format format) {
-  printf("set format begin.........\n");
-  ge::TensorDesc tensor_desc_x = ops.GetInputDesc("x");
-  ge::TensorDesc tensor_desc_y = ops.GetOutputDesc("y");
-  Format f_x0 = tensor_desc_x.GetFormat();
-  Format f_y0 = tensor_desc_x.GetFormat();
-  printf("before set  x format:%d \n", f_x0);
-  printf("before set  y format:%d \n", f_y0);
-  printf("format to be set is :%d \n", format);
-  tensor_desc_x.SetFormat(format);
-  tensor_desc_y.SetFormat(format);
-  ops.UpdateInputDesc("x", tensor_desc_x);
-  ops.UpdateOutputDesc("y", tensor_desc_y);
-  Format f_x = tensor_desc_x.GetFormat();
-  Format f_y = tensor_desc_y.GetFormat();
-  printf("after set  x format:%d \n", f_x);
-  printf("after set  y format:%d \n", f_y);
-}
-
-/// getDimInfo: get dim info from data file
-/// param:
-/// fp: the testing datafile object
-///
-/// return :
-/// dim_info: array to store the info of the dim in datafile, like [4,3,3,6,3,162(3*3*6*3)],4 is dim size,3,3,6,3 is the
-/// dim shape data_size: the size of the testing data including the data file
-void getDimInfo(FILE *fp, std::vector<uint64_t> &dim_info) {
-  // get dim info from hisi testing data file
-  uint32_t *dim_buffer = (uint32_t *)malloc(MAX_HEAD_SIZE * sizeof(uint32_t));
-  fread(dim_buffer, sizeof(uint32_t), MAX_HEAD_SIZE, fp);
-  dim_info.push_back(*dim_buffer);  // get dim size
-
-  // get data shape to compute the datasize
-  uint64_t data_size = 1;
-  uint32_t i = 1;
-  for (; i <= dim_info[0]; i++) {
-    dim_info.push_back(*(dim_buffer + i));
-    data_size *= *(dim_buffer + i);
-  }
-  dim_info.push_back(data_size);
-
-  free(dim_buffer);
-}
-
-/// readTestDataFile: read test date from hisi .t datafile
-/// param:
-///  infile: the path of hisi .t datafile
-/// return:
-///  dim_info: array to store the info of the dim in datafile, like [4,3,3,6,3],4 is dim size,3,3,6,3 is the dim shape
-void *readTestDataFile(std::string infile, std::vector<uint64_t> &dim_info) {
-  FILE *fp;
-  fp = fopen(infile.c_str(), "r");
-
-  if (fp == NULL) {
-    printf("ERROR: cant't open file %s\n", infile.c_str());
-    return NULL;
-  } else {
-    getDimInfo(fp, dim_info);
-    uint64_t data_size = dim_info[dim_info.size() - 1];
-
-    fclose(fp);
-
-    fp = fopen(infile.c_str(), "r");
-    if (fp == NULL) {
-      printf("ERROR: cant't open file %s\n", infile.c_str());
-      return NULL;
-    }
-    uint32_t *memory = (uint32_t *)malloc((dim_info[0] + 1 + data_size) * sizeof(uint32_t));
-    fread(memory, sizeof(uint32_t), (dim_info[0] + 1 + data_size), fp);
-    fclose(fp);
-    return memory + (dim_info[0] + 1);
-  }
-}
-
-void *readUint8TestDataFile(std::string infile, int size) {
-  FILE *fp;
-  fp = fopen(infile.c_str(), "r");
-
-  if (fp == NULL) {
-    printf("ERROR: cant't open file %s\n", infile.c_str());
-    return NULL;
-  }
-  uint8_t *memory = (uint8_t *)malloc((size) * sizeof(uint8_t));
-  fread(memory, sizeof(uint8_t), (size), fp);
-  fclose(fp);
-  return memory;
-}
-
-/// allclose
-/// param:
-///  a:compared file a
-///  b:compared file b
-///  count: the count size which will compare
-///  rtol:
-///  atol:
-/// return:
-///  true or false
-bool allclose(float *a, float *b, uint64_t count, float rtol = 1e-05, float atol = 1e-08) {
-  uint32_t i = 0;
-
-  for (; i < count; ++i) {
-    if (fabs(a[i] - b[i]) > (atol + rtol * fabs(b[i]))) {
-      printf("compara failed: i= %d, a[i]=%f, b[i]=%f,atol=%f,rtol=%f\n", i, a[i], b[i], atol, rtol);
-      return false;
-    }
-  }
-
-  return true;
-}
-
-/// compFp32WithTData: compare the data with the data in hisi .t file
-/// param:
-///  actual_output_data: the result of ge
-///  expected_data_file: the path of hisi .t result file
-///  rtol:
-///  atol:
-/// return:
-///  true of false
-bool compFp32WithTData(float *actual_output_data, std::string expected_data_file, float rtol = 1e-05, float atol = 1e-08) {
-  std::vector<uint64_t> dim_info;
-  float *expected_output_data = (float *)readTestDataFile(expected_data_file, dim_info);
-
-  uint32_t i = 1;
-  uint64_t data_size = 1;
-  for (; i <= dim_info[0]; i++) {
-    data_size *= dim_info[i];
-  }
-  return allclose(actual_output_data, expected_output_data, data_size, rtol, atol);
-}
-
-int SwitchDatatype(DataType dt) {
-  int size = 1;
-  if (dt == ge::DT_FLOAT) size = 4;
-  if (dt == ge::DT_INT32) size = 4;
-  if (dt == ge::DT_FLOAT16) size = 2;
-  if (dt == ge::DT_INT64) size = 8;
-  return size;
-}
-
-ge::Tensor genTensor(std::vector<int64_t> tensor_shape, Format format, DataType dt) {
-  int size = 1;
-  for (int i = 0; i < tensor_shape.size(); i++) {
-    size = size * tensor_shape[i];
-  }
-
-  int data_type_size = SwitchDatatype(dt);
-
-  size = abs(size * data_type_size);
-  vector<uint8_t> data_value;
-
-  if (size == 0) {
-    TensorDesc input_tensor_desc = TensorDesc(ge::Shape(tensor_shape), format, dt);
-    input_tensor_desc.SetRealDimCnt(tensor_shape.size());
-    Tensor gen_tensor = Tensor(input_tensor_desc, data_value);
-    return gen_tensor;
-  }
-  for (int i = 0; i < size; i++) {
-    data_value.push_back(1);
-  }
-  TensorDesc input_tensor_desc = TensorDesc(ge::Shape(tensor_shape), format, dt);
-  input_tensor_desc.SetRealDimCnt(tensor_shape.size());
-  Tensor gen_tensor = Tensor(input_tensor_desc, data_value);
-  return gen_tensor;
-}
-
-ge::Tensor genTensor_withVaule(std::vector<int64_t> tensor_shape, float value) {
-  int size = 1;
-  for (int i = 0; i < tensor_shape.size(); i++) {
-    size = size * tensor_shape[i];
-  }
-
-  float *data_value = new float[size];
-  for (int i = 0; i < size; i++) {
-    *(data_value + i) = value;
-  }
-  Tensor gen_ge_tensor;
-  TensorDesc input_tensor_desc = TensorDesc(ge::Shape(tensor_shape), FORMAT_NCHW);
-  gen_ge_tensor.SetTensorDesc(input_tensor_desc);
-  gen_ge_tensor.SetData((uint8_t *)data_value, size * 4);
-
-  return gen_ge_tensor;
-}
-
-Tensor genTesnor_Shape_as_data(std::vector<int64_t> tensor_shape) {
-  Format format = FORMAT_NCHW;
-  DataType dt = DT_INT32;
-  int size = tensor_shape.size();
-  int32_t *tensor_data = new int32_t[size];
-  std::cout << "shape tensor size:" << size << endl;
-  for (int i = 0; i < size; i++) {
-    *(tensor_data + i) = tensor_shape[i];
-  }
-
-  Tensor gen_tensor;
-  TensorDesc input_tensor_desc = TensorDesc(ge::Shape({size}), FORMAT_NCHW, DT_INT32);
-  gen_tensor.SetData((uint8_t *)tensor_data, size * GetDatTypeSize(dt));
-  gen_tensor.SetTensorDesc(input_tensor_desc);
-
-  return gen_tensor;
-}
-
-/// train_flag is 0 when infer; train_flag is 1 when train; train_flag is 0 default
-/// run_mode_path is not 0,1,2 when TBE; run_mode_path is 1 when FE; run_mode_path is 0 default
-/// run_mode_path is 2 now when AICPU, ge.enabledlocalFmkop is 1
-ge::Status GEInitialize_api(string train_flag, string run_mode_path) {
-  ge::Status ret;
-  if (run_mode_path == "0") {
-    const std::map<string, string> config = {
-        {"device_id", "0,2,4,6"},
-        {"rank_table_file", "hccl from csa/paas"},
-        {"ge.graphRunMode", train_flag},
-        {"ge.aicpuFlag", "1"},
-        {"ge.feFlag", "1"},
-        {DDK_VERSION_FLAG, "1.60.T17.B830"},
-        {"ge.soLoadPath",
-         "/usr/local/HiAI/runtime/lib64/plugin/opskernel/libfe.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/"
-         "libaicpu_plugin.so"}};
-    ret = ge::GEInitialize(config);
-  } else if (run_mode_path == "1") {
-    const std::map<string, string> config = {
-        {"device_id", "0,2,4,6"},
-        {"rank_table_file", "hccl from csa/paas"},
-        {"ge.graphRunMode", train_flag},
-        {"ge.feFlag", "1"},
-        {DDK_VERSION_FLAG, "1.60.T17.B830"},
-        {TBE_PLUGIN_PATH_FLAG, "/usr/local/HiAI/runtime/lib64/tbe_plugin/bert"},
-        {"ge.soLoadPath", "/usr/local/HiAI/runtime/lib64/plugin/opskernel/libfe.so"}};
-    ret = ge::GEInitialize(config);
-  } else if (run_mode_path == "2") {
-    const std::map<string, string> config = {{"device_id", "0,2,4,6"},
-                                             {"rank_table_file", "hccl from csa/paas"},
-                                             {"ge.graphRunMode", train_flag},
-                                             {LOCAL_FMKOP_FLAG, "1"}};
-    ret = ge::GEInitialize(config);
-  } else {
-    const std::map<string, string> config = {
-        {"device_id", "0,2,4,6"},
-        {"rank_table_file", "hccl from csa/paas"},
-        {"ge.graphRunMode", train_flag},
-        {DDK_VERSION_FLAG, "1.60.T17.B830"},
-        {TBE_PLUGIN_PATH_FLAG, "/usr/local/HiAI/runtime/lib64/tbe_plugin/" + run_mode_path}};
-    ret = ge::GEInitialize(config);
-  }
-  std::cout << "GEInitialize_ret is " << ret << std::endl;
-
-  return ret;
-}
-
-/// train_flag is infer default
-/// run_mode: is multi group of [fe,aicpu,bert,deeplabv3,mobilenetv2,single_path_nas,ssd]
-/// but bert,deeplabv3,mobilenetv2,single_path_nas,ssd can only set one value from array
-/// eg:"fe,aicpu,bert" or "fe", default is “fe”
-/// "fe,aicpu,bert" remain open fe aicpu and bert
-ge::Status GEInitialize_api_new(string train_flag, string run_mode) {
-  ge::Status ret;
-  vector<string> modes;
-
-  char *strs = new char[run_mode.length() + 1];
-  strcpy(strs, run_mode.c_str());
-  const char *delim = ",";
-  char *p = strtok(strs, delim);
-  while (p) {
-    string s = p;        // transform substr to string
-    modes.push_back(s);  // save to result array
-    p = strtok(NULL, delim);
-  }
-
-  std::map<string, string> config = {
-      {"device_id", "0,2,4,6"},
-      {"rank_table_file", "hccl from csa/paas"},
-      {DDK_VERSION_FLAG, "1.60.T17.B830"},
-      {"ge.opsProtoLibPath", "/usr/local/HiAI/runtime/ops/op_proto/built-in/libopsproto.so"}};
-  if (train_flag == "infer")
-    config.insert(pair<string, string>("ge.graphRunMode", "0"));
-  else if (train_flag == "train")
-    config.insert(pair<string, string>("ge.graphRunMode", "1"));
-  else
-    std::cout << "GeInitialize give the error param" << std::endl;
-
-  for (int i = 0; i < modes.size(); i++) {
-    if (modes[i] == "fe") {
-      config.insert(pair<string, string>("ge.feFlag", "1"));
-      if (config.find("ge.soLoadPath") != config.end()) {
-        config["ge.soLoadPath"] =
-            "/usr/local/HiAI/runtime/lib64/plugin/opskernel/libfe.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/"
-            "libaicpu_plugin.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/libge_local_engine.so:/usr/local/HiAI/"
-            "runtime/lib64/plugin/opskernel/librts_engine.so";
-      } else {
-        config.insert(pair<string, string>(
-            "ge.soLoadPath",
-            "/usr/local/HiAI/runtime/lib64/plugin/opskernel/libfe.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/"
-            "libge_local_engine.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/librts_engine.so"));
-      }
-    } else if (modes[i] == "aicpu") {
-      config.insert(pair<string, string>("ge.aicpuFlag", "1"));
-      if (config.find("ge.soLoadPath") != config.end()) {
-        config["ge.soLoadPath"] =
-            "/usr/local/HiAI/runtime/lib64/plugin/opskernel/libfe.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/"
-            "libaicpu_plugin.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/libge_local_engine.so:/usr/local/HiAI/"
-            "runtime/lib64/plugin/opskernel/librts_engine.so";
-      } else {
-        config.insert(pair<string, string>(
-            "ge.soLoadPath",
-            "/usr/local/HiAI/runtime/lib64/plugin/opskernel/libaicpu_plugin.so:/usr/local/HiAI/runtime/lib64/plugin/"
-            "opskernel/libge_local_engine.so:/usr/local/HiAI/runtime/lib64/plugin/opskernel/librts_engine.so"));
-      }
-    } else if (modes[i] == "bert" || modes[i] == "deeplabv3" || modes[i] == "mobilenetv2" ||
-               modes[i] == "single_path_nas" || modes[i] == "ssd") {
-      config.insert(pair<string, string>(TBE_PLUGIN_PATH_FLAG, "/usr/local/HiAI/runtime/lib64/tbe_plugin/" + modes[i]));
-    } else if (modes[i] == "plugin") {
-
-    } else
-      std::cout << "GeInitialize give the error param" << std::endl;
-  }
-  ret = ge::GEInitialize(config);
-
-  std::cout << "GEInitialize_ret is " << ret << std::endl;
-
-  return ret;
-}
-
-ge::Status GEFinalize_api() {
-  ge::Status ret = ge::GEFinalize();
-  std::cout << "GEFinalize ret is " << ret << std::endl;
-
-  return ret;
-}
-
-/// set train_flag
-/// if run_mode_path is "fe" remain FE process; "fe,plugin" is FE and TBE plugin process
-/// "aicpu" is open aicpu plugin
-int RunGraph_initData(Graph &graph, string op_name, map<string, std::vector<int64_t>> attr_test, string train_flag,
-                      string run_mode_path) {
-  std::map<string, string> options = {{RUN_FLAG, "1"}};
-  uint32_t graph_id = 0;
-
-  ge::Status ret = GEInitialize_api_new(train_flag, run_mode_path);
-  EXPECT_EQ(ret, ge::SUCCESS);
-
-  ge::Session *session = new Session(options);
-  ASSERT_TRUE(session != NULL);
-
-  std::vector<Tensor> input;
-  if (attr_test.find("input1") != attr_test.end()) {
-    Tensor input_tensor = genTensor(attr_test["input1"]);
-    input.push_back(input_tensor);
-  }
-  if (attr_test.find("input2") != attr_test.end()) {
-    Tensor input_tensor = genTensor(attr_test["input2"]);
-    input.push_back(input_tensor);
-  }
-  if (attr_test.find("input3") != attr_test.end()) {
-    Tensor input_tensor = genTensor(attr_test["input3"]);
-    input.push_back(input_tensor);
-  }
-  std::vector<Tensor> output;
-
-  ret = session->AddGraph(graph_id, graph);
-  EXPECT_EQ(ret, ge::SUCCESS);
-  if (train_flag == "1") {
-    setenv("GE_TRAIN", "1", true);
-    ret = session->RunGraph(graph_id, input, output);
-    setenv("GE_TRAIN", "0", true);
-  } else {
-    ret = session->RunGraph(graph_id, input, output);
-  }
-  delete session;
-  GEFinalize_api();
-
-  if (ret != ge::SUCCESS) {
-    std::cout << " run graph failed" << std::endl;
-    return -1;
-  } else {
-    return 0;
-  }
-}
-
-ge::Status session_add_and_run_graph(ge::Session *session, uint32_t graph_id, Graph &graph, std::vector<Tensor> inputs,
-                                     std::vector<Tensor> &outputs) {
-  ge::Status ret = session->AddGraph(graph_id, graph);
-  EXPECT_EQ(ret, ge::SUCCESS);
-  ret = session->RunGraph(graph_id, inputs, outputs);
-
-  return ret;
-}
-
-ge::Session *create_session() {
-  // Init session
-  std::map<string, string> options = {{"a", "b"}, {TRAIN_FLAG, "1"}};
-  ge::Session *session = new Session(options);
-  ASSERT_TRUE(session != NULL);
-
-  return session;
-}
-
-ge::Session *create_aipp_session() {
-  // Init session
-  std::map<string, string> options = {{"a", "b"}, {TRAIN_FLAG, "1"}, {"ge.insertOpFile", "/root/host/ge/aipp.cfg"}};
-  ge::Session *session = new Session(options);
-  ASSERT_TRUE(session != NULL);
-
-  return session;
-}
-
-int buildCheckPointGraph(Graph &graph, map<string, TensorDesc> variables) {
-  std::vector<Operator> inputs{};
-  std::vector<Operator> outputs{};
-
-  for (map<string, TensorDesc>::iterator it = variables.begin(); it != variables.end(); ++it) {
-    auto var = op::Variable(string(it->first));
-    var.update_output_desc_y(it->second);
-    inputs.push_back(var);
-    graph.AddOp(var);
-  }
-
-  auto save = op::Save().create_dynamic_input_tensors(inputs.size());
-  for (int i = 0; i < inputs.size(); i++) {
-    save.set_dynamic_input_tensors(i, inputs[i]);
-  }
-
-  graph.SetInputs(inputs).SetOutputs(outputs);
-  return 0;
-}
-
-int buildInitGraph(Graph &graph, std::vector<TensorDesc> desc_var, std::vector<std::string> name_var,
-                   std::vector<float> values_var) {
-  std::vector<Operator> inputs{};
-  std::vector<Operator> outputs{};
-
-  for (int i = 0; i < desc_var.size(); i++) {
-    desc_var[i].SetRealDimCnt(desc_var[i].GetShape().GetDimNum());
-    auto tensor_data = genTensor_withVaule(desc_var[i].GetShape().GetDims(), values_var[i]);
-    auto var_constant = op::Constant().set_attr_value(tensor_data);
-    var_constant.update_output_desc_y(desc_var[i]);
-
-    auto var_init = op::Variable(string(name_var[i]));
-    var_init.update_output_desc_y(desc_var[i]);
-    auto var_assign = op::Assign().set_input_ref(var_init).set_input_value(var_constant);
-    inputs.push_back(var_init);
-  }
-  graph.SetInputs(inputs).SetOutputs(outputs);
-  return 0;
-}
-
-int buildInitGraph_other_dataType(Graph &graph, std::vector<TensorDesc> desc_var, std::vector<std::string> name_var) {
-  std::vector<Operator> inputs{};
-  std::vector<Operator> outputs{};
-
-  for (int i = 0; i < desc_var.size(); i++) {
-    desc_var[i].SetRealDimCnt(desc_var[i].GetShape().GetDimNum());
-    auto tensor_data = genTensor(desc_var[i].GetShape().GetDims(), desc_var[i].GetFormat(), desc_var[i].GetDataType());
-    auto var_constant = op::Constant().set_attr_value(tensor_data);
-    var_constant.update_output_desc_y(desc_var[i]);
-
-    auto var_init = op::Variable(string(name_var[i]));
-    var_init.update_output_desc_y(desc_var[i]);
-    auto var_assign = op::Assign().set_input_ref(var_init).set_input_value(var_constant);
-    inputs.push_back(var_init);
-
-    graph.AddOp(var_constant);
-    graph.AddOp(var_init);
-    graph.AddOp(var_assign);
-  }
-  graph.SetInputs(inputs).SetOutputs(outputs);
-  return 0;
-}
-
-bool build_multi_input_multi_output_graph(Graph &graph) {
-  auto data1 = op::Data("Data1").set_attr_index(0);
-  auto data2 = op::Data("Data2").set_attr_index(1);
-
-  vector<uint64_t> dim_info;
-
-  auto relu1 = op::Relu("Relu1").set_input_x(data1);
-  auto relu2 = op::Relu("Relu2").set_input_x(data2);
-
-  auto eltwise = op::Eltwise("Eltwise")
-                     .create_dynamic_input_x(2)
-                     .set_dynamic_input_x(0, relu1)
-                     .set_dynamic_input_x(1, relu2)
-                     .set_attr_N(2)
-                     .set_attr_mode(1)
-                     .set_attr_coeff({1, 1});
-
-  auto eltwise1 = op::Eltwise("Eltwise1")
-                      .create_dynamic_input_x(2)
-                      .set_dynamic_input_x(0, eltwise)
-                      .set_dynamic_input_x(1, eltwise)
-                      .set_attr_N(2)
-                      .set_attr_mode(1)
-                      .set_attr_coeff({1, 1});
-
-  auto eltwise2 = op::Eltwise("Eltwise2")
-                      .create_dynamic_input_x(2)
-                      .set_dynamic_input_x(0, eltwise)
-                      .set_dynamic_input_x(1, eltwise)
-                      .set_attr_N(2)
-                      .set_attr_mode(1)
-                      .set_attr_coeff({1, 1});
-
-  std::vector<Operator> inputs{data1, data2};
-  std::vector<Operator> outputs{eltwise1, eltwise2};
-  graph.SetInputs(inputs).SetOutputs(outputs);
-  return true;
-}
-
-void build_big_graph(Graph &graph, map<string, std::vector<int64_t>> attr) {
-  auto data = op::Data("Data").set_attr_index(0);
-  auto weight = op::Const("weight1").set_attr_value(genTensor(attr["weight"]));
-  vector<int64_t> weight_shape(attr["weight"].begin(), attr["weight"].end());
-  TensorDesc weight_desc(ge::Shape(weight_shape), FORMAT_NCHW, DT_FLOAT);
-  weight.update_output_desc_y(weight_desc);
-  auto conv_1 = op::Conv2D("conv1").set_input_x(data).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-
-  auto conv_2 = op::Conv2D("conv2").set_input_x(conv_1).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_3 = op::Conv2D("conv3").set_input_x(conv_2).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_4 = op::Conv2D("conv4").set_input_x(conv_3).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_5 = op::Conv2D("conv5").set_input_x(conv_4).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_6 = op::Conv2D("conv6").set_input_x(conv_5).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_7 = op::Conv2D("conv7").set_input_x(conv_6).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_8 = op::Conv2D("conv8").set_input_x(conv_7).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_9 = op::Conv2D("conv9").set_input_x(conv_8).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_10 = op::Conv2D("conv10").set_input_x(conv_9).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_11 = op::Conv2D("conv11").set_input_x(conv_10).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_12 = op::Conv2D("conv12").set_input_x(conv_11).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_13 = op::Conv2D("conv13").set_input_x(conv_12).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_14 = op::Conv2D("conv14").set_input_x(conv_13).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_15 = op::Conv2D("conv15").set_input_x(conv_14).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_16 = op::Conv2D("conv16").set_input_x(conv_15).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_17 = op::Conv2D("conv17").set_input_x(conv_16).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_18 = op::Conv2D("conv18").set_input_x(conv_17).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_19 = op::Conv2D("conv19").set_input_x(conv_18).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_20 = op::Conv2D("conv20").set_input_x(conv_19).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_21 = op::Conv2D("conv21").set_input_x(conv_20).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_22 = op::Conv2D("conv22").set_input_x(conv_21).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_23 = op::Conv2D("conv23").set_input_x(conv_22).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_24 = op::Conv2D("conv24").set_input_x(conv_23).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_25 = op::Conv2D("conv25").set_input_x(conv_24).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_26 = op::Conv2D("conv26").set_input_x(conv_25).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_27 = op::Conv2D("conv27").set_input_x(conv_26).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_28 = op::Conv2D("conv28").set_input_x(conv_27).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_29 = op::Conv2D("conv29").set_input_x(conv_28).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_30 = op::Conv2D("conv30").set_input_x(conv_29).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_31 = op::Conv2D("conv31").set_input_x(conv_30).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_32 = op::Conv2D("conv32").set_input_x(conv_31).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_33 = op::Conv2D("conv33").set_input_x(conv_32).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_34 = op::Conv2D("conv34").set_input_x(conv_33).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_35 = op::Conv2D("conv35").set_input_x(conv_34).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_36 = op::Conv2D("conv36").set_input_x(conv_35).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_37 = op::Conv2D("conv37").set_input_x(conv_36).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_38 = op::Conv2D("conv38").set_input_x(conv_37).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_39 = op::Conv2D("conv39").set_input_x(conv_38).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_40 = op::Conv2D("conv40").set_input_x(conv_39).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_41 = op::Conv2D("conv41").set_input_x(conv_40).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_42 = op::Conv2D("conv42").set_input_x(conv_41).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_43 = op::Conv2D("conv43").set_input_x(conv_42).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_44 = op::Conv2D("conv44").set_input_x(conv_43).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_45 = op::Conv2D("conv45").set_input_x(conv_44).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_46 = op::Conv2D("conv46").set_input_x(conv_45).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_47 = op::Conv2D("conv47").set_input_x(conv_46).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_48 = op::Conv2D("conv48").set_input_x(conv_47).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_49 = op::Conv2D("conv49").set_input_x(conv_48).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_50 = op::Conv2D("conv50").set_input_x(conv_49).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_51 = op::Conv2D("conv51").set_input_x(conv_50).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_52 = op::Conv2D("conv52").set_input_x(conv_51).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_53 = op::Conv2D("conv53").set_input_x(conv_52).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_54 = op::Conv2D("conv54").set_input_x(conv_53).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_55 = op::Conv2D("conv55").set_input_x(conv_54).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_56 = op::Conv2D("conv56").set_input_x(conv_55).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_57 = op::Conv2D("conv57").set_input_x(conv_56).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_58 = op::Conv2D("conv58").set_input_x(conv_57).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_59 = op::Conv2D("conv59").set_input_x(conv_58).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_60 = op::Conv2D("conv60").set_input_x(conv_59).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_61 = op::Conv2D("conv61").set_input_x(conv_60).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_62 = op::Conv2D("conv62").set_input_x(conv_61).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_63 = op::Conv2D("conv63").set_input_x(conv_62).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_64 = op::Conv2D("conv64").set_input_x(conv_63).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_65 = op::Conv2D("conv65").set_input_x(conv_64).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_66 = op::Conv2D("conv66").set_input_x(conv_65).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_67 = op::Conv2D("conv67").set_input_x(conv_66).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_68 = op::Conv2D("conv68").set_input_x(conv_67).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_69 = op::Conv2D("conv69").set_input_x(conv_68).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_70 = op::Conv2D("conv70").set_input_x(conv_69).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_71 = op::Conv2D("conv71").set_input_x(conv_70).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_72 = op::Conv2D("conv72").set_input_x(conv_71).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_73 = op::Conv2D("conv73").set_input_x(conv_72).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_74 = op::Conv2D("conv74").set_input_x(conv_73).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_75 = op::Conv2D("conv75").set_input_x(conv_74).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_76 = op::Conv2D("conv76").set_input_x(conv_75).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_77 = op::Conv2D("conv77").set_input_x(conv_76).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_78 = op::Conv2D("conv78").set_input_x(conv_77).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_79 = op::Conv2D("conv79").set_input_x(conv_78).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_80 = op::Conv2D("conv80").set_input_x(conv_79).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_81 = op::Conv2D("conv81").set_input_x(conv_80).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_82 = op::Conv2D("conv82").set_input_x(conv_81).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_83 = op::Conv2D("conv83").set_input_x(conv_82).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_84 = op::Conv2D("conv84").set_input_x(conv_83).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_85 = op::Conv2D("conv85").set_input_x(conv_84).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_86 = op::Conv2D("conv86").set_input_x(conv_85).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_87 = op::Conv2D("conv87").set_input_x(conv_86).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_88 = op::Conv2D("conv88").set_input_x(conv_87).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_89 = op::Conv2D("conv89").set_input_x(conv_88).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_90 = op::Conv2D("conv90").set_input_x(conv_89).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_91 = op::Conv2D("conv91").set_input_x(conv_80).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_92 = op::Conv2D("conv92").set_input_x(conv_91).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_93 = op::Conv2D("conv93").set_input_x(conv_92).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_94 = op::Conv2D("conv94").set_input_x(conv_93).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_95 = op::Conv2D("conv95").set_input_x(conv_94).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_96 = op::Conv2D("conv96").set_input_x(conv_95).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_97 = op::Conv2D("conv97").set_input_x(conv_96).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_98 = op::Conv2D("conv98").set_input_x(conv_97).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_99 = op::Conv2D("conv99").set_input_x(conv_98).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_100 = op::Conv2D("conv100").set_input_x(conv_99).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_101 = op::Conv2D("conv101").set_input_x(conv_100).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_102 = op::Conv2D("conv102").set_input_x(conv_101).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_103 = op::Conv2D("conv103").set_input_x(conv_102).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_104 = op::Conv2D("conv104").set_input_x(conv_103).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_105 = op::Conv2D("conv105").set_input_x(conv_104).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_106 = op::Conv2D("conv106").set_input_x(conv_105).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_107 = op::Conv2D("conv107").set_input_x(conv_106).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_108 = op::Conv2D("conv108").set_input_x(conv_107).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_109 = op::Conv2D("conv109").set_input_x(conv_108).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_110 = op::Conv2D("conv110").set_input_x(conv_109).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_111 = op::Conv2D("conv111").set_input_x(conv_110).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_112 = op::Conv2D("conv112").set_input_x(conv_111).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_113 = op::Conv2D("conv113").set_input_x(conv_112).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_114 = op::Conv2D("conv114").set_input_x(conv_113).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_115 = op::Conv2D("conv115").set_input_x(conv_114).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_116 = op::Conv2D("conv116").set_input_x(conv_115).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_117 = op::Conv2D("conv117").set_input_x(conv_116).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_118 = op::Conv2D("conv118").set_input_x(conv_117).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_119 = op::Conv2D("conv119").set_input_x(conv_118).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_120 = op::Conv2D("conv120").set_input_x(conv_119).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_121 = op::Conv2D("conv121").set_input_x(conv_120).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_122 = op::Conv2D("conv122").set_input_x(conv_121).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_123 = op::Conv2D("conv123").set_input_x(conv_122).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_124 = op::Conv2D("conv124").set_input_x(conv_123).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_125 = op::Conv2D("conv125").set_input_x(conv_124).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_126 = op::Conv2D("conv126").set_input_x(conv_125).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_127 = op::Conv2D("conv127").set_input_x(conv_126).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_128 = op::Conv2D("conv128").set_input_x(conv_127).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_129 = op::Conv2D("conv129").set_input_x(conv_128).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-  auto conv_130 = op::Conv2D("conv130").set_input_x(conv_129).set_input_filter(weight).set_attr_pads({0,0,0,0}).set_attr_strides({1,1,1,1});
-
-  std::vector<Operator> inputs{data};
-  std::vector<Operator> outputs{conv_130};
-  graph.SetInputs(inputs).SetOutputs(outputs);
-}
-
-int GetDatTypeSize(DataType dt) {
-  int dailation = 1;
-  if (dt == ge::DT_FLOAT)
-    dailation = 4;
-  else if (dt == ge::DT_FLOAT16)
-    dailation = 2;
-  else if (dt == ge::DT_INT16)
-    dailation = 2;
-  else if (dt == ge::DT_UINT16)
-    dailation = 2;
-  else if (dt == ge::DT_INT32)
-    dailation = 4;
-  else if (dt == ge::DT_UINT32)
-    dailation = 4;
-  else if (dt == ge::DT_INT64)
-    dailation = 8;
-  else if (dt == ge::DT_UINT64)
-    dailation = 8;
-  else if (dt == ge::DT_INT8)
-    dailation = 1;
-
-  return dailation;
-}
-
-int buildConvGraph_new(Graph &graph, std::vector<TensorDesc> desc_var, std::vector<std::string> name_var, int flag,
-                       Format format) {
-  auto data_x_shape = op::Data("xShape").set_attr_index(0);
-  auto var = op::Variable(name_var[0]);
-  auto var1 = op::Variable(name_var[1]);    //add one seat of ApplyMomentum()
-  auto label1 = op::Variable(name_var[2]);  //add one seat of ApplyMomentum()
-  auto conv2dgrad = op::Conv2DBackpropFilterD("output_1");
-  auto test2 = op::ApplyMomentum();
-
-  var.update_output_desc_y(desc_var[0]);
-  var1.update_output_desc_y(desc_var[1]);
-  label1.update_output_desc_y(desc_var[2]);
-
-  graph.AddOp(var);
-  graph.AddOp(var1);
-  graph.AddOp(label1);
-
-  auto conv2d = op::Conv2D().set_input_x(data_x_shape).set_input_filter(var).set_attr_strides({1, 1, 1, 1}).set_attr_pads({0,0,0,0});
-  update_op_format(conv2d, format);
-  ge::TensorDesc tensor_desc_w = conv2d.GetInputDesc("filter");
-  tensor_desc_w.SetFormat(format);
-  conv2d.UpdateInputDesc("filter", tensor_desc_w);
-
-  if (flag >= 1) {
-    conv2dgrad.set_input_x(data_x_shape)
-        .set_attr_filter_size(desc_var[0].GetShape().GetDims())
-        .set_input_out_backprop(conv2d)
-        .set_attr_strides({1, 1, 1, 1})
-        .set_attr_pads({0, 0, 0, 0});
-    update_op_format(conv2dgrad, format);
-    graph.AddOp(conv2dgrad);
-  }
-  if (flag >= 2) {
-    // set conv2dgrad var
-    test2.set_input_accum(var1)
-        .set_input_grad(conv2dgrad)
-        .set_input_lr(label1)
-        .set_input_momentum(label1)
-        .set_input_var(var);
-    graph.AddOp(test2);
-  }
-
-  std::vector<Operator> inputs{data_x_shape};  // set all val
-  std::vector<Operator> outputs{conv2d};
-  graph.SetInputs(inputs).SetOutputs(outputs);
-  graph.AddOp(conv2d);
-
-  return 0;
-}
-
-/// load bin data_fail
-/// input_path: path of bin data_file
-/// shapes: the shape of Tensor
-/// ft: the format of Tensor
-/// dt: the dataType of Tensor
-Tensor load_variable_input_data(string input_path, std::vector<int64_t> shapes, Format ft, DataType dt) {
-  vector<uint64_t> dim_info1;
-
-  uint8_t *input_data = (uint8_t *)readTestDataFile(input_path, dim_info1);  // common.h
-  TensorDesc input_tensor_desc = TensorDesc(ge::Shape(shapes), ft, dt);
-  input_tensor_desc.SetRealDimCnt(shapes.size());
-  Tensor input_tensor = Tensor(input_tensor_desc, input_data, GetDatTypeSize(dt) * dim_info1[dim_info1[0] + 1]);
-  return input_tensor;
-}
diff --git a/tests/st/resnet50/common.h b/tests/st/resnet50/common.h
deleted file mode 100644
index 75805db7..00000000
--- a/tests/st/resnet50/common.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ST_RESNET50_GE_COMMON_H_
-#define ST_RESNET50_GE_COMMON_H_
-#include "common/ge_inner_error_codes.h"
-#include "utils/tensor_utils.h"
-
-#define MY_USER_GE_LOGI(...) GE_LOG_INFO(1, __VA_ARGS__)
-#define MY_USER_GE_LOGW(...) GE_LOG_WARN(1, __VA_ARGS__)
-#define MY_USER_GE_LOGE(...) GE_LOG_ERROR(1, 3, __VA_ARGS__)
-
-#ifndef USER_GE_LOGI
-#define USER_GE_LOGI MY_USER_GE_LOGI
-#endif  // USER_GE_LOGI
-
-#ifndef USER_GE_LOGW
-#define USER_GE_LOGW MY_USER_GE_LOGW
-#endif  // USER_GE_LOGW
-
-#ifndef USER_GE_LOGE
-#define USER_GE_LOGE MY_USER_GE_LOGE
-#endif  // USER_GE_LOGE
-
-/// train_flag is 0 when infer, train_flag is 1 when train.this param is set for RunGranph_readData() and
-/// RunGraph_initData()
-#define TRAIN_FLAG_INFER "infer"
-#define TRAIN_FLAG_TRAIN "train"
-
-#include <string.h>
-#include <unistd.h>
-#include <algorithm>
-#include <chrono>
-#include <iostream>
-#include <thread>
-#include <vector>
-
-#include "ge_api.h"
-#include "graph.h"
-#include "ptest.h"
-#include "ops/all_ops.h"
-using namespace std;
-using namespace ge;
-
-// read bin file and compile result
-void update_op_format(Operator ops, Format format = ge::FORMAT_NCHW);
-void getDimInfo(FILE *fp, std::vector<uint64_t> &dim_info);
-void *readTestDataFile(std::string infile, std::vector<uint64_t> &dim_info);
-void *readUint8TestDataFile(std::string infile, int size);
-bool allclose(float *a, float *b, uint64_t count, float rtol, float atol);
-bool compFp32WithTData(float *actual_output_data, std::string expected_data_file, float rtol, float atol);
-Tensor load_variable_input_data(string input_path, std::vector<int64_t> shapes, Format ft = ge::FORMAT_NCHW,
-                                DataType dt = ge::DT_FLOAT);
-// constructor Tensor
-int GetDatTypeSize(DataType dt);
-ge::Tensor genTensor(std::vector<int64_t> tensor_shape, Format format = ge::FORMAT_NCHW, DataType dt = ge::DT_FLOAT);
-ge::Tensor genTensor_withVaule(std::vector<int64_t> tensor_shape, float value = 1);
-Tensor genTesnor_Shape_as_data(std::vector<int64_t> tensor_shape);
-// Init GE
-ge::Status GEInitialize_api(string train_flag = "0", string run_mode_path = "0");
-ge::Status GEInitialize_api_new(string train_flag = "infer", string run_mode = "fe");
-ge::Status GEFinalize_api();
-// constructor session and build graph
-ge::Session *create_aipp_session();
-ge::Session *create_session();
-ge::Status session_add_and_run_graph(ge::Session *session, uint32_t graphId, Graph &graph, std::vector<Tensor> inputs,
-                                     std::vector<Tensor> &outputs);
-
-// common interface for infer
-int RunGraph_initData(Graph &graph, string op_name, map<string, std::vector<int64_t>> attr_test,
-                      string train_flag = "infer", string run_mode_path = "fe");
-void Inputs_load_Data(string op_name, std::vector<Tensor> &input, map<string, std::vector<int64_t>> attr_test,
-                      Format format = ge::FORMAT_NCHW, DataType dt = ge::DT_FLOAT);
-bool comparaData(std::vector<Tensor> &output, string op_name, map<string, std::vector<int64_t>> attr_test);
-int RunGraph_readData(Graph &graph, string op_name, map<string, std::vector<int64_t>> attr_test,
-                      string train_flag = "infer", string run_mode_path = "fe", Format format = ge::FORMAT_NCHW,
-                      DataType dt = ge::DT_FLOAT);
-
-// common interface for train
-int buildCheckPointGraph(Graph &graph, map<string, TensorDesc> variables);
-int buildInitGraph(Graph &graph, std::vector<TensorDesc> desc_var, std::vector<std::string> name_var,
-                   std::vector<float> values_var);
-int buildInitGraph_other_dataType(Graph &graph, std::vector<TensorDesc> desc_var, std::vector<std::string> name_var);
-
-bool build_multi_input_multi_output_graph(Graph &graph);
-void build_big_graph(Graph &graph, map<string, std::vector<int64_t>> attr);
-int buildConvGraph_new(Graph &graph, std::vector<TensorDesc> desc_var, std::vector<std::string> name_var, int flag = 2);
-
-#endif  // ST_RESNET50_GE_COMMON_H_
diff --git a/tests/st/resnet50/ptest.h b/tests/st/resnet50/ptest.h
deleted file mode 100644
index 568969f8..00000000
--- a/tests/st/resnet50/ptest.h
+++ /dev/null
@@ -1,225 +0,0 @@
-/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ST_RESNET50_PTEST_H_
-#define ST_RESNET50_PTEST_H_
-
-#include <stdarg.h>
-#include <string.h>
-#include <exception>
-#include <functional>
-#include <iostream>
-#include <list>
-#include <map>
-#include <memory>
-#include <string>
-
-namespace ptest {
-class assertion_error : public std::exception {
- public:
-  const char *what() const throw() { return "Assertion Exception"; }
-};
-
-class TestFixture {
- public:
-  virtual void SetUp() {}
-  virtual void TearDown() {}
-  void Run() { _func(); }
-  void BindFunction(std::function<void(void)> function) { _func = function; }
-  void SetName(const std::string &name) { _name = name; }
-  std::string Name() const { return _name; }
-  virtual ~TestFixture() {}
-
- private:
-  std::function<void(void)> _func;
-  std::string _name;
-};
-
-enum TestResult { SUCCESS, FAILED, UNAVAILABLE, UNKNOWN, NOCASEFOUND };
-
-class TestManager {
- public:
-  static TestManager &GetSingleton() {
-    static TestManager instance;
-    return instance;
-  }
-  void RegisterTest(const std::string &name, TestFixture *fixture) { _testfixtures[name] = fixture; }
-
-  const std::string GetRunningTestcaseName() const { return _running_testcase_name; }
-
-  const std::list<std::string> GetAllTestNames() const {
-    std::list<std::string> result;
-    for (auto &t : _testfixtures) {
-      result.push_back(t.first);
-    }
-    return result;
-  }
-
-  TestResult RunTest(const std::string &name) {
-    if (_testfixtures.find(name) == _testfixtures.end()) {
-      return NOCASEFOUND;
-    }
-
-    _running_testcase_name = name;
-
-    do {
-      SetTestResult(name, UNKNOWN);
-      _testfixtures[name]->SetUp();
-      if (_testresults[name] == FAILED) {
-        _testresults[name] = UNAVAILABLE;
-        break;
-      }
-      SetTestResult(name, SUCCESS);
-      try {
-        _testfixtures[name]->Run();
-      } catch (assertion_error &e) {
-        // Do nothing as the error has been handled by the TestManager.
-      }
-      _testfixtures[name]->TearDown();
-    } while (0);
-
-    return _testresults[name];
-  }
-  void SetTestResult(const std::string &name, TestResult result) { _testresults[name] = result; }
-  TestResult GetTestResult(const std::string &name) { return _testresults[name]; }
-
- private:
-  std::map<std::string, TestFixture *> _testfixtures;
-  std::map<std::string, TestResult> _testresults;
-  std::string _running_testcase_name;
-};
-
-class TestFixtureRegister {
- public:
-  TestFixtureRegister(const std::string &name, TestFixture *fixture, std::function<void(void)> function) {
-    fixture->BindFunction(function);
-    fixture->SetName(name);
-    TestManager::GetSingleton().RegisterTest(name, fixture);
-  }
-};
-}  // namespace ptest
-
-#define _STR(x) #x
-#define _EMPTY_NAMESPACE
-
-#define _TEST(NAMESPACE, FIXTURECLASS, TESTNAME, CASENAME)                                              \
-  void g_func_##TESTNAME##_##CASENAME(void);                                                            \
-  NAMESPACE::FIXTURECLASS g_fixture_##TESTNAME##_##CASENAME;                                            \
-  ptest::TestFixtureRegister g_register_##TESTNAME##_##CASENAME(                                        \
-      _STR(TESTNAME##_##CASENAME), &g_fixture_##TESTNAME##_##CASENAME, g_func_##TESTNAME##_##CASENAME); \
-  void g_func_##TESTNAME##_##CASENAME(void)
-
-#define TEST(TESTNAME, CASENAME) _TEST(ptest, TestFixture, TESTNAME, CASENAME)
-
-#define TEST_F(TESTFIXTURE, CASENAME) _TEST(_EMPTY_NAMESPACE, TESTFIXTURE, TESTFIXTURE, CASENAME)
-
-#define EXPECT_TRUE(X)                                                                    \
-  do {                                                                                    \
-    if (!(X)) {                                                                           \
-      std::string test_name = ptest::TestManager::GetSingleton().GetRunningTestcaseName(); \
-      ptest::TestManager::GetSingleton().SetTestResult(test_name, ptest::FAILED);          \
-      std::cerr << #X << "Expectation Failed\n"                                           \
-                << "Testcase Name: " << test_name << "\n"                                  \
-                << "File: " __FILE__ << "\tLine:" << __LINE__ << std::endl;               \
-    }                                                                                     \
-  } while (0);
-
-// With the macro definition ensures that the compiler can detect compiler warning.
-#define Max_Log_Len 1024
-#define PRINT_ERR(lpszFormat, ...)                              \
-  do {                                                          \
-    char szTmpBuf[Max_Log_Len + 1] = {0};                       \
-    snprintf(szTmpBuf, Max_Log_Len, lpszFormat, ##__VA_ARGS__); \
-    std::cerr << szTmpBuf << std::endl;                         \
-  } while (0)
-
-// Increase the content of print error messages and error to facilitate rapid analysis
-#define EXPECT_TRUE_C(X, ERR_TYPE, format, ...)                                                             \
-  do {                                                                                                      \
-    if (!(X)) {                                                                                             \
-      std::string test_name = ptest::TestManager::GetSingleton().GetRunningTestcaseName();                   \
-      ptest::TestManager::GetSingleton().SetTestResult(test_name, ptest::FAILED);                            \
-      std::cerr << #X << " Expectation Failed."                                                             \
-                << "Testcase Name: " << test_name << " File:" __FILE__ << " Line:" << __LINE__ << std::endl; \
-      PRINT_ERR("[" ERR_TYPE "]" format, ##__VA_ARGS__);                                                    \
-    }                                                                                                       \
-  } while (0)
-
-#define ASSERT_TRUE(X)                                                                    \
-  do {                                                                                    \
-    if (!(X)) {                                                                           \
-      std::string test_name = ptest::TestManager::GetSingleton().GetRunningTestcaseName(); \
-      ptest::TestManager::GetSingleton().SetTestResult(test_name, ptest::FAILED);          \
-      std::cerr << #X << "Assertion Failed\n"                                             \
-                << "Testcase Name: " << test_name << "\n"                                  \
-                << "File: " __FILE__ << "\tLine:" << __LINE__ << std::endl;               \
-      throw ptest::assertion_error();                                                     \
-    }                                                                                     \
-  } while (0);
-
-// Add printing error information and error line content for quick analysis
-#define ASSERT_TRUE_C(X, ERR_TYPE, format, ...)                                                             \
-  do {                                                                                                      \
-    if (!(X)) {                                                                                             \
-      std::string test_name = ptest::TestManager::GetSingleton().GetRunningTestcaseName();                   \
-      ptest::TestManager::GetSingleton().SetTestResult(test_name, ptest::FAILED);                            \
-      std::cerr << #X << " Assertion Failed."                                                               \
-                << "Testcase Name: " << test_name << " File:" __FILE__ << " Line:" << __LINE__ << std::endl; \
-      PRINT_ERR("[" ERR_TYPE "]" format, ##__VA_ARGS__);                                                    \
-      throw ptest::assertion_error();                                                                       \
-    }                                                                                                       \
-  } while (0);
-
-#define CONFIG_ERR "CONFIG_ERR"
-#define LOAD_MODEL_ERR "LOAD_MODEL_ERR"
-#define FILE_READ_ERR "FILE_READ_ERR"
-#define RUN_ERROR "RUN_ERROR"
-#define MEM_ERROR "MEM_ERROR"
-#define RESULT_ERR "RESULT_ERR"
-
-#define EXPECT_FALSE(X) EXPECT_TRUE(!(X))
-#define EXPECT_EQ(X, Y) EXPECT_TRUE(((X) == (Y)))
-#define EXPECT_NE(X, Y) EXPECT_TRUE(((X) != (Y)))
-#define EXPECT_GT(X, Y) EXPECT_TRUE(((X) > (Y)))
-#define EXPECT_GE(X, Y) EXPECT_TRUE(((X) >= (Y)))
-#define EXPECT_LT(X, Y) EXPECT_TRUE(((X) < (Y)))
-#define EXPECT_LE(X, Y) EXPECT_TRUE(((X) <= (Y)))
-
-#define EXPECT_FALSE_C(X, ERR_TYPE, format, ...) EXPECT_TRUE_C(!(X), ERR_TYPE, format, ##__VA_ARGS__)
-#define EXPECT_EQ_C(X, Y, ERR_TYPE, format, ...) EXPECT_TRUE_C(((X) == (Y)), ERR_TYPE, format, ##__VA_ARGS__)
-#define EXPECT_NE_C(X, Y, ERR_TYPE, format, ...) EXPECT_TRUE_C(((X) != (Y)), ERR_TYPE, format, ##__VA_ARGS__)
-#define EXPECT_GT_C(X, Y, ERR_TYPE, format, ...) EXPECT_TRUE_C(((X) > (Y)), ERR_TYPE, format, ##__VA_ARGS__)
-#define EXPECT_GE_C(X, Y, ERR_TYPE, format, ...) EXPECT_TRUE_C(((X) >= (Y)), ERR_TYPE, format, ##__VA_ARGS__)
-#define EXPECT_LT_C(X, Y, ERR_TYPE, format, ...) EXPECT_TRUE_C(((X) < (Y)), ERR_TYPE, format, ##__VA_ARGS__)
-#define EXPECT_LE_C(X, Y, ERR_TYPE, format, ...) EXPECT_TRUE_C(((X) <= (Y)), ERR_TYPE, format, ##__VA_ARGS__)
-
-#define ASSERT_FALSE(X) ASSERT_TRUE(!(X))
-#define ASSERT_EQ(X, Y) ASSERT_TRUE(((X) == (Y)))
-#define ASSERT_NE(X, Y) ASSERT_TRUE(((X) != (Y)))
-#define ASSERT_GT(X, Y) ASSERT_TRUE(((X) > (Y)))
-#define ASSERT_GE(X, Y) ASSERT_TRUE(((X) >= (Y)))
-#define ASSERT_LT(X, Y) ASSERT_TRUE(((X) < (Y)))
-#define ASSERT_LE(X, Y) ASSERT_TRUE(((X) <= (Y)))
-
-#define ASSERT_FALSE_C(X, ERR_TYPE, format, ...) ASSERT_TRUE_C(!(X), ERR_TYPE, format, ##__VA_ARGS__)
-#define ASSERT_EQ_C(X, Y, ERR_TYPE, format, ...) ASSERT_TRUE_C(((X) == (Y)), ERR_TYPE, format, ##__VA_ARGS__)
-#define ASSERT_NE_C(X, Y, ERR_TYPE, format, ...) ASSERT_TRUE_C(((X) != (Y)), ERR_TYPE, format, ##__VA_ARGS__)
-#define ASSERT_GT_C(X, Y, ERR_TYPE, format, ...) ASSERT_TRUE_C(((X) > (Y)), ERR_TYPE, format, ##__VA_ARGS__)
-#define ASSERT_GE_C(X, Y, ERR_TYPE, format, ...) ASSERT_TRUE_C(((X) >= (Y)), ERR_TYPE, format, ##__VA_ARGS__)
-#define ASSERT_LT_C(X, Y, ERR_TYPE, format, ...) ASSERT_TRUE_C(((X) < (Y)), ERR_TYPE, format, ##__VA_ARGS__)
-#define ASSERT_LE_C(X, Y, ERR_TYPE, format, ...) ASSERT_TRUE_C(((X) <= (Y)), ERR_TYPE, format, ##__VA_ARGS__)
-
-#endif  // ST_RESNET50_PTEST_H_
diff --git a/tests/st/resnet50/resnet50_train.cc b/tests/st/resnet50/resnet50_train.cc
deleted file mode 100644
index f1d1e58d..00000000
--- a/tests/st/resnet50/resnet50_train.cc
+++ /dev/null
@@ -1,852 +0,0 @@
-﻿/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <assert.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <algorithm>
-#include <chrono>
-#include <ctime>
-#include <sstream>
-
-#include "common.h"
-#include "ge_api.h"
-#include "graph.h"
-#include "ops/all_ops.h"
-#include "types.h"
-#include "utils/tensor_utils.h"
-
-using namespace std;
-using namespace ge;
-using namespace op;
-
-typedef bool (*Func)(Graph &graph);
-
-#define PADDING_MODE 6
-#define GRAD_PADDING_MODE 3
-vector<int64_t> pad_1{1, 1, 1, 1};
-vector<int64_t> pad_0{0, 0, 0, 0};
-vector<int64_t> stride_1{1, 1};
-vector<int64_t> stride_2{2, 2};
-
-// (int out_channels, int h, int w, vector<uint_64> stride{1,1}, vector<uint_64> pad{1,1,1,1}, op::Data() input)
-#define GENERATE_CONV_VAR(LAYER, BLK, OPNUM, in_channels, out_channels, h, w, stride, pad, input)                     \
-  auto &LAYER##_##BLK##_##OPNUM##_input = input;                                                                      \
-                                                                                                                      \
-  TensorDesc LAYER##_##BLK##_##OPNUM##_desc(ge::Shape({out_channels, in_channels, h, w}), FORMAT_NCHW, DT_FLOAT);     \
-  auto LAYER##_##BLK##_##OPNUM##_weight = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_weight");   \
-  LAYER##_##BLK##_##OPNUM##_weight.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc);                              \
-                                                                                                                      \
-  auto LAYER##_##BLK##_##OPNUM##_mom_weight =                                                                         \
-      op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_mom_weight");                                   \
-  LAYER##_##BLK##_##OPNUM##_mom_weight.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc);                          \
-  LAYER##_##BLK##_##OPNUM##_mom_weight.update_input_desc_x(LAYER##_##BLK##_##OPNUM##_desc);                           \
-                                                                                                                      \
-  cout << string(#LAYER) + string(#BLK) + string(#OPNUM) << "'s weight shape is:" << in_channels << out_channels << h \
-       << w << endl;                                                                                                  \
-  cout << string(#LAYER) + string(#BLK) + string(#OPNUM)                                                              \
-       << "'s input_x op's shape is:" << input.GetOutputDesc("y").GetShape().GetDim(2) << endl;                       \
-  auto LAYER##_##BLK##_##OPNUM##_tmp_dims = input.GetOutputDesc("y").GetShape().GetDims();                            \
-  for (auto LAYER##_##BLK##_##OPNUM##_tmp_it = LAYER##_##BLK##_##OPNUM##_tmp_dims.begin();                            \
-       LAYER##_##BLK##_##OPNUM##_tmp_it != LAYER##_##BLK##_##OPNUM##_tmp_dims.end();                                  \
-       LAYER##_##BLK##_##OPNUM##_tmp_it++) {                                                                          \
-    cout << *LAYER##_##BLK##_##OPNUM##_tmp_it;                                                                        \
-  }                                                                                                                   \
-  cout << endl;                                                                                                       \
-                                                                                                                      \
-  auto LAYER##_##BLK##_##OPNUM = op::Conv2D(string(#LAYER) + string(#BLK) + string(#OPNUM))                           \
-                                     .set_input_x(input, "y")                                                         \
-                                     .set_input_filter(LAYER##_##BLK##_##OPNUM##_weight)                              \
-                                     .set_attr_strides({1, 1, stride[0], stride[1]})                                  \
-                                     .set_attr_pads(pad)                                                              \
-                                     .set_attr_data_format("NCHW");                                                   \
-  update_op_format(LAYER##_##BLK##_##OPNUM);
-
-#define GENERATE_CONSTANT(LAYER, BLK, OPNUM, CONSTNAME)                                                           \
-  Tensor LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_tensor;                                                          \
-  float *LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_data = new float[LAYER##_##BLK##_##OPNUM##_size];                \
-  for (int i = 0; i < (int)LAYER##_##BLK##_##OPNUM##_size; i++) {                                                 \
-    *(LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_data + i) = 0.01;                                                   \
-  }                                                                                                               \
-  LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_tensor.SetData((uint8_t *)LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_data, \
-                                                         LAYER##_##BLK##_##OPNUM##_size * sizeof(float));         \
-  LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_tensor.SetTensorDesc(LAYER##_##BLK##_##OPNUM##_desc);                   \
-                                                                                                                  \
-  auto LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_constant =                                                         \
-      op::Constant().set_attr_value(LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_tensor);                              \
-  LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_constant.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc);          \
-  delete[] LAYER##_##BLK##_##OPNUM##_##CONSTNAME##_data;
-
-#define GENERATE_CONV_VAR_VAR(LAYER, BLK, OPNUM, in_channels, out_channels, h, w, stride, pad, input)               \
-  TensorDesc LAYER##_##BLK##_##OPNUM##_desc(ge::Shape({out_channels, in_channels, h, w}), FORMAT_NCHW, DT_FLOAT);   \
-  uint32_t LAYER##_##BLK##_##OPNUM##_size = LAYER##_##BLK##_##OPNUM##_desc.GetShape().GetShapeSize();               \
-  auto LAYER##_##BLK##_##OPNUM##_weight = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_weight"); \
-  LAYER##_##BLK##_##OPNUM##_weight.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc);                            \
-                                                                                                                    \
-  auto LAYER##_##BLK##_##OPNUM##_mom_weight =                                                                       \
-      op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_mom_weight");                                 \
-  LAYER##_##BLK##_##OPNUM##_mom_weight.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc);                        \
-                                                                                                                    \
-  GENERATE_CONSTANT(LAYER, BLK, OPNUM, weight);                                                                     \
-  auto LAYER##_##BLK##_##OPNUM##_weight_assign = op::Assign()                                                       \
-                                                     .set_input_ref(LAYER##_##BLK##_##OPNUM##_weight)               \
-                                                     .set_input_value(LAYER##_##BLK##_##OPNUM##_weight_constant);   \
-                                                                                                                    \
-  GENERATE_CONSTANT(LAYER, BLK, OPNUM, mom_weight);                                                                 \
-  auto LAYER##_##BLK##_##OPNUM##_mom_weight_assign =                                                                \
-      op::Assign()                                                                                                  \
-          .set_input_ref(LAYER##_##BLK##_##OPNUM##_mom_weight)                                                      \
-          .set_input_value(LAYER##_##BLK##_##OPNUM##_mom_weight_constant);                                          \
-                                                                                                                    \
-  input.push_back(LAYER##_##BLK##_##OPNUM##_weight);                                                                \
-  input.push_back(LAYER##_##BLK##_##OPNUM##_mom_weight);
-
-// (int out_channels, Operator& input)
-#define GENERATE_BN_VAR(LAYER, BLK, OPNUM, out_channels, input)                                                   \
-  auto &LAYER##_##BLK##_##OPNUM##_input = input;                                                                  \
-                                                                                                                  \
-  TensorDesc LAYER##_##BLK##_##OPNUM##_desc(ge::Shape({1, out_channels, 1, 1}), FORMAT_NCHW, DT_FLOAT);           \
-  auto LAYER##_##BLK##_##OPNUM##_scale = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_scale"); \
-  LAYER##_##BLK##_##OPNUM##_scale.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc);                           \
-                                                                                                                  \
-  auto LAYER##_##BLK##_##OPNUM##_mom_scale =                                                                      \
-      op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_mom_scale");                                \
-  LAYER##_##BLK##_##OPNUM##_mom_scale.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc);                       \
-                                                                                                                  \
-  auto LAYER##_##BLK##_##OPNUM##_b = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_b");         \
-  LAYER##_##BLK##_##OPNUM##_b.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc);                               \
-                                                                                                                  \
-  auto LAYER##_##BLK##_##OPNUM##_mom_b = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_mom_b"); \
-  LAYER##_##BLK##_##OPNUM##_mom_b.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc);                           \
-                                                                                                                  \
-  auto LAYER##_##BLK##_##OPNUM##_mean = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_mean");   \
-  LAYER##_##BLK##_##OPNUM##_mean.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc);                            \
-  auto LAYER##_##BLK##_##OPNUM##_variance =                                                                       \
-      op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_variance");                                 \
-  LAYER##_##BLK##_##OPNUM##_variance.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc);                        \
-                                                                                                                  \
-  auto LAYER##_##BLK##_##OPNUM = op::FusedBatchNorm(string(#LAYER) + string(#BLK) + string(#OPNUM))               \
-                                     .set_input_x(input, "y")                                                     \
-                                     .set_input_scale(LAYER##_##BLK##_##OPNUM##_scale)                            \
-                                     .set_input_b(LAYER##_##BLK##_##OPNUM##_b)                                    \
-                                     .set_input_mean(LAYER##_##BLK##_##OPNUM##_mean)                              \
-                                     .set_input_variance(LAYER##_##BLK##_##OPNUM##_variance)                      \
-                                     .set_attr_mode(1)                                                            \
-                                     .set_attr_epsilon(1e-5)                                                      \
-                                     .set_attr_is_training(true);
-
-#define GENERATE_BN_VAR_VAR(LAYER, BLK, OPNUM, out_channels, input)                                                   \
-  TensorDesc LAYER##_##BLK##_##OPNUM##_desc(ge::Shape({1, out_channels, 1, 1}), FORMAT_NCHW, DT_FLOAT);               \
-  uint32_t LAYER##_##BLK##_##OPNUM##_size = LAYER##_##BLK##_##OPNUM##_desc.GetShape().GetShapeSize();                 \
-  auto LAYER##_##BLK##_##OPNUM##_scale = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_scale");     \
-  LAYER##_##BLK##_##OPNUM##_scale.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc);                               \
-                                                                                                                      \
-  auto LAYER##_##BLK##_##OPNUM##_mom_scale =                                                                          \
-      op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_mom_scale");                                    \
-  LAYER##_##BLK##_##OPNUM##_mom_scale.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc);                           \
-                                                                                                                      \
-  auto LAYER##_##BLK##_##OPNUM##_b = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_b");             \
-  LAYER##_##BLK##_##OPNUM##_b.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc);                                   \
-                                                                                                                      \
-  auto LAYER##_##BLK##_##OPNUM##_mom_b = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_mom_b");     \
-  LAYER##_##BLK##_##OPNUM##_mom_b.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc);                               \
-                                                                                                                      \
-  auto LAYER##_##BLK##_##OPNUM##_mean = op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_mean");       \
-  LAYER##_##BLK##_##OPNUM##_mean.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc);                                \
-  auto LAYER##_##BLK##_##OPNUM##_variance =                                                                           \
-      op::Variable(string(#LAYER) + string(#BLK) + string(#OPNUM) + "_variance");                                     \
-  LAYER##_##BLK##_##OPNUM##_variance.update_output_desc_y(LAYER##_##BLK##_##OPNUM##_desc);                            \
-                                                                                                                      \
-  GENERATE_CONSTANT(LAYER, BLK, OPNUM, scale);                                                                        \
-                                                                                                                      \
-  auto LAYER##_##BLK##_##OPNUM##_scale_assign = op::Assign()                                                          \
-                                                    .set_input_ref(LAYER##_##BLK##_##OPNUM##_scale)                   \
-                                                    .set_input_value(LAYER##_##BLK##_##OPNUM##_scale_constant);       \
-  GENERATE_CONSTANT(LAYER, BLK, OPNUM, mom_scale);                                                                    \
-                                                                                                                      \
-  auto LAYER##_##BLK##_##OPNUM##_mom_scale_assign =                                                                   \
-      op::Assign()                                                                                                    \
-          .set_input_ref(LAYER##_##BLK##_##OPNUM##_mom_scale)                                                         \
-          .set_input_value(LAYER##_##BLK##_##OPNUM##_mom_scale_constant);                                             \
-                                                                                                                      \
-  GENERATE_CONSTANT(LAYER, BLK, OPNUM, b);                                                                            \
-                                                                                                                      \
-  auto LAYER##_##BLK##_##OPNUM##_b_assign =                                                                           \
-      op::Assign().set_input_ref(LAYER##_##BLK##_##OPNUM##_b).set_input_value(LAYER##_##BLK##_##OPNUM##_b_constant);  \
-                                                                                                                      \
-  GENERATE_CONSTANT(LAYER, BLK, OPNUM, mom_b);                                                                        \
-                                                                                                                      \
-  auto LAYER##_##BLK##_##OPNUM##_mom_b_assign = op::Assign()                                                          \
-                                                    .set_input_ref(LAYER##_##BLK##_##OPNUM##_mom_b)                   \
-                                                    .set_input_value(LAYER##_##BLK##_##OPNUM##_mom_b_constant);       \
-  GENERATE_CONSTANT(LAYER, BLK, OPNUM, mean);                                                                         \
-                                                                                                                      \
-  auto LAYER##_##BLK##_##OPNUM##_mean_assign = op::Assign()                                                           \
-                                                   .set_input_ref(LAYER##_##BLK##_##OPNUM##_mean)                     \
-                                                   .set_input_value(LAYER##_##BLK##_##OPNUM##_mean_constant);         \
-                                                                                                                      \
-  GENERATE_CONSTANT(LAYER, BLK, OPNUM, variance);                                                                     \
-                                                                                                                      \
-  auto LAYER##_##BLK##_##OPNUM##_variance_assign = op::Assign()                                                       \
-                                                       .set_input_ref(LAYER##_##BLK##_##OPNUM##_variance)             \
-                                                       .set_input_value(LAYER##_##BLK##_##OPNUM##_variance_constant); \
-                                                                                                                      \
-  input.push_back(LAYER##_##BLK##_##OPNUM##_scale);                                                                   \
-  input.push_back(LAYER##_##BLK##_##OPNUM##_mom_scale);                                                               \
-  input.push_back(LAYER##_##BLK##_##OPNUM##_b);                                                                       \
-  input.push_back(LAYER##_##BLK##_##OPNUM##_mom_b);                                                                   \
-  input.push_back(LAYER##_##BLK##_##OPNUM##_mean);                                                                    \
-  input.push_back(LAYER##_##BLK##_##OPNUM##_variance);
-
-// (int out_channels, Operator& input)
-#define GENERATE_RELU_VAR(LAYER, BLK, OPNUM, input) \
-  auto &LAYER##_##BLK##_##OPNUM##_input = input;    \
-  auto LAYER##_##BLK##_##OPNUM = op::Relu(string(#LAYER) + string(#BLK) + string(#OPNUM)).set_input_x(input, "y");
-
-// (int out_channels, Operator& input)
-#define GENERATE_MAXPOOL_VAR(LAYER, BLK, OPNUM, input)                                                 \
-  auto &LAYER##_##BLK##_##OPNUM##_input = input;                                                       \
-                                                                                                       \
-  auto LAYER##_##BLK##_##OPNUM = op::MaxPoolWithArgmax(string(#LAYER) + string(#BLK) + string(#OPNUM)) \
-                                     .set_input_x(input, "y")                                          \
-                                     .set_attr_ksize({1, 3, 3, 1})                                     \
-                                     .set_attr_padding("SAME")                                         \
-                                     .set_attr_strides({1, 2, 2, 1});
-
-// (int out_channels, Operator& input)
-#define GENERATE_ADD_VAR(LAYER, BLK, OPNUM, input_x1, input_x2) \
-  auto LAYER##_##BLK##_##OPNUM =                                \
-      op::Add(string(#LAYER) + string(#BLK) + string(#OPNUM)).set_input_x1(input_x1, "y").set_input_x2(input_x2, "y");
-
-// (int in_channels, int out_channels,vector<int64_t> stride{1,1}, Operator& input)
-#define MAKE_RESIDUAL_BLOCK(LAYER, BLK, in_channels, out_channels, stride, input)                                 \
-  auto &LAYER##_##BLK##_input = input;                                                                            \
-  auto &LAYER##_##BLK##_stride = stride;                                                                          \
-  int LAYER##_##BLK##_out_chls = out_channels / 4;                                                                \
-                                                                                                                  \
-  GENERATE_CONV_VAR(LAYER, BLK, conv1, in_channels, LAYER##_##BLK##_out_chls, 1, 1, stride, pad_0, input);        \
-  GENERATE_BN_VAR(LAYER, BLK, bn1, LAYER##_##BLK##_out_chls, LAYER##_##BLK##_conv1);                              \
-  GENERATE_RELU_VAR(LAYER, BLK, relu1, LAYER##_##BLK##_bn1);                                                      \
-                                                                                                                  \
-  GENERATE_CONV_VAR(LAYER, BLK, conv2, LAYER##_##BLK##_out_chls, LAYER##_##BLK##_out_chls, 3, 3, stride_1, pad_1, \
-                    LAYER##_##BLK##_relu1);                                                                       \
-  GENERATE_BN_VAR(LAYER, BLK, bn2, LAYER##_##BLK##_out_chls, LAYER##_##BLK##_conv2);                              \
-  GENERATE_RELU_VAR(LAYER, BLK, relu2, LAYER##_##BLK##_bn2);                                                      \
-                                                                                                                  \
-  GENERATE_CONV_VAR(LAYER, BLK, conv3, LAYER##_##BLK##_out_chls, out_channels, 1, 1, stride_1, pad_0,             \
-                    LAYER##_##BLK##_relu2);                                                                       \
-  GENERATE_BN_VAR(LAYER, BLK, bn3, out_channels, LAYER##_##BLK##_conv3);                                          \
-                                                                                                                  \
-  GENERATE_CONV_VAR(LAYER, BLK, conv4, in_channels, out_channels, 1, 1, stride, pad_0, input);                    \
-  GENERATE_BN_VAR(LAYER, BLK, bn4, out_channels, LAYER##_##BLK##_conv4);                                          \
-                                                                                                                  \
-  GENERATE_ADD_VAR(LAYER, BLK, add5, LAYER##_##BLK##_bn3, LAYER##_##BLK##_bn4);                                   \
-  GENERATE_RELU_VAR(LAYER, BLK, relu5, LAYER##_##BLK##_add5);                                                     \
-                                                                                                                  \
-  auto &LAYER##_##BLK##_output = LAYER##_##BLK##_relu5;                                                           \
-  auto &LAYER##_##BLK##_output_label = "y";
-
-#define MAKE_RESIDUAL_BLOCK_VAR(LAYER, BLK, in_channels, out_channels, stride, input)                                 \
-  int LAYER##_##BLK##_out_chls = out_channels / 4;                                                                    \
-  GENERATE_CONV_VAR_VAR(LAYER, BLK, conv1, in_channels, LAYER##_##BLK##_out_chls, 1, 1, stride, pad_0, input);        \
-  GENERATE_BN_VAR_VAR(LAYER, BLK, bn1, LAYER##_##BLK##_out_chls, input);                                              \
-                                                                                                                      \
-  GENERATE_CONV_VAR_VAR(LAYER, BLK, conv2, LAYER##_##BLK##_out_chls, LAYER##_##BLK##_out_chls, 3, 3, stride_1, pad_1, \
-                        input);                                                                                       \
-  GENERATE_BN_VAR_VAR(LAYER, BLK, bn2, LAYER##_##BLK##_out_chls, input);                                              \
-                                                                                                                      \
-  GENERATE_CONV_VAR_VAR(LAYER, BLK, conv3, LAYER##_##BLK##_out_chls, out_channels, 1, 1, stride_1, pad_0, input);     \
-  GENERATE_BN_VAR_VAR(LAYER, BLK, bn3, out_channels, input);                                                          \
-                                                                                                                      \
-  GENERATE_CONV_VAR_VAR(LAYER, BLK, conv4, in_channels, out_channels, 1, 1, stride, pad_0, input);                    \
-  GENERATE_BN_VAR_VAR(LAYER, BLK, bn4, out_channels, input);
-
-// (int in_channels, int out_channels,vector<int64_t> stride{1,1}, Operator& input)
-#define MAKE_NORMAL_BLOCK(LAYER, BLK, in_channels, out_channels, stride, input)                                   \
-  auto &LAYER##_##BLK##_input = input;                                                                            \
-  auto &LAYER##_##BLK##_stride = stride;                                                                          \
-  int LAYER##_##BLK##_out_chls = out_channels / 4;                                                                \
-                                                                                                                  \
-  GENERATE_CONV_VAR(LAYER, BLK, conv1, in_channels, LAYER##_##BLK##_out_chls, 1, 1, stride, pad_0, input);        \
-  GENERATE_BN_VAR(LAYER, BLK, bn1, LAYER##_##BLK##_out_chls, LAYER##_##BLK##_conv1);                              \
-  GENERATE_RELU_VAR(LAYER, BLK, relu1, LAYER##_##BLK##_bn1);                                                      \
-                                                                                                                  \
-  GENERATE_CONV_VAR(LAYER, BLK, conv2, LAYER##_##BLK##_out_chls, LAYER##_##BLK##_out_chls, 3, 3, stride_1, pad_1, \
-                    LAYER##_##BLK##_relu1);                                                                       \
-  GENERATE_BN_VAR(LAYER, BLK, bn2, LAYER##_##BLK##_out_chls, LAYER##_##BLK##_conv2);                              \
-  GENERATE_RELU_VAR(LAYER, BLK, relu2, LAYER##_##BLK##_bn2);                                                      \
-                                                                                                                  \
-  GENERATE_CONV_VAR(LAYER, BLK, conv3, LAYER##_##BLK##_out_chls, out_channels, 1, 1, stride_1, pad_0,             \
-                    LAYER##_##BLK##_relu2);                                                                       \
-  GENERATE_BN_VAR(LAYER, BLK, bn3, out_channels, LAYER##_##BLK##_conv3);                                          \
-                                                                                                                  \
-  GENERATE_ADD_VAR(LAYER, BLK, add5, LAYER##_##BLK##_bn3, input);                                                 \
-  GENERATE_RELU_VAR(LAYER, BLK, relu5, LAYER##_##BLK##_add5);                                                     \
-                                                                                                                  \
-  auto &LAYER##_##BLK##_output = LAYER##_##BLK##_relu5;                                                           \
-  auto &LAYER##_##BLK##_output_label = "y";
-
-#define MAKE_NORMAL_BLOCK_VAR(LAYER, BLK, in_channels, out_channels, stride, input)                                   \
-  int LAYER##_##BLK##_out_chls = out_channels / 4;                                                                    \
-  GENERATE_CONV_VAR_VAR(LAYER, BLK, conv1, in_channels, LAYER##_##BLK##_out_chls, 1, 1, stride, pad_0, input);        \
-  GENERATE_BN_VAR_VAR(LAYER, BLK, bn1, LAYER##_##BLK##_out_chls, input);                                              \
-                                                                                                                      \
-  GENERATE_CONV_VAR_VAR(LAYER, BLK, conv2, LAYER##_##BLK##_out_chls, LAYER##_##BLK##_out_chls, 3, 3, stride_1, pad_1, \
-                        input);                                                                                       \
-  GENERATE_BN_VAR_VAR(LAYER, BLK, bn2, LAYER##_##BLK##_out_chls, input);                                              \
-                                                                                                                      \
-  GENERATE_CONV_VAR_VAR(LAYER, BLK, conv3, LAYER##_##BLK##_out_chls, out_channels, 1, 1, stride_1, pad_0, input);     \
-  GENERATE_BN_VAR_VAR(LAYER, BLK, bn3, out_channels, input);
-
-// (int in_channels, int out_channels,vector<int64_t> stride{1,1}, Operator& input)
-#define MAKE_RESIDUAL_LAYER(LAYER, in_channels, out_channels, stride, input)  \
-  MAKE_RESIDUAL_BLOCK(LAYER, blk1, in_channels, out_channels, stride, input); \
-                                                                              \
-  auto &LAYER##_output = LAYER##_blk1_output;                                 \
-  auto &LAYER##_output_label = LAYER##_blk1_output_label;
-
-#define MAKE_RESIDUAL_LAYER_VAR(LAYER, in_channels, out_channels, stride, input) \
-  MAKE_RESIDUAL_BLOCK_VAR(LAYER, blk1, in_channels, out_channels, stride, input);
-
-// (int in_channels, int out_channels,vector<int64_t> stride{1,1}, Operator& input)
-#define MAKE_NORMAL_LAYER(LAYER, in_channels, out_channels, stride, input)  \
-  MAKE_NORMAL_BLOCK(LAYER, blk1, in_channels, out_channels, stride, input); \
-                                                                            \
-  auto &LAYER##_output = LAYER##_blk1_output;                               \
-  auto &LAYER##_output_label = LAYER##_blk1_output_label;
-
-#define MAKE_NORMAL_LAYER_VAR(LAYER, in_channels, out_channels, stride, input) \
-  MAKE_NORMAL_BLOCK_VAR(LAYER, blk1, in_channels, out_channels, stride, input);
-
-#define MAKE_RESNET50(input)                                         \
-  MAKE_RESIDUAL_LAYER(layer1, 64, 256, stride_1, input)              \
-  MAKE_NORMAL_LAYER(layer2, 256, 256, stride_1, layer1_output)       \
-  MAKE_NORMAL_LAYER(layer3, 256, 256, stride_1, layer2_output)       \
-  MAKE_RESIDUAL_LAYER(layer4, 256, 512, stride_2, layer3_output)     \
-  MAKE_NORMAL_LAYER(layer5, 512, 512, stride_1, layer4_output)       \
-  MAKE_NORMAL_LAYER(layer6, 512, 512, stride_1, layer5_output)       \
-  MAKE_NORMAL_LAYER(layer7, 512, 512, stride_1, layer6_output)       \
-  MAKE_RESIDUAL_LAYER(layer8, 512, 1024, stride_2, layer7_output)    \
-  MAKE_NORMAL_LAYER(layer9, 1024, 1024, stride_1, layer8_output)     \
-  MAKE_NORMAL_LAYER(layer10, 1024, 1024, stride_1, layer9_output)    \
-  MAKE_NORMAL_LAYER(layer11, 1024, 1024, stride_1, layer10_output)   \
-  MAKE_NORMAL_LAYER(layer12, 1024, 1024, stride_1, layer11_output)   \
-  MAKE_NORMAL_LAYER(layer13, 1024, 1024, stride_1, layer12_output)   \
-  MAKE_RESIDUAL_LAYER(layer14, 1024, 2048, stride_2, layer13_output) \
-  MAKE_NORMAL_LAYER(layer15, 2048, 2048, stride_1, layer14_output)   \
-  MAKE_NORMAL_LAYER(layer16, 2048, 2048, stride_1, layer15_output)   \
-                                                                     \
-  auto &resnet50_output = layer16_output;                            \
-  auto &resnet50_output_label = layer16_output_label;
-
-#define MAKE_RESNET50_VAR(inputs)                                \
-  MAKE_RESIDUAL_LAYER_VAR(layer1, 64, 256, stride_1, inputs)     \
-  MAKE_NORMAL_LAYER_VAR(layer2, 256, 256, stride_1, inputs)      \
-  MAKE_NORMAL_LAYER_VAR(layer3, 256, 256, stride_1, inputs)      \
-  MAKE_RESIDUAL_LAYER_VAR(layer4, 256, 512, stride_2, inputs)    \
-  MAKE_NORMAL_LAYER_VAR(layer5, 512, 512, stride_1, inputs)      \
-  MAKE_NORMAL_LAYER_VAR(layer6, 512, 512, stride_1, inputs)      \
-  MAKE_NORMAL_LAYER_VAR(layer7, 512, 512, stride_1, inputs)      \
-  MAKE_RESIDUAL_LAYER_VAR(layer8, 512, 1024, stride_2, inputs)   \
-  MAKE_NORMAL_LAYER_VAR(layer9, 1024, 1024, stride_1, inputs)    \
-  MAKE_NORMAL_LAYER_VAR(layer10, 1024, 1024, stride_1, inputs)   \
-  MAKE_NORMAL_LAYER_VAR(layer11, 1024, 1024, stride_1, inputs)   \
-  MAKE_NORMAL_LAYER_VAR(layer12, 1024, 1024, stride_1, inputs)   \
-  MAKE_NORMAL_LAYER_VAR(layer13, 1024, 1024, stride_1, inputs)   \
-  MAKE_RESIDUAL_LAYER_VAR(layer14, 1024, 2048, stride_2, inputs) \
-  MAKE_NORMAL_LAYER_VAR(layer15, 2048, 2048, stride_1, inputs)   \
-  MAKE_NORMAL_LAYER_VAR(layer16, 2048, 2048, stride_1, inputs)   \
-//---------------------------------------------------------------------------------------------
-
-// (Operator& input)
-#define GENERATE_BIASADD_GRAD(LAYER, BLK, OPNUM, input)                                \
-  auto LAYER##_##BLK##_##OPNUM##_grad =                                                \
-      op::BiasAddGrad(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("grad")) \
-          .set_input_x(input, input.name_out_dx());
-
-// (Operator& input)
-#define GENERATE_MATMUL_GRAD(LAYER, BLK, OPNUM, input) \
-  auto LAYER##_##BLK##_##OPNUM##_grad =                \
-      op::MatMul(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("grad")).set_input_x1(input);
-
-// (Operator& input)
-#define GENERATE_RESHAPE_GRAD(LAYER, BLK, OPNUM, input) \
-  auto LAYER##_##BLK##_##OPNUM##_grad =                 \
-      op::Reshape(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("grad")).set_input_tensor(input);
-
-// (Operator& input_grad, Operator& input_maxpool)
-#define GENERATE_MAXPOOL_GRAD(LAYER, BLK, OPNUM, input_grad, input_maxpool)                      \
-  auto LAYER##_##BLK##_##OPNUM##_grad =                                                          \
-      op::MaxPoolGradWithArgmax(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("grad")) \
-          .set_input_x(LAYER##_##BLK##_##OPNUM##_input, "y")                                     \
-          .set_input_grad(input_grad)                                                            \
-          .set_input_argmax(input_maxpool, input_maxpool.name_out_argmax())                      \
-          .set_attr_ksize({1, 1, 3, 3})                                                          \
-          .set_attr_strides({1, 1, 2, 2})                                                        \
-          .set_attr_padding("SAME");
-
-// (Operator& input_dy)
-#define GENERATE_RELU_GRAD(LAYER, BLK, OPNUM, input_dy, dy_label)                                                     \
-  auto LAYER##_##BLK##_##OPNUM##_grad = op::ReluGrad(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("grad")) \
-                                            .set_input_gradients(input_dy, dy_label)                                  \
-                                            .set_input_features(LAYER##_##BLK##_##OPNUM, "y");
-
-// (Operator& input_dy)
-#define GENERATE_BN_GRAD(LAYER, BLK, OPNUM, input_dy)                                                         \
-  auto LAYER##_##BLK##_##OPNUM##_grad =                                                                       \
-      op::FusedBatchNormGrad(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("grad"))                 \
-          .set_input_dy(input_dy, "backprops")                                                                \
-          .set_input_x(LAYER##_##BLK##_##OPNUM##_input, "y")                                                  \
-          .set_input_scale(LAYER##_##BLK##_##OPNUM##_scale)                                                   \
-          .set_input_save_mean(LAYER##_##BLK##_##OPNUM, "save_mean")                                          \
-          .set_input_save_inv_variance(LAYER##_##BLK##_##OPNUM, "save_inv_variance")                          \
-          .set_attr_epsilon(0.0001);                                                                          \
-                                                                                                              \
-  auto LAYER##_##BLK##_##OPNUM##_momentum_scale =                                                             \
-      op::ApplyMomentum()                                                                                     \
-          .set_input_accum(LAYER##_##BLK##_##OPNUM##_mom_scale)                                               \
-          .set_input_grad(LAYER##_##BLK##_##OPNUM##_grad, LAYER##_##BLK##_##OPNUM##_grad.name_out_bn_scale()) \
-          .set_input_lr(label1)                                                                               \
-          .set_input_momentum(label1)                                                                         \
-          .set_input_var(LAYER##_##BLK##_##OPNUM##_scale);                                                    \
-                                                                                                              \
-  auto LAYER##_##BLK##_##OPNUM##_momentum_b =                                                                 \
-      op::ApplyMomentum()                                                                                     \
-          .set_input_accum(LAYER##_##BLK##_##OPNUM##_mom_b)                                                   \
-          .set_input_grad(LAYER##_##BLK##_##OPNUM##_grad, LAYER##_##BLK##_##OPNUM##_grad.name_out_bn_bias())  \
-          .set_input_lr(label1)                                                                               \
-          .set_input_momentum(label1)                                                                         \
-          .set_input_var(LAYER##_##BLK##_##OPNUM##_b);
-
-// (Operator& input)
-#define GENERATE_CONV_PROP_FILTER(LAYER, BLK, OPNUM, input_bngrad, stride)                                    \
-  auto LAYER##_##BLK##_##OPNUM##_propfilter =                                                                 \
-      op::Conv2DBackpropFilterD(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("_propfilter"))       \
-          .set_input_x(LAYER##_##BLK##_##OPNUM##_input, "y")                                                  \
-          .set_attr_filter_size(LAYER##_##BLK##_##OPNUM##_desc.GetShape().GetDims())                          \
-          .set_input_out_backprop(input_bngrad, input_bngrad.name_out_dx())                                   \
-          .set_attr_strides(stride)                                                                           \
-          .set_attr_pads({1, 1, 1, 1});                                                                       \
-                                                                                                              \
-  update_op_format(LAYER##_##BLK##_##OPNUM##_propfilter);                                                     \
-  auto LAYER##_##BLK##_##OPNUM##_momentum_weight = op::ApplyMomentum()                                        \
-                                                       .set_input_accum(LAYER##_##BLK##_##OPNUM##_mom_weight) \
-                                                       .set_input_grad(LAYER##_##BLK##_##OPNUM##_propfilter)  \
-                                                       .set_input_lr(label1)                                  \
-                                                       .set_input_momentum(label1)                            \
-                                                       .set_input_var(LAYER##_##BLK##_##OPNUM##_weight);
-
-///.set_attr_input_size({input_bngrad.name_out_dx().GetOutputDesc().GetShape().GetDim(0),LAYER##_##BLK##_##OPNUM##_weight.GetOutputDesc().GetShape().GetDim(1),
-///input_bngrad.name_out_dx().GetOutputDesc().GetShape().GetDim(2)*stride[2],
-///input_bngrad.name_out_dx().GetOutputDesc().GetShape().GetDim(3)*stride[3]})
-#define GENERATE_CONV_PROP_INPUT(LAYER, BLK, OPNUM, input_bngrad, stride)                                           \
-  auto LAYER##_##BLK##_##OPNUM##_propinput =                                                                        \
-      op::Conv2DBackpropInputD(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("_propinput"))               \
-          .set_attr_input_size(LAYER##_##BLK##_##OPNUM##_input.GetOutputDesc("y").GetShape().GetDims())             \
-          .set_input_filter(LAYER##_##BLK##_##OPNUM##_weight)                                                       \
-          .set_input_out_backprop(input_bngrad, input_bngrad.name_out_dx())                                         \
-          .set_attr_strides(stride)                                                                                 \
-          .set_attr_pads({1, 1, 1, 1});                                                                             \
-  cout << string(#LAYER) + string(#BLK) + string(#OPNUM) + "_propinput"                                             \
-       << "'s input_x op's shape is:" << input_bngrad.GetOutputDesc("dx").GetShape().GetDim(3) * stride[3] << endl; \
-  cout << string(#LAYER) + string(#BLK) + string(#OPNUM) + "_propinput"                                             \
-       << "'s input_x op's shape is:" << input_bngrad.GetOutputDesc("dx").GetShape().GetDim(2) * stride[2] << endl; \
-                                                                                                                    \
-  update_op_format(LAYER##_##BLK##_##OPNUM##_propinput);                                                            \
-  auto &LAYER##_##BLK##_##OPNUM##_propinput_label = "y"
-
-// (int out_channels, Operator& input)
-#define GENERATE_ADD_GRAD(LAYER, BLK, OPNUM, input_x1, input_x1_label, input_x2, input_x2_label)                 \
-  auto LAYER##_##BLK##_##OPNUM##_grad = op::Add(string(#LAYER) + string(#BLK) + string(#OPNUM) + string("grad")) \
-                                            .set_input_x1(input_x1, input_x1_label)                              \
-                                            .set_input_x2(input_x2, input_x2_label);
-
-// (Operator& input)
-#define MAKE_RESIDUAL_BLOCK_GRAD(LAYER, BLK, input_dy, dy_label)                                              \
-  GENERATE_RELU_GRAD(LAYER, BLK, relu5, input_dy, dy_label);                                                  \
-                                                                                                              \
-  GENERATE_BN_GRAD(LAYER, BLK, bn4, LAYER##_##BLK##_relu5_grad);                                              \
-  GENERATE_CONV_PROP_FILTER(LAYER, BLK, conv4, LAYER##_##BLK##_bn4_grad, LAYER##_##BLK##_stride);             \
-  GENERATE_CONV_PROP_INPUT(LAYER, BLK, conv4, LAYER##_##BLK##_bn4_grad, LAYER##_##BLK##_stride);              \
-                                                                                                              \
-  GENERATE_BN_GRAD(LAYER, BLK, bn3, LAYER##_##BLK##_relu5_grad);                                              \
-  GENERATE_CONV_PROP_FILTER(LAYER, BLK, conv3, LAYER##_##BLK##_bn3_grad, stride_1);                           \
-  GENERATE_CONV_PROP_INPUT(LAYER, BLK, conv3, LAYER##_##BLK##_bn3_grad, stride_1);                            \
-                                                                                                              \
-  GENERATE_RELU_GRAD(LAYER, BLK, relu2, LAYER##_##BLK##_conv3_propinput, "y");                                \
-  GENERATE_BN_GRAD(LAYER, BLK, bn2, LAYER##_##BLK##_relu2_grad);                                              \
-  GENERATE_CONV_PROP_FILTER(LAYER, BLK, conv2, LAYER##_##BLK##_bn2_grad, stride_1);                           \
-  GENERATE_CONV_PROP_INPUT(LAYER, BLK, conv2, LAYER##_##BLK##_bn2_grad, stride_1);                            \
-                                                                                                              \
-  GENERATE_RELU_GRAD(LAYER, BLK, relu1, LAYER##_##BLK##_conv2_propinput, "y");                                \
-  GENERATE_BN_GRAD(LAYER, BLK, bn1, LAYER##_##BLK##_relu1_grad);                                              \
-  GENERATE_CONV_PROP_FILTER(LAYER, BLK, conv1, LAYER##_##BLK##_bn1_grad, LAYER##_##BLK##_stride);             \
-  GENERATE_CONV_PROP_INPUT(LAYER, BLK, conv1, LAYER##_##BLK##_bn1_grad, LAYER##_##BLK##_stride);              \
-                                                                                                              \
-  GENERATE_ADD_GRAD(LAYER, BLK, add5, LAYER##_##BLK##_conv1_propinput, LAYER##_##BLK##_conv1_propinput_label, \
-                    LAYER##_##BLK##_conv4_propinput, LAYER##_##BLK##_conv4_propinput_label);                  \
-                                                                                                              \
-  auto &LAYER##_##BLK##_grad_output = LAYER##_##BLK##_add5_grad;                                              \
-  auto &LAYER##_##BLK##_grad_output_label = "y"
-
-// (Operator& input)
-#define MAKE_NORMAL_BLOCK_GRAD(LAYER, BLK, input_dy, dy_label)                                                \
-  GENERATE_RELU_GRAD(LAYER, BLK, relu5, input_dy, dy_label);                                                  \
-                                                                                                              \
-  GENERATE_BN_GRAD(LAYER, BLK, bn3, LAYER##_##BLK##_relu5_grad);                                              \
-  GENERATE_CONV_PROP_FILTER(LAYER, BLK, conv3, LAYER##_##BLK##_bn3_grad, stride_1);                           \
-  GENERATE_CONV_PROP_INPUT(LAYER, BLK, conv3, LAYER##_##BLK##_bn3_grad, stride_1);                            \
-                                                                                                              \
-  GENERATE_RELU_GRAD(LAYER, BLK, relu2, LAYER##_##BLK##_conv3_propinput, "y");                                \
-  GENERATE_BN_GRAD(LAYER, BLK, bn2, LAYER##_##BLK##_relu2_grad);                                              \
-  GENERATE_CONV_PROP_FILTER(LAYER, BLK, conv2, LAYER##_##BLK##_bn2_grad, stride_1);                           \
-  GENERATE_CONV_PROP_INPUT(LAYER, BLK, conv2, LAYER##_##BLK##_bn2_grad, stride_1);                            \
-                                                                                                              \
-  GENERATE_RELU_GRAD(LAYER, BLK, relu1, LAYER##_##BLK##_conv2_propinput, "y");                                \
-  GENERATE_BN_GRAD(LAYER, BLK, bn1, LAYER##_##BLK##_relu1_grad);                                              \
-  GENERATE_CONV_PROP_FILTER(LAYER, BLK, conv1, LAYER##_##BLK##_bn1_grad, LAYER##_##BLK##_stride);             \
-  GENERATE_CONV_PROP_INPUT(LAYER, BLK, conv1, LAYER##_##BLK##_bn1_grad, LAYER##_##BLK##_stride);              \
-                                                                                                              \
-  GENERATE_ADD_GRAD(LAYER, BLK, add5, LAYER##_##BLK##_conv1_propinput, LAYER##_##BLK##_conv1_propinput_label, \
-                    input_dy, dy_label);                                                                      \
-                                                                                                              \
-  auto &LAYER##_##BLK##_grad_output = LAYER##_##BLK##_add5_grad;                                              \
-  auto &LAYER##_##BLK##_grad_output_label = "y"
-
-// (Operator& input_dy)
-#define MAKE_RESIDUAL_LAYER_GRAD(LAYER, input_dy, dy_label)  \
-  MAKE_RESIDUAL_BLOCK_GRAD(LAYER, blk1, input_dy, dy_label); \
-                                                             \
-  auto &LAYER##_grad_output = LAYER##_blk1_grad_output;      \
-  auto &LAYER##_grad_output_label = LAYER##_blk1_grad_output_label;
-
-// (Operator& input_dy)
-#define MAKE_NORMAL_LAYER_GRAD(LAYER, input_dy, dy_label)  \
-  MAKE_NORMAL_BLOCK_GRAD(LAYER, blk1, input_dy, dy_label); \
-                                                           \
-  auto &LAYER##_grad_output = LAYER##_blk1_grad_output;    \
-  auto &LAYER##_grad_output_label = LAYER##_blk1_grad_output_label;
-
-#define MAKE_RESNET50_GRAD(input_dy, dy_label)                                      \
-  MAKE_NORMAL_LAYER_GRAD(layer16, input_dy, dy_label)                               \
-  MAKE_NORMAL_LAYER_GRAD(layer15, layer16_grad_output, layer16_grad_output_label)   \
-  MAKE_RESIDUAL_LAYER_GRAD(layer14, layer15_grad_output, layer15_grad_output_label) \
-  MAKE_NORMAL_LAYER_GRAD(layer13, layer14_grad_output, layer14_grad_output_label)   \
-  MAKE_NORMAL_LAYER_GRAD(layer12, layer13_grad_output, layer13_grad_output_label)   \
-  MAKE_NORMAL_LAYER_GRAD(layer11, layer12_grad_output, layer12_grad_output_label)   \
-  MAKE_NORMAL_LAYER_GRAD(layer10, layer11_grad_output, layer11_grad_output_label)   \
-  MAKE_NORMAL_LAYER_GRAD(layer9, layer10_grad_output, layer10_grad_output_label)    \
-  MAKE_RESIDUAL_LAYER_GRAD(layer8, layer9_grad_output, layer9_grad_output_label)    \
-  MAKE_NORMAL_LAYER_GRAD(layer7, layer8_grad_output, layer8_grad_output_label)      \
-  MAKE_NORMAL_LAYER_GRAD(layer6, layer7_grad_output, layer7_grad_output_label)      \
-  MAKE_NORMAL_LAYER_GRAD(layer5, layer6_grad_output, layer6_grad_output_label)      \
-  MAKE_RESIDUAL_LAYER_GRAD(layer4, layer5_grad_output, layer5_grad_output_label)    \
-  MAKE_NORMAL_LAYER_GRAD(layer3, layer4_grad_output, layer4_grad_output_label)      \
-  MAKE_NORMAL_LAYER_GRAD(layer2, layer3_grad_output, layer3_grad_output_label)      \
-  MAKE_RESIDUAL_LAYER_GRAD(layer1, layer2_grad_output, layer2_grad_output_label)    \
-                                                                                    \
-  auto &resnet50_grad_output = layer1_grad_output;                                  \
-  auto &resnet50_grad_output_label = layer1_grad_output_label;
-
-bool resnet50(Graph &graph) {
-  auto data = op::Data().set_attr_index(0);
-  auto data1 = op::Data().set_attr_index(1);
-  TensorDesc shape_desc(ge::Shape({32, 3, 224, 224}), FORMAT_NCHW, DT_FLOAT);
-  data.update_output_desc_y(shape_desc);
-
-  TensorDesc desc(ge::Shape({64, 3, 7, 7}), FORMAT_NCHW, DT_FLOAT);
-
-  auto var = op::Variable("conv2d_var");
-  var.update_output_desc_y(desc);
-  var.update_input_desc_x(desc);
-
-  auto varw1 = op::Variable("conv2d_varw1");
-  varw1.update_output_desc_y(desc);
-
-  auto conv2d = op::Conv2D("Translate")
-                    .set_input_x(data)
-                    .set_input_filter(var)
-                    .set_attr_strides({1, 1, 2, 2})
-                    .set_attr_pads({2, 3, 2, 3})
-                    .set_attr_data_format("NCHW");
-  TensorDesc desc_y;
-  desc_y.SetFormat(FORMAT_NCHW); // shape: 32 64 112 112
-  conv2d.update_output_desc_y(desc_y);
-
-  TensorDesc desc1(ge::Shape({1, 64, 1, 1}), FORMAT_NCHW, DT_FLOAT);
-  auto var1 = op::Variable("bn_var1");
-  var1.update_output_desc_y(desc1);
-
-  auto var2 = op::Variable("bn_var2");
-  var2.update_output_desc_y(desc1);
-
-  auto var3 = op::Variable("bn_var3");
-  var3.update_output_desc_y(desc1);
-
-  auto var4 = op::Variable("bn_var4");
-  var4.update_output_desc_y(desc1);
-
-  TensorDesc desc2(ge::Shape({2048, 1001}), FORMAT_NCHW, DT_FLOAT);
-
-  auto var5 = op::Variable("var5");
-  var5.update_output_desc_y(desc2);
-
-  auto var6 = op::Variable("var6");
-  var6.update_output_desc_y(desc2);
-
-  TensorDesc desclabel(ge::Shape({1, 1001, 1, 1}), FORMAT_NCHW, DT_FLOAT);
-
-  auto label1 = op::Variable("label1");
-  label1.update_output_desc_y(desclabel);
-
-  TensorDesc descmatlabel(ge::Shape({1, 1001, 1, 1}), FORMAT_NCHW, DT_FLOAT);
-  auto matvar = op::Variable("matvar");
-  matvar.update_output_desc_y(descmatlabel);
-
-  auto matvar1 = op::Variable("matvar1");
-  matvar1.update_output_desc_y(descmatlabel);
-
-  auto bn = op::FusedBatchNorm()
-                .set_input_x(conv2d, "y")
-                .set_input_scale(var1)
-                .set_input_b(var2)
-                .set_input_mean(var3)
-                .set_input_variance(var4)
-                .set_attr_mode(1)
-                .set_attr_epsilon(1e-5)
-                .set_attr_is_training(true)
-                .set_attr_is_training_fusion(true)
-                .set_attr_moving_average_fraction(994352128);
-
-  auto relu = op::Relu().set_input_x(bn, "y");
-
-  auto maxpool = op::MaxPoolWithArgmax()
-                     .set_input_x(relu, "y")
-                     .set_attr_ksize({1, 3, 3, 1})
-                     .set_attr_padding("SAME")
-                     .set_attr_strides({1, 2, 2, 1});
-
-  MAKE_RESNET50(maxpool);
-  std::vector<Operator> inputs{data};  //,var,var1,layer1_blk1_bn1_b,var3,var4};
-  std::vector<Operator> outputs{};
-
-  graph.SetInputs(inputs).SetOutputs(outputs);
-  return true;
-}
-
-#define GENERATE_CONSTANT_USE_DESC(OPNUM, desc, val)                                 \
-  uint32_t OPNUM##_size = desc.GetShape().GetShapeSize();                            \
-  Tensor OPNUM##_tensor;                                                             \
-  OPNUM##_tensor.SetTensorDesc(desc);                                                \
-  if (desc.GetDataType() == DT_FLOAT) {                                              \
-    float *OPNUM##_data = new float[OPNUM##_size];                                   \
-    for (int i = 0; i < (int)OPNUM##_size; i++) {                                    \
-      *(OPNUM##_data + i) = val;                                                     \
-    }                                                                                \
-    OPNUM##_tensor.SetData((uint8_t *)OPNUM##_data, OPNUM##_size * sizeof(float));   \
-    delete[] OPNUM##_data;                                                           \
-  }                                                                                  \
-  if (desc.GetDataType() == DT_INT64) {                                              \
-    int64_t *OPNUM##_data = new int64_t[OPNUM##_size];                               \
-    for (int i = 0; i < (int)OPNUM##_size; i++) {                                    \
-      *(OPNUM##_data + i) = val;                                                     \
-    }                                                                                \
-    OPNUM##_tensor.SetData((uint8_t *)OPNUM##_data, OPNUM##_size * sizeof(int64_t)); \
-    delete[] OPNUM##_data;                                                           \
-  }                                                                                  \
-  auto OPNUM##_constant = op::Constant().set_attr_value(OPNUM##_tensor);             \
-  OPNUM##_constant.update_output_desc_y(desc);
-
-#define GENERATE_VAR_LAYER(OPNUM, desc, input)                                                        \
-  auto OPNUM##_weight = op::Variable(string(#OPNUM));                                                 \
-  OPNUM##_weight.update_output_desc_y(desc);                                                          \
-  auto OPNUM##_assign = op::Assign().set_input_ref(OPNUM##_weight).set_input_value(OPNUM##_constant); \
-                                                                                                      \
-  input.push_back(OPNUM##_weight);
-
-#define GENERATE_VAR_LAYER_1(OPNUM, desc, var_format, input, name)                                    \
-  auto OPNUM##_weight = op::Variable(string(name));                                                   \
-  OPNUM##_weight.update_output_desc_y(desc);                                                          \
-  auto OPNUM##_assign = op::Assign().set_input_ref(OPNUM##_weight).set_input_value(OPNUM##_constant); \
-                                                                                                      \
-  input.push_back(OPNUM##_weight);
-
-int BuildInitVarGraph(Graph &graph) {
-  std::vector<Operator> inputs{};
-  std::vector<Operator> outputs{};
-
-  TensorDesc desc(ge::Shape({64, 3, 7, 7}), FORMAT_NCHW, DT_FLOAT);
-  GENERATE_CONSTANT_USE_DESC(conv2d_var, desc, 0.01);
-  GENERATE_VAR_LAYER(conv2d_var, desc, inputs);
-
-  GENERATE_CONSTANT_USE_DESC(conv2d_varw1, desc, 0.01);
-  GENERATE_VAR_LAYER(conv2d_varw1, desc, inputs);
-
-  TensorDesc desc1(ge::Shape({1, 64, 1, 1}), FORMAT_NCHW, DT_FLOAT);
-  GENERATE_CONSTANT_USE_DESC(bn_var1, desc1, 0.01);
-  GENERATE_VAR_LAYER(bn_var1, desc1, inputs);
-  GENERATE_CONSTANT_USE_DESC(bn_var2, desc1, 0.01);
-  GENERATE_VAR_LAYER(bn_var2, desc1, inputs);
-  GENERATE_CONSTANT_USE_DESC(bn_var3, desc1, 0.01);
-  GENERATE_VAR_LAYER(bn_var3, desc1, inputs);
-  GENERATE_CONSTANT_USE_DESC(bn_var4, desc1, 0.01);
-  GENERATE_VAR_LAYER(bn_var4, desc1, inputs);
-
-  TensorDesc desc2(ge::Shape({2048, 1001}), FORMAT_NCHW, DT_FLOAT);
-  GENERATE_CONSTANT_USE_DESC(var5, desc2, 0.01);
-  GENERATE_VAR_LAYER(var5, desc2, inputs);
-  GENERATE_CONSTANT_USE_DESC(var6, desc2, 0.01);
-  GENERATE_VAR_LAYER(var6, desc2, inputs);
-
-  TensorDesc desclabel(ge::Shape({1, 1001, 1, 1}), FORMAT_NCHW, DT_FLOAT);
-  GENERATE_CONSTANT_USE_DESC(label1, desclabel, 0.1);
-  GENERATE_VAR_LAYER(label1, desclabel, inputs);
-
-  TensorDesc descmatlabel(ge::Shape({1, 1001, 1, 1}), FORMAT_NCHW, DT_FLOAT);
-  GENERATE_CONSTANT_USE_DESC(matvar, descmatlabel, 0.01);
-  GENERATE_VAR_LAYER(matvar, descmatlabel, inputs);
-  GENERATE_CONSTANT_USE_DESC(matvar1, descmatlabel, 0.01);
-  GENERATE_VAR_LAYER(matvar1, descmatlabel, inputs);
-
-  MAKE_RESNET50_VAR(inputs);
-
-  TensorDesc ctrl(ge::Shape({1, 1, 1, 1}), FORMAT_NCHW, DT_INT64);
-
-  GENERATE_CONSTANT_USE_DESC(iterations_per_loop, ctrl, 100);
-  GENERATE_VAR_LAYER_1(iterations_per_loop, ctrl, "4D", inputs, "npu_runconfig/iterations_per_loop");
-  GENERATE_CONSTANT_USE_DESC(loop_cond, ctrl, 0);
-  GENERATE_VAR_LAYER_1(loop_cond, ctrl, "4D", inputs, "npu_runconfig/loop_cond");
-  GENERATE_CONSTANT_USE_DESC(one, ctrl, 1);
-  GENERATE_VAR_LAYER_1(one, ctrl, "4D", inputs, "npu_runconfig/one");
-  GENERATE_CONSTANT_USE_DESC(zero, ctrl, 0);
-  GENERATE_VAR_LAYER_1(zero, ctrl, "4D", inputs, "npu_runconfig/zero");
-
-  graph.SetInputs(inputs).SetOutputs(outputs);
-  return 0;
-}
-int TestBuildGraphTest(Func fun, Graph &graph, vector<ge::Tensor> &inputs, vector<ge::Tensor> &outputs) {
-  bool graph_ret = fun(graph);
-  ge::Tensor shapeTensor;
-  TensorDesc shape_desc(ge::Shape({32, 3, 224, 224}), FORMAT_NCHW, DT_FLOAT);
-  uint32_t sizeshape = shape_desc.GetShape().GetShapeSize();
-  printf("[test] desc size filter shape:%u\n", sizeshape);
-  shapeTensor.SetTensorDesc(shape_desc);
-  vector<float> dataValuec;
-  for (int i = 0; i < sizeshape; i++) {
-    dataValuec.push_back(1);
-  }
-
-  shapeTensor.SetData((uint8_t *)dataValuec.data(), 4 * sizeshape);
-  inputs.push_back(shapeTensor);
-
-  ge::Tensor shapeTensor1;
-  TensorDesc shape_desc1(ge::Shape({1, 32, 1, 1}), FORMAT_NCHW, DT_FLOAT);
-  uint32_t sizeshape1 = shape_desc1.GetShape().GetShapeSize();
-  printf("[test] desc size filter shape:%u\n", sizeshape1);
-  shapeTensor1.SetTensorDesc(shape_desc1);
-  vector<int32_t> dataValuec1;
-  for (int i = 0; i < sizeshape1; i++) {
-    dataValuec1.push_back(1);
-  }
-
-  shapeTensor1.SetData((uint8_t *)dataValuec1.data(), 4 * sizeshape1);
-
-  return 0;
-}
-int runTrainGraph(Func fun, int loopCount) {
-  printf("GE BBIT begin...\n");
-  std::chrono::system_clock::time_point start = std::chrono::system_clock::now();
-
-  std::map<std::string, std::string> ge_options = {
-      {"device_id", "0"}, {"rank_table_file", ""}, {"graphType", "1"}, {"ge.graphRunMode", "2"}};
-
-  std::map<std::string, std::string> session_options = {{"a", "b"}, {TRAIN_FLAG, "1"}};
-
-  ge::Status ret;
-
-  // init ge
-  ret = GEInitialize_api_new("train", "fe,plugin");
-  printf("ge::GEInitialize ret:%d\n", ret);
-
-  // init session
-  ge::Session session(session_options);
-
-  int graphId_initvar = 1;
-  ge::Graph graph_initvar("initVarGraph");
-  bool graph_ret = BuildInitVarGraph(graph_initvar);
-
-  // session addgraph
-  int graphId = 0;
-
-  // build graph
-  ge::Graph graph("bigGraph");
-  std::vector<ge::Tensor> inputs;
-  ge::Tensor outputTensor;
-  std::vector<ge::Tensor> outputs;
-  graph_ret = TestBuildGraphTest(fun, graph, inputs, outputs);
-  printf("TestReluGrad ret:%d\n", graph_ret);
-
-  ret = session.AddGraph(graphId_initvar, graph_initvar);
-  printf("session.AddVarGraph ret:%d\n", ret);
-  if (ret) return ret;
-
-  ret = session.AddGraph(graphId, graph);
-  printf("session.AddGraph ret:%d\n", ret);
-  if (ret) return ret;
-
-  std::vector<ge::Tensor> inputs1;
-  std::vector<ge::Tensor> outputs1;
-  ret = session.RunGraph(graphId_initvar, inputs1, outputs1);
-
-  if (ret != SUCCESS) {
-    return ret;
-  }
-  // add loop for test of stabilty:
-  for (int i = 0; i < loopCount; i++) {
-    // session rungraph
-    printf("loopCount:%d\n", loopCount);
-    ret = session.RunGraph(graphId, inputs, outputs);
-    printf("session.RunGraph ret:%d\n", ret);
-    if (ret) return ret;
-
-    // define 99999 as loop forever
-    if (loopCount == 99999) i = 0;
-  }
-  std::chrono::system_clock::time_point end = std::chrono::system_clock::now();
-  auto millisecondsduration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
-  auto ms = millisecondsduration.count();
-  std::stringstream ss;
-  ss << ms << "ms";
-  std::string run_time = ss.str();
-  printf("run time is : %s \n", run_time.c_str());
-
-  return 0;
-}
-
-int main(int argc, char *argv[]) {
-  // add loop for test of stabilty:
-  int loopCount = 1;
-  if (argc >= 2) loopCount = atoi(argv[1]);
-
-  Status ret = SUCCESS;
-  ret = runTrainGraph(resnet50, loopCount);
-  if (ret == SUCCESS) {
-    std::cout << "[train resnet50 success]" << std::endl;
-  } else {
-    std::cout << "!!! train resnet50 fail !!!" << std::endl;
-  }
-  return ret;
-}
diff --git a/tests/st/test_ge_st.py b/tests/st/test_ge_st.py
deleted file mode 100644
index b5479cfc..00000000
--- a/tests/st/test_ge_st.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright 2019-2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-"""
-ge st test.
-"""
-import pytest
-import subprocess
-import os
-
-@pytest.mark.level0
-@pytest.mark.platform_arm_ascend_training
-@pytest.mark.platform_x86_ascend_training
-@pytest.mark.env_card
-@pytest.mark.component_ge
-def test_resnet50_train():
-    ge_st_dir=os.environ.get('GE_ST_DIR',
-            '/home/jenkins/workspace/release_pkg/gate/graphengine_lib')
-    ge_lib_dir=os.environ.get('GRAPHENGINE_LIB', '/home/jenkins/workspace/release_pkg/gate/graphengine_lib')
-
-    real_pythonpath=os.environ.get('REAL_PYTHONPATH')
-    pythonpath=os.environ.get('PYTHONPATH')
-    if real_pythonpath:
-        if pythonpath:
-            os.environ['PYTHONPATH']=real_pythonpath+':'+pythonpath
-        else:
-            os.environ['PYTHONPATH']=real_pythonpath
-    print('PYTHONPATH: '+os.environ.get('PYTHONPATH'))
-
-    os.environ['ASCEND_OPP_PATH']='/usr/local/Ascend/opp'
-    os.environ['ASCEND_ENGINE_PATH']='/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:' \
-                                     '/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/libfe.so:' \
-                                     '/usr/local/Ascend/fwkacllib/lib64/plugin/opskernel/librts_engine.so:'+ \
-                                     ge_lib_dir + '/libge_local_engine.so'
-    print('ASCEND_OPP_PATH: '+os.environ.get('ASCEND_OPP_PATH'))
-    print('ASCEND_ENGINE_PATH: '+os.environ.get('ASCEND_ENGINE_PATH'))
-    print('LD_LIBRARY_PATH: '+os.environ.get('LD_LIBRARY_PATH'))
-
-    cmd=ge_st_dir + '/st_resnet50_train'
-    print('cmd: '+cmd)
-    os.environ['SLOG_PRINT_TO_STDOUT']="1"
-    ret=subprocess.call([cmd], shell=True)
-    assert ret==0
-
diff --git a/tests/ut/common/graph/CMakeLists.txt b/tests/ut/common/graph/CMakeLists.txt
index 2f8776e3..1c64dce1 100644
--- a/tests/ut/common/graph/CMakeLists.txt
+++ b/tests/ut/common/graph/CMakeLists.txt
@@ -61,58 +61,68 @@ set(UT_FILES
 )
 
 set(SRC_FILES
-    #"${GE_CODE_DIR}/metadef/graph/option/ge_local_context.cc"
-    #"${GE_CODE_DIR}/metadef/graph/option/ge_context.cc"
-    #"${GE_CODE_DIR}/metadef/graph/anchor.cc"
-    #"${GE_CODE_DIR}/metadef/graph/ge_attr_value.cc"
-    #"${GE_CODE_DIR}/metadef/graph/attr_value.cc"
-    #"${GE_CODE_DIR}/metadef/graph/buffer.cc"
-    #"${GE_CODE_DIR}/metadef/graph/compute_graph.cc"
-    #"${GE_CODE_DIR}/metadef/graph/ge_attr_define.cc"
-    #"${GE_CODE_DIR}/metadef/graph/graph.cc"
-    #"${GE_CODE_DIR}/metadef/graph/gnode.cc"
-    #"${GE_CODE_DIR}/metadef/graph/ascend_string.cc"
-    #"${GE_CODE_DIR}/metadef/graph/model.cc"
-    #"${GE_CODE_DIR}/metadef/graph/model_serialize.cc"
-    #"${GE_CODE_DIR}/metadef/graph/node.cc"
-    #"${GE_CODE_DIR}/metadef/graph/op_desc.cc"
-    #"${GE_CODE_DIR}/metadef/graph/operator.cc"
-    #"${GE_CODE_DIR}/metadef/graph/operator_reg.cc"
-    #"${GE_CODE_DIR}/metadef/graph/operator_factory.cc"
-    #"${GE_CODE_DIR}/metadef/graph/operator_factory_impl.cc"
-    #"${GE_CODE_DIR}/metadef/graph/range_vistor.cc"
-    #"${GE_CODE_DIR}/metadef/graph/tensor.cc"
-    #"${GE_CODE_DIR}/metadef/graph/ge_tensor.cc"
-    #"${GE_CODE_DIR}/metadef/graph/shape_refiner.cc"
-    #"${GE_CODE_DIR}/metadef/graph/format_refiner.cc"
-    #"${GE_CODE_DIR}/metadef/graph/inference_context.cc"
-    #"${GE_CODE_DIR}/metadef/graph/detail/attributes_holder.cc"
-    #"${GE_CODE_DIR}/metadef/graph/utils/anchor_utils.cc"
-    #"${GE_CODE_DIR}/metadef/graph/utils/graph_utils.cc"
-    #"${GE_CODE_DIR}/metadef/graph/utils/node_utils.cc"
-    #"${GE_CODE_DIR}/metadef/graph/utils/op_desc_utils.cc"
-    #"${GE_CODE_DIR}/metadef/graph/utils/type_utils.cc"
-    #"${GE_CODE_DIR}/metadef/graph/utils/ge_ir_utils.cc"
-    #"${GE_CODE_DIR}/metadef/graph/utils/tensor_utils.cc"
+    "${GE_CODE_DIR}/metadef/graph/option/ge_local_context.cc"
+    "${GE_CODE_DIR}/metadef/graph/option/ge_context.cc"
+    "${GE_CODE_DIR}/metadef/graph/anchor.cc"
+    "${GE_CODE_DIR}/metadef/graph/ge_attr_value.cc"
+    "${GE_CODE_DIR}/metadef/graph/attr_value.cc"
+    "${GE_CODE_DIR}/metadef/graph/buffer.cc"
+    "${GE_CODE_DIR}/metadef/graph/aligned_ptr.cc"
+    "${GE_CODE_DIR}/metadef/graph/compute_graph.cc"
+    "${GE_CODE_DIR}/metadef/graph/ge_attr_define.cc"
+    "${GE_CODE_DIR}/metadef/graph/graph.cc"
+    "${GE_CODE_DIR}/metadef/graph/gnode.cc"
+    "${GE_CODE_DIR}/metadef/graph/ascend_string.cc"
+    "${GE_CODE_DIR}/metadef/graph/model.cc"
+    "${GE_CODE_DIR}/metadef/graph/model_serialize.cc"
+    "${GE_CODE_DIR}/metadef/graph/node.cc"
+    "${GE_CODE_DIR}/metadef/graph/op_desc.cc"
+    "${GE_CODE_DIR}/metadef/graph/operator.cc"
+    "${GE_CODE_DIR}/metadef/graph/operator_factory.cc"
+    "${GE_CODE_DIR}/metadef/graph/operator_factory_impl.cc"
+    "${GE_CODE_DIR}/metadef/graph/tensor.cc"
+    "${GE_CODE_DIR}/metadef/graph/ge_tensor.cc"
+    "${GE_CODE_DIR}/metadef/graph/shape_refiner.cc"
+    "${GE_CODE_DIR}/metadef/graph/format_refiner.cc"
+    "${GE_CODE_DIR}/metadef/graph/inference_context.cc"
+    "${GE_CODE_DIR}/metadef/graph/detail/attributes_holder.cc"
+    "${GE_CODE_DIR}/metadef/graph/utils/anchor_utils.cc"
+    "${GE_CODE_DIR}/metadef/graph/utils/graph_utils.cc"
+    "${GE_CODE_DIR}/metadef/graph/utils/node_utils.cc"
+    "${GE_CODE_DIR}/metadef/graph/utils/op_desc_utils.cc"
+    "${GE_CODE_DIR}/metadef/graph/utils/type_utils.cc"
+    "${GE_CODE_DIR}/metadef/graph/utils/ge_ir_utils.cc"
+    "${GE_CODE_DIR}/metadef/graph/utils/tensor_utils.cc"
     "${GE_CODE_DIR}/metadef/ops/op_imp.cpp"
-    #"${GE_CODE_DIR}/metadef/graph/opsproto/opsproto_manager.cc"
+    "${GE_CODE_DIR}/metadef/graph/opsproto/opsproto_manager.cc"
+    "${GE_CODE_DIR}/metadef/graph/utils/transformer_utils.cc"
+    "${GE_CODE_DIR}/metadef/graph/runtime_inference_context.cc"
+    "${GE_CODE_DIR}/metadef/graph/ref_relation.cc"
+    "${GE_CODE_DIR}/metadef/third_party/transformer/src/transfer_shape_according_to_format.cpp"
+    "${GE_CODE_DIR}/metadef/third_party/transformer/src/axis_util.cpp"
 )
 
 #add_executable(ut_libgraph ${UT_FILES} ${SRC_FILES} ${PROTO_SRCS} ${PROTO_HDRS})
 add_executable(ut_libgraph ${UT_FILES} ${SRC_FILES} ${PROTO_SRCS} ${PROTO_HDRS})
 
+target_compile_options(ut_libgraph PRIVATE
+    -g --coverage -fprofile-arcs -ftest-coverage
+)
+
 target_compile_definitions(ut_libgraph PRIVATE
     google=ascend_private
 )
 
 target_link_libraries(ut_libgraph 
     $<BUILD_INTERFACE:intf_pub>
-    graph
     gtest
     gtest_main
     slog_stub
     ascend_protobuf
     c_sec
+    error_manager_stub
+    mmpa_stub
     -lrt
     -ldl
+    -lgcov
 )
diff --git a/tests/ut/common/graph/testcase/ge_graph/ge_model_serialize_unittest.cc b/tests/ut/common/graph/testcase/ge_graph/ge_model_serialize_unittest.cc
index d796d80c..e72691b3 100644
--- a/tests/ut/common/graph/testcase/ge_graph/ge_model_serialize_unittest.cc
+++ b/tests/ut/common/graph/testcase/ge_graph/ge_model_serialize_unittest.cc
@@ -1462,53 +1462,53 @@ TEST(UTEST_ge_model_unserialize, test_invalid_attr) {
 TEST(UTEST_ge_model_unserialize, test_invalid_input_output) {
   // model invalid node input
   {
-    ge::proto::ModelDef model_def;
-    auto op_def = model_def.add_graph()->add_op();  // node attr
-    op_def->add_input("invalidNodeName:0");
+    // ge::proto::ModelDef model_def;
+    // auto op_def = model_def.add_graph()->add_op();  // node attr
+    // op_def->add_input("invalidNodeName:0");
 
-    Buffer buffer(model_def.ByteSizeLong());
-    model_def.SerializeToArray(buffer.GetData(), static_cast<int>(buffer.GetSize()));
+    // Buffer buffer(model_def.ByteSizeLong());
+    // model_def.SerializeToArray(buffer.GetData(), static_cast<int>(buffer.GetSize()));
 
-    ModelSerialize serialize;
-    auto model = serialize.UnserializeModel(buffer.GetData(), buffer.GetSize());
-    EXPECT_FALSE(model.IsValid());
+    // ModelSerialize serialize;
+    // auto model = serialize.UnserializeModel(buffer.GetData(), buffer.GetSize());
+    // EXPECT_FALSE(model.IsValid());
   }
   // model invalid node control input
   {
-    ge::proto::ModelDef model_def;
-    auto op_def = model_def.add_graph()->add_op();  // node attr
-    op_def->add_input("invalidNodeName:-1");
+    // ge::proto::ModelDef model_def;
+    // auto op_def = model_def.add_graph()->add_op();  // node attr
+    // op_def->add_input("invalidNodeName:-1");
 
-    Buffer buffer(model_def.ByteSizeLong());
-    model_def.SerializeToArray(buffer.GetData(), static_cast<int>(buffer.GetSize()));
+    // Buffer buffer(model_def.ByteSizeLong());
+    // model_def.SerializeToArray(buffer.GetData(), static_cast<int>(buffer.GetSize()));
 
-    ModelSerialize serialize;
-    auto model = serialize.UnserializeModel(buffer.GetData(), buffer.GetSize());
-    EXPECT_FALSE(model.IsValid());
+    // ModelSerialize serialize;
+    // auto model = serialize.UnserializeModel(buffer.GetData(), buffer.GetSize());
+    // EXPECT_FALSE(model.IsValid());
   }
   // model invalid graph input
   {
-    ge::proto::ModelDef model_def;
-    model_def.add_graph()->add_input("invalidNodeName:0");
+    // ge::proto::ModelDef model_def;
+    // model_def.add_graph()->add_input("invalidNodeName:0");
 
-    Buffer buffer(model_def.ByteSizeLong());
-    model_def.SerializeToArray(buffer.GetData(), static_cast<int>(buffer.GetSize()));
+    // Buffer buffer(model_def.ByteSizeLong());
+    // model_def.SerializeToArray(buffer.GetData(), static_cast<int>(buffer.GetSize()));
 
-    ModelSerialize serialize;
-    auto model = serialize.UnserializeModel(buffer.GetData(), buffer.GetSize());
-    EXPECT_FALSE(model.IsValid());
+    // ModelSerialize serialize;
+    // auto model = serialize.UnserializeModel(buffer.GetData(), buffer.GetSize());
+    // EXPECT_FALSE(model.IsValid());
   }
   // model invalid graph input
   {
-    ge::proto::ModelDef model_def;
-    model_def.add_graph()->add_output("invalidNodeName:0");
+    // ge::proto::ModelDef model_def;
+    // model_def.add_graph()->add_output("invalidNodeName:0");
 
-    Buffer buffer(model_def.ByteSizeLong());
-    model_def.SerializeToArray(buffer.GetData(), static_cast<int>(buffer.GetSize()));
+    // Buffer buffer(model_def.ByteSizeLong());
+    // model_def.SerializeToArray(buffer.GetData(), static_cast<int>(buffer.GetSize()));
 
-    ModelSerialize serialize;
-    auto model = serialize.UnserializeModel(buffer.GetData(), buffer.GetSize());
-    EXPECT_FALSE(model.IsValid());
+    // ModelSerialize serialize;
+    // auto model = serialize.UnserializeModel(buffer.GetData(), buffer.GetSize());
+    // EXPECT_FALSE(model.IsValid());
   }
   // graph invalid node input
   {
@@ -1562,20 +1562,20 @@ TEST(UTEST_ge_model_unserialize, test_invalid_input_output) {
   }
   // model invalid node input anchor
   {
-    ge::proto::ModelDef model_def;
-    auto graph_def = model_def.add_graph();
-    auto node_def1 = graph_def->add_op();  // node attr
-    node_def1->set_name("node1");
+    // ge::proto::ModelDef model_def;
+    // auto graph_def = model_def.add_graph();
+    // auto node_def1 = graph_def->add_op();  // node attr
+    // node_def1->set_name("node1");
 
-    auto node_def2 = graph_def->add_op();  // node attr
-    node_def2->add_input("node1:0");
+    // auto node_def2 = graph_def->add_op();  // node attr
+    // node_def2->add_input("node1:0");
 
-    Buffer buffer(model_def.ByteSizeLong());
-    model_def.SerializeToArray(buffer.GetData(), static_cast<int>(buffer.GetSize()));
+    // Buffer buffer(model_def.ByteSizeLong());
+    // model_def.SerializeToArray(buffer.GetData(), static_cast<int>(buffer.GetSize()));
 
-    ModelSerialize serialize;
-    auto model = serialize.UnserializeModel(buffer.GetData(), buffer.GetSize());
-    EXPECT_FALSE(model.IsValid());
+    // ModelSerialize serialize;
+    // auto model = serialize.UnserializeModel(buffer.GetData(), buffer.GetSize());
+    // EXPECT_FALSE(model.IsValid());
   }
 }
 
diff --git a/tests/ut/common/graph/testcase/ge_graph/ge_tensor_unittest.cc b/tests/ut/common/graph/testcase/ge_graph/ge_tensor_unittest.cc
index 6d34ab59..5c75bd01 100644
--- a/tests/ut/common/graph/testcase/ge_graph/ge_tensor_unittest.cc
+++ b/tests/ut/common/graph/testcase/ge_graph/ge_tensor_unittest.cc
@@ -230,7 +230,7 @@ TEST_F(UtestGeTensor, test_tensor_invalid_null) {
   GeTensor tensor(msg_owner, nullptr);
   EXPECT_EQ(tensor.GetData().size(), 0);
   EXPECT_EQ(tensor.MutableData().size(), 0);
-  EXPECT_EQ(tensor.SetData(Buffer(100)), ge::GRAPH_PARAM_INVALID);
+  EXPECT_EQ(tensor.SetData(Buffer(100)), GRAPH_SUCCESS);
 
   TensorUtils::SetWeightSize(tensor.MutableTensorDesc(), 100);
   EXPECT_EQ(TensorUtils::GetWeightSize(tensor), 0);
diff --git a/tests/ut/ge/CMakeLists.txt b/tests/ut/ge/CMakeLists.txt
index e305d281..91a6620d 100755
--- a/tests/ut/ge/CMakeLists.txt
+++ b/tests/ut/ge/CMakeLists.txt
@@ -49,6 +49,7 @@ include_directories(${GE_CODE_DIR}/metadef)
 include_directories(${GE_CODE_DIR}/metadef/graph)
 include_directories(${GE_CODE_DIR}/inc/external)
 include_directories(${GE_CODE_DIR}/metadef/inc/external)
+include_directories(${GE_CODE_DIR}/parser)
 include_directories(${GE_CODE_DIR}/parser/parser)
 include_directories(${GE_CODE_DIR}/metadef/inc/external/graph)
 include_directories(${GE_CODE_DIR}/metadef/inc/graph)
@@ -88,6 +89,7 @@ set(COMMON_SRC_FILES
     "${GE_CODE_DIR}/metadef/graph/ge_attr_value.cc"
     "${GE_CODE_DIR}/metadef/graph/attr_value.cc"
     "${GE_CODE_DIR}/metadef/graph/buffer.cc"
+    "${GE_CODE_DIR}/metadef/graph/aligned_ptr.cc"
     "${GE_CODE_DIR}/metadef/graph/compute_graph.cc"
     "${GE_CODE_DIR}/metadef/graph/graph.cc"
     "${GE_CODE_DIR}/metadef/graph/gnode.cc"
@@ -120,6 +122,7 @@ set(COMMON_SRC_FILES
     "${GE_CODE_DIR}/metadef/graph/opsproto/opsproto_manager.cc"
     "${GE_CODE_DIR}/metadef/ops/op_imp.cpp"
     "${GE_CODE_DIR}/metadef/register/register.cpp"
+    "${GE_CODE_DIR}/metadef/register/register_pass.cpp"
     "${GE_CODE_DIR}/metadef/register/op_kernel_registry.cpp"
     "${GE_CODE_DIR}/metadef/register/auto_mapping_util.cpp"
     "${GE_CODE_DIR}/metadef/register/tensor_assign.cpp"
@@ -175,6 +178,7 @@ set(COMMON_SRC_FILES
     "${GE_CODE_DIR}/ge/graph/passes/net_output_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/replace_transshape_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/constant_fuse_same_pass.cc"
+	"${GE_CODE_DIR}/ge/graph/passes/fuse_data_nodes_with_common_input_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/print_op_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/no_use_reshape_remove_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/iterator_op_pass.cc"
@@ -182,6 +186,7 @@ set(COMMON_SRC_FILES
     "${GE_CODE_DIR}/ge/graph/passes/atomic_addr_clean_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/mark_same_addr_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/mark_graph_unknown_status_pass.cc"
+    "${GE_CODE_DIR}/ge/graph/passes/dynamic_single_op_reset_shape_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/mark_agnostic_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/dimension_compute_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/dimension_adjust_pass.cc"
@@ -223,7 +228,8 @@ set(COMMON_SRC_FILES
     "${GE_CODE_DIR}/ge/graph/passes/cond_remove_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/for_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/enter_pass.cc"
-    "${GE_CODE_DIR}/ge/graph/passes/assign_pass.cc"
+    "${GE_CODE_DIR}/ge/graph/passes/assign_remove_pass.cc"
+    "${GE_CODE_DIR}/ge/graph/passes/inplace_support_check_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/addn_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/common_subexpression_elimination_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/transop_symmetry_elimination_pass.cc"
@@ -244,6 +250,8 @@ set(COMMON_SRC_FILES
     "${GE_CODE_DIR}/ge/graph/passes/hccl_group_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/memcpy_addr_async_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/set_input_output_offset_pass.cc"
+	"${GE_CODE_DIR}/ge/graph/passes/remove_same_const_pass.cc"
+	"${GE_CODE_DIR}/ge/graph/passes/useless_control_out_remove_pass.cc"
     "${GE_CODE_DIR}/ge/model/ge_model.cc"
     "${GE_CODE_DIR}/ge/common/cust_aicpu_kernel_store.cc"
     "${GE_CODE_DIR}/ge/graph/load/new_model_manager/model_utils.cc"
@@ -298,7 +306,9 @@ set(COMMON_SRC_FILES
     "${GE_CODE_DIR}/ge/graph/common/local_context.cc"
     "${GE_CODE_DIR}/ge/graph/manager/graph_caching_allocator.cc"
     "${GE_CODE_DIR}/ge/graph/manager/rdma_pool_allocator.cc"
+    "${GE_CODE_DIR}/ge/graph/manager/host_mem_allocator.cc"
     "${GE_CODE_DIR}/ge/common/dump/dump_op.cc"
+    "${GE_CODE_DIR}/ge/common/model_saver.cc"
     "${GE_CODE_DIR}/ge/hybrid/node_executor/aicpu/aicpu_ext_info.cc"
     "${GE_CODE_DIR}/ge/common/ge/datatype_util.cc"
     "${GE_CODE_DIR}/metadef/register/ops_kernel_builder_registry.cc"
@@ -306,6 +316,13 @@ set(COMMON_SRC_FILES
     "${GE_CODE_DIR}/metadef/graph/utils/tuning_utils.cc"
     "${GE_CODE_DIR}/metadef/register/op_tiling_registry.cpp"
     "${GE_CODE_DIR}/ge/ge_local_engine/engine/host_cpu_engine.cc"
+    "${GE_CODE_DIR}/parser/parser/common/pre_checker.cc"
+    "${GE_CODE_DIR}/parser/parser/common/convert/pb2json.cc"
+    "${GE_CODE_DIR}/parser/parser/common/parser_factory.cc"
+    "${GE_CODE_DIR}/parser/parser/common/model_saver.cc"
+    "${GE_CODE_DIR}/parser/parser/common/parser_types.cc"
+    "${GE_CODE_DIR}/parser/parser/common/parser_inner_ctx.cc"
+    "${GE_CODE_DIR}/ge/session/omg.cc"
 )
 
 set(COMMON_FORMAT_SRC_FILES
@@ -326,7 +343,7 @@ set(COMMON_FORMAT_SRC_FILES
     "${GE_CODE_DIR}/ge/common/formats/format_transfers/format_transfer_fracz_nchw.cc"
     "${GE_CODE_DIR}/ge/common/formats/format_transfers/format_transfer_fracz_nhwc.cc"
     "${GE_CODE_DIR}/ge/common/formats/format_transfers/format_transfer_fracz_hwcn.cc"
-    "${GE_CODE_DIR}/ge/common/formats/utils/formats_trans_utils.cc"   
+    "${GE_CODE_DIR}/ge/common/formats/utils/formats_trans_utils.cc"
 )
 
 set(GRAPH_OPTIMIZE_COMMON_SRC_FILES
@@ -357,6 +374,7 @@ set(GRAPH_LOAD_COMMON_SRC_FILES
     "${GE_CODE_DIR}/ge/graph/manager/trans_var_data_utils.cc"
     "${GE_CODE_DIR}/ge/graph/manager/graph_caching_allocator.cc"
     "${GE_CODE_DIR}/ge/graph/manager/rdma_pool_allocator.cc"
+    "${GE_CODE_DIR}/ge/graph/manager/host_mem_allocator.cc"
     "${GE_CODE_DIR}/ge/common/thread_pool.cc"
 )
 
@@ -385,6 +403,7 @@ set(DISTINCT_GRAPH_LOAD_SRC_FILES
     "${GE_CODE_DIR}/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc"
     "${GE_CODE_DIR}/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc"
     "${GE_CODE_DIR}/ge/graph/load/new_model_manager/task_info/label_set_task_info.cc"
+    "${GE_CODE_DIR}/ge/graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc"
     "${GE_CODE_DIR}/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc"
     "${GE_CODE_DIR}/ge/graph/load/new_model_manager/task_info/profiler_trace_task_info.cc"
     "${GE_CODE_DIR}/ge/graph/load/new_model_manager/task_info/stream_active_task_info.cc"
@@ -394,7 +413,6 @@ set(DISTINCT_GRAPH_LOAD_SRC_FILES
     "${GE_CODE_DIR}/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc"
     "${GE_CODE_DIR}/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc"
     "${GE_CODE_DIR}/ge/model/ge_model.cc"
-    "${GE_CODE_DIR}/ge/common/helper/model_helper.cc"
     "${GE_CODE_DIR}/ge/common/helper/om_file_helper.cc"
     "${GE_CODE_DIR}/ge/common/debug/memory_dumper.cc"
     "${GE_CODE_DIR}/ge/executor/ge_executor.cc"
@@ -425,7 +443,6 @@ set(GRAPH_BUILD_COMMON_SRC_FILES
     "${GE_CODE_DIR}/ge/graph/build/memory/hybrid_mem_assigner.cc"
     "${GE_CODE_DIR}/ge/graph/build/memory/max_block_mem_assigner.cc"
     "${GE_CODE_DIR}/ge/model/ge_model.cc"
-    "${GE_CODE_DIR}/ge/common/helper/model_helper.cc"
     "${GE_CODE_DIR}/ge/common/helper/om_file_helper.cc"
     "${GE_CODE_DIR}/ge/common/tbe_kernel_store.cc"
     "${GE_CODE_DIR}/ge/common/thread_pool.cc"
@@ -474,6 +491,8 @@ set(GRAPH_PASS_COMMON_SRC_FILES
     "${GE_CODE_DIR}/ge/graph/passes/reshape_remove_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/resource_pair_add_control_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/resource_pair_remove_control_pass.cc"
+	"${GE_CODE_DIR}/ge/graph/passes/remove_same_const_pass.cc"
+	"${GE_CODE_DIR}/ge/graph/passes/useless_control_out_remove_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/transop_breadth_fusion_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/transop_without_reshape_fusion_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/transop_depth_fusion_pass.cc"
@@ -482,7 +501,7 @@ set(GRAPH_PASS_COMMON_SRC_FILES
     "${GE_CODE_DIR}/ge/graph/passes/compile_nodes_pass.cc"
     "${GE_CODE_DIR}/ge/graph/common/transop_util.cc"
     "${GE_CODE_DIR}/ge/graph/passes/flow_ctrl_pass.cc"
-    "${GE_CODE_DIR}/ge/graph/optimize/optimizer/allreduce_fusion_pass.cc"
+    #"${GE_CODE_DIR}/ge/graph/optimize/optimizer/allreduce_fusion_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/folding_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/variable_op_pass.cc"
     "${GE_CODE_DIR}/ge/graph/passes/transpose_transdata_pass.cc"
@@ -556,12 +575,20 @@ set(DISTINCT_GRAPH_LOAD_TEST_FILES
     #"graph/load/new_model_manager_davinci_model_unittest.cc"
     #"graph/load/new_model_manager_model_manager_unittest.cc"
     #"graph/load/new_model_manager_task_build_unittest.cc"
+	"graph/load/new_model_manager_model_manager_aicpu_unittest.cc"
     "graph/load/end_graph_task_unittest.cc"
     "graph/load/new_model_manager_event_manager_unittest.cc"
     #"graph/load/output_net_output_unittest.cc"
+    "graph/load/davinci_model_unittest.cc"
     "graph/load/tbe_handle_store_unittest.cc"
+    "graph/load/hccl_task_info_unittest.cc"
+    "graph/load/kernel_ex_task_info_unittest.cc"
+    "graph/load/kernel_task_info_unittest.cc"
+    "graph/load/memcpy_addr_async_task_info_unittest.cc"
+    "graph/load/memcpy_async_task_info_unittest.cc"
     #"graph/graph_load_unittest.cc"
     "graph/ge_executor_unittest.cc"
+    "graph/load/model_helper_unittest.cc"
 )
 
 set(PASS_TEST_FILES
@@ -590,6 +617,7 @@ set(PASS_TEST_FILES
     "graph/passes/trans_op_depth_fusion_pass_unittest.cc"
     "graph/passes/transop_nearby_allreduce_fusion_pass_unittest.cc"
     "graph/passes/constant_folding_pass_unittest.cc"
+	"graph/passes/fuse_data_nodes_with_common_input_pass_unittest.cc"
     "graph/passes/stop_gradient_pass_unittest.cc"
     "graph/passes/prevent_gradient_pass_unittest.cc"
     "graph/passes/identity_pass_unittest.cc"
@@ -605,6 +633,7 @@ set(PASS_TEST_FILES
     "graph/passes/net_output_pass_unittest.cc"
     "graph/passes/no_use_reshape_remove_pass_unittest.cc"
     "graph/passes/infershape_pass_unittest.cc"
+	"graph/passes/multi_batch_clone_pass_unittest.cc"
 )
 
 set(KERNEL_TEST_FILES
@@ -667,10 +696,11 @@ set(MULTI_PARTS_TEST_FILES
     "graph/variable_accelerate_ctrl_unittest.cc"
     "graph/build/logical_stream_allocator_unittest.cc"
     "graph/build/mem_assigner_unittest.cc"
+    "session/omg_omg_unittest.cc"
 )
 
 set(SINGLE_OP_TEST_FILES
-    "single_op/single_op_model_unittest.cc"
+    #"single_op/single_op_model_unittest.cc"
     "single_op/single_op_manager_unittest.cc"
     "single_op/stream_resource_unittest.cc"
 )
@@ -843,13 +873,17 @@ add_executable(ut_libge_multiparts_utest
     ${MULTI_PARTS_TEST_FILES}
 )
 
+target_compile_options(ut_libge_multiparts_utest PRIVATE
+    -g --coverage -fprofile-arcs -ftest-coverage
+)
+
 target_compile_definitions(ut_libge_multiparts_utest PRIVATE
     google=ascend_private
 )
 
 target_link_libraries(ut_libge_multiparts_utest
     $<BUILD_INTERFACE:intf_pub>
-    ge_build_common ge_load_common ge_execute_common ge_optimize_common ge_partition_common ge_prepare_common ge_single_op ge_ut_common gtest gtest_main ascend_protobuf ${COMMON_SHARED_LIBRARIES} json -lrt -ldl 
+    ge_build_common ge_load_common ge_execute_common ge_optimize_common ge_partition_common ge_prepare_common ge_single_op ge_ut_common gtest gtest_main ascend_protobuf ${COMMON_SHARED_LIBRARIES} json -lrt -ldl -lgcov
 )
 
 # libge_others_utest
@@ -860,9 +894,14 @@ add_executable(ut_libge_others_utest
     ${EXECUTE_TEST_FILES}
     ${OTHERS_TEST_FILES}
 )
+
+target_compile_options(ut_libge_others_utest PRIVATE
+    -g --coverage -fprofile-arcs -ftest-coverage
+)
+
 target_link_libraries(ut_libge_others_utest
     $<BUILD_INTERFACE:intf_pub>
-    ge_load_common ge_execute_common ge_ut_common gtest gtest_main ascend_protobuf ${COMMON_SHARED_LIBRARIES} json -lrt -ldl
+    ge_load_common ge_execute_common ge_ut_common gtest gtest_main ascend_protobuf ${COMMON_SHARED_LIBRARIES} json -lrt -ldl -lgcov
 )
 
 # libge_kernel_utest
@@ -872,9 +911,14 @@ add_executable(ut_libge_kernel_utest
         ${KERNEL_TEST_FILES}
         ${KERNEL_SRC_FILES}
 )
+
+target_compile_options(ut_libge_kernel_utest PRIVATE
+    -g --coverage -fprofile-arcs -ftest-coverage
+)
+
 target_link_libraries(ut_libge_kernel_utest
     $<BUILD_INTERFACE:intf_pub>
-    ge_load_common ge_ut_common gtest gtest_main ascend_protobuf ${COMMON_SHARED_LIBRARIES} json -lrt -ldl
+    ge_load_common ge_ut_common gtest gtest_main ascend_protobuf ${COMMON_SHARED_LIBRARIES} json -lrt -ldl -lgcov
 )
 
 # libge_distinct_load_utest
@@ -886,15 +930,19 @@ add_executable(ut_libge_distinct_load_utest
         ${PROFILING_MNG_TEST_FILES}
 )
 
+target_compile_options(ut_libge_distinct_load_utest PRIVATE
+    -g --coverage -fprofile-arcs -ftest-coverage
+)
+
 target_compile_definitions(ut_libge_distinct_load_utest PRIVATE
     google=ascend_private
 )
 
-target_link_libraries(ut_libge_distinct_load_utest 
+target_link_libraries(ut_libge_distinct_load_utest
 	${COMMON_SHARED_LIBRARIES}
 	$<BUILD_INTERFACE:intf_pub>
         ge_execute_common ge_ut_common_format ge_load_common
         ge_single_op   ge_prepare_common
         ge_optimize_common  ge_build_common ge_partition_common ge_ut_common
-        gtest gtest_main ascend_protobuf json c_sec -lrt -ldl -lpthread
+        gtest gtest_main ascend_protobuf json c_sec -lrt -ldl -lpthread -lgcov
 )
diff --git a/tests/ut/ge/common/format_transfer_unittest.cc b/tests/ut/ge/common/format_transfer_unittest.cc
index 4fcfb378..fd2a296c 100644
--- a/tests/ut/ge/common/format_transfer_unittest.cc
+++ b/tests/ut/ge/common/format_transfer_unittest.cc
@@ -79,7 +79,7 @@ TEST_F(UtestFormatTransfer, get_size_by_data_type) {
   EXPECT_EQ(GetSizeByDataType(DT_STRING_REF), -1);
   EXPECT_EQ(GetSizeByDataType(DT_DUAL), 5);
   EXPECT_EQ(GetSizeByDataType(DT_UNDEFINED), -1);
-  EXPECT_EQ(DT_UNDEFINED, 26);
+  EXPECT_EQ(DT_UNDEFINED, 27);
 }
 }  // namespace formats
 }  // namespace ge
diff --git a/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc b/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc
index 68416409..5b87939f 100644
--- a/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc
+++ b/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc
@@ -306,8 +306,8 @@ class UtestLogicalStreamAllocator : public testing::Test {
     max_parallel_num["aicpu"] = parallel_num;
 
     Status status = AssignLogicalStreams({const1, const2, get_next, genmask1, genmask2, domask, subgraph4, subgraph5,
-                                          subgraph6, allreduce1, allreduce2, apply1, apply2},
-                                          confs, max_parallel_num);
+                                         subgraph6, allreduce1, allreduce2, apply1, apply2},
+                                         confs, max_parallel_num);
     EXPECT_EQ(status, ge::SUCCESS);
 
     EXPECT_EQ(GetStream(get_next), 0);
diff --git a/tests/ut/ge/graph/build/mem_assigner_unittest.cc b/tests/ut/ge/graph/build/mem_assigner_unittest.cc
index 1035d00d..f53a0732 100644
--- a/tests/ut/ge/graph/build/mem_assigner_unittest.cc
+++ b/tests/ut/ge/graph/build/mem_assigner_unittest.cc
@@ -147,6 +147,7 @@ class UtestMemoryAssignerTest : public testing::Test {
   void TearDown() { GetContext().out_nodes_map.clear(); }
 };
 
+/*
 TEST_F(UtestMemoryAssignerTest, MemoryBlock_Resize_RealSizeList_is_empty) {
   ge::ComputeGraphPtr graph = make_shared<ge::ComputeGraph>("");
   ge::OpDescPtr op_def_a = createOpWithWsSize("A", 6000);
@@ -160,6 +161,7 @@ TEST_F(UtestMemoryAssignerTest, MemoryBlock_Resize_RealSizeList_is_empty) {
 
   delete memory_block;
 }
+*/
 
 namespace ge {
 
diff --git a/tests/ut/ge/graph/ge_executor_unittest.cc b/tests/ut/ge/graph/ge_executor_unittest.cc
index 5ce619d0..3d04fd0c 100644
--- a/tests/ut/ge/graph/ge_executor_unittest.cc
+++ b/tests/ut/ge/graph/ge_executor_unittest.cc
@@ -36,6 +36,9 @@
 #include "graph/load/new_model_manager/davinci_model.h"
 #include "graph/load/new_model_manager/davinci_model_parser.h"
 #include "graph/load/new_model_manager/model_manager.h"
+#include "graph/load/new_model_manager/task_info/kernel_task_info.h"
+#include "graph/load/new_model_manager/task_info/kernel_ex_task_info.h"
+#include "ge/common/dump/dump_properties.h"
 #include "graph/manager/graph_mem_allocator.h"
 #include "graph/utils/graph_utils.h"
 #include "proto/ge_ir.pb.h"
@@ -43,8 +46,7 @@
 #undef protected
 
 using namespace std;
-using namespace ge;
-
+namespace ge {
 class UtestGeExecutor : public testing::Test {
  protected:
   static void InitModelDefault(ge::Model &model) {
@@ -67,6 +69,46 @@ class UtestGeExecutor : public testing::Test {
   }
 };
 
+class DModelListener : public ge::ModelListener {
+ public:
+  DModelListener() {
+  };
+  Status OnComputeDone(uint32_t model_id, uint32_t data_index, uint32_t resultCode,
+                       std::vector<ge::OutputTensorInfo> &outputs) {
+    GELOGI("In Call back. OnComputeDone");
+    return SUCCESS;
+  }
+};
+
+shared_ptr<ge::ModelListener> g_label_call_back(new DModelListener());
+
+static ge::OpDescPtr CreateOpDesc(string name = "", string type = "") {
+  auto op_desc = std::make_shared<ge::OpDesc>(name, type);
+  op_desc->SetStreamId(0);
+  op_desc->SetId(0);
+
+  ge::AttrUtils::SetFloat(op_desc, ge::ATTR_NAME_ALPHA, 0);
+  ge::AttrUtils::SetFloat(op_desc, ge::ATTR_NAME_BETA, 0);
+
+  op_desc->SetWorkspace({});
+  ;
+  op_desc->SetWorkspaceBytes({});
+  op_desc->SetInputOffset({});
+  op_desc->SetOutputOffset({});
+
+  ge::AttrUtils::SetListStr(op_desc, ge::ATTR_NAME_WEIGHT_NAME, {});
+  ge::AttrUtils::SetInt(op_desc, ge::POOLING_ATTR_MODE, 0);
+  ge::AttrUtils::SetInt(op_desc, ge::POOLING_ATTR_PAD_MODE, 0);
+  ge::AttrUtils::SetInt(op_desc, ge::POOLING_ATTR_DATA_MODE, 0);
+  ge::AttrUtils::SetInt(op_desc, ge::POOLING_ATTR_CEIL_MODE, 0);
+  ge::AttrUtils::SetInt(op_desc, ge::POOLING_ATTR_NAN_OPT, 0);
+  ge::AttrUtils::SetListInt(op_desc, ge::POOLING_ATTR_WINDOW, {});
+  ge::AttrUtils::SetListInt(op_desc, ge::POOLING_ATTR_PAD, {});
+  ge::AttrUtils::SetListInt(op_desc, ge::POOLING_ATTR_STRIDE, {});
+  ge::AttrUtils::SetListInt(op_desc, ge::ATTR_NAME_ACTIVE_STREAM_LIST, {1, 1});
+  ge::AttrUtils::SetInt(op_desc, ge::ATTR_NAME_STREAM_SWITCH_COND, 0);
+  return op_desc;
+}
 /*
 TEST_F(UtestGeExecutor, fail_UnloadModel_model_manager_stop_unload_error) {
   uint32_t model_id = 1;
@@ -87,3 +129,46 @@ TEST_F(UtestGeExecutor, fail_CommandHandle_model_manager_HandleCommand_error) {
   EXPECT_EQ(ge::PARAM_INVALID, ret);
 }
 */
+TEST_F(UtestGeExecutor, InitFeatureMapAndP2PMem_failed) {
+  DavinciModel model(0, g_label_call_back);
+  model.is_feature_map_mem_has_inited_ = true;
+  EXPECT_EQ(model.InitFeatureMapAndP2PMem(nullptr, 0), PARAM_INVALID);
+}
+
+TEST_F(UtestGeExecutor, kernel_InitDumpTask) {
+  DavinciModel model(0, g_label_call_back);
+  model.om_name_ = "testom";
+  model.name_ = "test";
+  OpDescPtr op_desc = CreateOpDesc("test", "test");
+
+  std::map<std::string, std::set<std::string>> model_dump_properties_map;
+  std::set<std::string> s;
+  model_dump_properties_map[DUMP_ALL_MODEL] = s;
+  DumpProperties dp;
+  dp.model_dump_properties_map_ = model_dump_properties_map;
+  model.SetDumpProperties(dp);
+
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+  kernel_task_info.op_desc_ = op_desc;
+  kernel_task_info.InitDumpTask(0);
+}
+
+TEST_F(UtestGeExecutor, kernel_ex_InitDumpTask) {
+  DavinciModel model(0, g_label_call_back);
+  model.om_name_ = "testom";
+  model.name_ = "test";
+  OpDescPtr op_desc = CreateOpDesc("test", "test");
+
+  std::map<std::string, std::set<std::string>> model_dump_properties_map;
+  std::set<std::string> s;
+  model_dump_properties_map[DUMP_ALL_MODEL] = s;
+  DumpProperties dp;
+  dp.model_dump_properties_map_ = model_dump_properties_map;
+  model.SetDumpProperties(dp);
+
+  KernelExTaskInfo kernel_ex_task_info;
+  kernel_ex_task_info.davinci_model_ = &model;
+  kernel_ex_task_info.InitDumpTask(nullptr, op_desc);
+}
+}
\ No newline at end of file
diff --git a/tests/ut/ge/graph/load/davinci_model_unittest.cc b/tests/ut/ge/graph/load/davinci_model_unittest.cc
new file mode 100644
index 00000000..0c03c934
--- /dev/null
+++ b/tests/ut/ge/graph/load/davinci_model_unittest.cc
@@ -0,0 +1,758 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#define private public
+#define protected public
+#include "graph/utils/graph_utils.h"
+#include "common/profiling/profiling_manager.h"
+#include "graph/load/new_model_manager/davinci_model.h"
+
+using namespace std;
+
+namespace ge {
+extern OpDescPtr CreateOpDesc(string name, string type);
+
+class UtestDavinciModel : public testing::Test {
+ protected:
+  void SetUp() {}
+
+  void TearDown() {}
+  public:
+    NodePtr MakeNode(const ComputeGraphPtr &graph, uint32_t in_num, uint32_t out_num, string name, string type) {
+      GeTensorDesc test_desc(GeShape(), FORMAT_NCHW, DT_FLOAT);
+      auto op_desc = std::make_shared<OpDesc>(name, type);
+      for (auto i = 0; i < in_num; ++i) {
+        op_desc->AddInputDesc(test_desc);
+      }
+      for (auto i = 0; i < out_num; ++i) {
+        op_desc->AddOutputDesc(test_desc);
+      }
+      return graph->AddNode(op_desc);
+    }
+};
+
+TEST_F(UtestDavinciModel, init_success) {
+  DavinciModel model(0, nullptr);
+  ComputeGraphPtr graph = make_shared<ComputeGraph>("default");
+  ProfilingManager::Instance().is_load_profiling_ = true;
+
+  GeModelPtr ge_model = make_shared<GeModel>();
+  ge_model->SetGraph(GraphUtils::CreateGraphFromComputeGraph(graph));
+  AttrUtils::SetInt(ge_model, ATTR_MODEL_MEMORY_SIZE, 5120000);
+  AttrUtils::SetInt(ge_model, ATTR_MODEL_STREAM_NUM, 1);
+
+  shared_ptr<domi::ModelTaskDef> model_task_def = make_shared<domi::ModelTaskDef>();
+  ge_model->SetModelTaskDef(model_task_def);
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  TensorUtils::SetSize(tensor, 512);
+
+  OpDescPtr op_input = CreateOpDesc("data", DATA);
+  op_input->AddInputDesc(tensor);
+  op_input->AddOutputDesc(tensor);
+  op_input->SetInputOffset({1024});
+  op_input->SetOutputOffset({1024});
+  NodePtr node_input = graph->AddNode(op_input);    // op_index = 0
+
+  OpDescPtr op_kernel = CreateOpDesc("square", "Square");
+  op_kernel->AddInputDesc(tensor);
+  op_kernel->AddOutputDesc(tensor);
+  op_kernel->SetInputOffset({1024});
+  op_kernel->SetOutputOffset({1024});
+  NodePtr node_kernel = graph->AddNode(op_kernel);  // op_index = 1
+
+  OpDescPtr op_memcpy = CreateOpDesc("memcpy", MEMCPYASYNC);
+  op_memcpy->AddInputDesc(tensor);
+  op_memcpy->AddOutputDesc(tensor);
+  op_memcpy->SetInputOffset({1024});
+  op_memcpy->SetOutputOffset({5120});
+  NodePtr node_memcpy = graph->AddNode(op_memcpy);  // op_index = 2
+
+  OpDescPtr op_output = CreateOpDesc("output", NETOUTPUT);
+  op_output->AddInputDesc(tensor);
+  op_output->SetInputOffset({5120});
+  op_output->SetSrcName( { "memcpy" } );
+  op_output->SetSrcIndex( { 0 } );
+  NodePtr node_output = graph->AddNode(op_output);  // op_index = 3
+
+
+  domi::TaskDef *task_def1 = model_task_def->add_task();
+  task_def1->set_stream_id(0);
+  task_def1->set_type(RT_MODEL_TASK_KERNEL);
+  domi::KernelDef *kernel_def = task_def1->mutable_kernel();
+  kernel_def->set_stub_func("stub_func");
+  kernel_def->set_args_size(64);
+  string args(64, '1');
+  kernel_def->set_args(args.data(), 64);
+  domi::KernelContext *context = kernel_def->mutable_context();
+  context->set_op_index(1);
+  context->set_kernel_type(2);    // ccKernelType::TE
+  uint16_t args_offset[9] = {0};
+  context->set_args_offset(args_offset, 9 * sizeof(uint16_t));
+
+  domi::TaskDef *task_def2 = model_task_def->add_task();
+  task_def2->set_stream_id(0);
+  task_def2->set_type(RT_MODEL_TASK_MEMCPY_ASYNC);
+  domi::MemcpyAsyncDef *memcpy_async = task_def2->mutable_memcpy_async();
+  memcpy_async->set_src(1024);
+  memcpy_async->set_dst(5120);
+  memcpy_async->set_dst_max(512);
+  memcpy_async->set_count(1);
+  memcpy_async->set_kind(RT_MEMCPY_DEVICE_TO_DEVICE);
+  memcpy_async->set_op_index(2);
+
+  EXPECT_EQ(model.Assign(ge_model), SUCCESS);
+  EXPECT_EQ(model.Init(), SUCCESS);
+
+  EXPECT_EQ(model.input_addrs_list_.size(), 1);
+  EXPECT_EQ(model.output_addrs_list_.size(), 1);
+  EXPECT_EQ(model.task_list_.size(), 2);
+
+  OutputData output_data;
+  vector<OutputTensorInfo> outputs;
+  EXPECT_EQ(model.GenOutputTensorInfo(&output_data, outputs), SUCCESS);
+  EXPECT_EQ(output_data.blobs.size(), 1);
+  EXPECT_EQ(outputs.size(), 1);
+
+  ProfilingManager::Instance().is_load_profiling_ = false;
+}
+
+TEST_F(UtestDavinciModel, init_data_op) {
+  DavinciModel model(0, nullptr);
+  model.ge_model_ = make_shared<GeModel>();
+  model.runtime_param_.mem_base = (uint8_t *)0x08000000;
+  model.runtime_param_.mem_size = 5120000;
+  ComputeGraphPtr graph = make_shared<ComputeGraph>("default");
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  TensorUtils::SetSize(tensor, 512);
+
+  OpDescPtr op_input = CreateOpDesc("data", DATA);
+  op_input->AddInputDesc(tensor);
+  op_input->AddOutputDesc(tensor);
+  op_input->SetInputOffset({1024});
+  op_input->SetOutputOffset({1024});
+  NodePtr node_input = graph->AddNode(op_input);
+
+  OpDescPtr op_output = CreateOpDesc("output", NETOUTPUT);
+  op_output->AddInputDesc(tensor);
+  op_output->SetInputOffset({1024});
+  op_output->SetSrcName( { "data" } );
+  op_output->SetSrcIndex( { 0 } );
+  NodePtr node_output = graph->AddNode(op_output);
+
+  EXPECT_EQ(model.InitNodes(graph), SUCCESS);
+
+  EXPECT_EQ(model.input_addrs_list_.size(), 1);
+  EXPECT_EQ(model.output_addrs_list_.size(), 1);
+  EXPECT_EQ(model.op_list_.size(), 2);
+}
+
+TEST_F(UtestDavinciModel, init_data_op_subgraph) {
+  DavinciModel model(0, nullptr);
+  model.runtime_param_.mem_base = (uint8_t *)0x08000000;
+  model.runtime_param_.mem_size = 5120000;
+  ComputeGraphPtr graph = make_shared<ComputeGraph>("default");
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  TensorUtils::SetSize(tensor, 512);
+
+  OpDescPtr op_input = CreateOpDesc("data", DATA);
+  op_input->AddInputDesc(tensor);
+  op_input->AddOutputDesc(tensor);
+  op_input->SetInputOffset({1024});
+  op_input->SetOutputOffset({1024});
+  NodePtr node = graph->AddNode(op_input);
+
+  uint32_t data_op_index = 0;
+  map<uint32_t, OpDescPtr> data_by_index;
+  EXPECT_EQ(model.InitDataOp(nullptr, node, data_op_index, data_by_index), SUCCESS);
+
+  EXPECT_EQ(model.input_addrs_list_.size(), 0);
+  EXPECT_EQ(model.output_addrs_list_.size(), 0);
+  EXPECT_EQ(data_op_index, 0);
+  EXPECT_TRUE(data_by_index.empty());
+}
+
+TEST_F(UtestDavinciModel, init_netoutput_op_subgraph) {
+  DavinciModel model(0, nullptr);
+  model.runtime_param_.mem_base = (uint8_t *)0x08000000;
+  model.runtime_param_.mem_size = 5120000;
+  ComputeGraphPtr graph = make_shared<ComputeGraph>("default");
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  TensorUtils::SetSize(tensor, 512);
+
+  OpDescPtr op_output = CreateOpDesc("output", NETOUTPUT);
+  op_output->AddInputDesc(tensor);
+  op_output->SetInputOffset({1024});
+  op_output->SetSrcName( { "data" } );
+  op_output->SetSrcIndex( { 0 } );
+  NodePtr node = graph->AddNode(op_output);
+
+  std::vector<OpDescPtr> output_op_list;
+  EXPECT_EQ(model.InitNetOutput(nullptr, node, output_op_list), SUCCESS);
+
+  EXPECT_EQ(model.input_addrs_list_.size(), 0);
+  EXPECT_EQ(model.output_addrs_list_.size(), 0);
+  EXPECT_TRUE(output_op_list.empty());
+}
+
+TEST_F(UtestDavinciModel, init_unknown) {
+  DavinciModel model(0, nullptr);
+  model.SetKnownNode(true);
+  ComputeGraphPtr graph = make_shared<ComputeGraph>("default");
+
+  GeModelPtr ge_model = make_shared<GeModel>();
+  ge_model->SetGraph(GraphUtils::CreateGraphFromComputeGraph(graph));
+  AttrUtils::SetInt(ge_model, ATTR_MODEL_MEMORY_SIZE, 5120000);
+  AttrUtils::SetInt(ge_model, ATTR_MODEL_STREAM_NUM, 1);
+
+  shared_ptr<domi::ModelTaskDef> model_task_def = make_shared<domi::ModelTaskDef>();
+  ge_model->SetModelTaskDef(model_task_def);
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  TensorUtils::SetSize(tensor, 512);
+
+  OpDescPtr op_input = CreateOpDesc("data", DATA);
+  op_input->AddInputDesc(tensor);
+  op_input->AddOutputDesc(tensor);
+  op_input->SetInputOffset({1024});
+  op_input->SetOutputOffset({1024});
+  NodePtr node_input = graph->AddNode(op_input);    // op_index = 0
+
+  OpDescPtr op_kernel = CreateOpDesc("square", "Square");
+  op_kernel->AddInputDesc(tensor);
+  op_kernel->AddOutputDesc(tensor);
+  op_kernel->SetInputOffset({1024});
+  op_kernel->SetOutputOffset({1024});
+  NodePtr node_kernel = graph->AddNode(op_kernel);  // op_index = 1
+
+  OpDescPtr op_memcpy = CreateOpDesc("memcpy", MEMCPYASYNC);
+  op_memcpy->AddInputDesc(tensor);
+  op_memcpy->AddOutputDesc(tensor);
+  op_memcpy->SetInputOffset({1024});
+  op_memcpy->SetOutputOffset({5120});
+  NodePtr node_memcpy = graph->AddNode(op_memcpy);  // op_index = 2
+
+  OpDescPtr op_output = CreateOpDesc("output", NETOUTPUT);
+  op_output->AddInputDesc(tensor);
+  op_output->SetInputOffset({5120});
+  op_output->SetSrcName( { "memcpy" } );
+  op_output->SetSrcIndex( { 0 } );
+  NodePtr node_output = graph->AddNode(op_output);  // op_index = 3
+
+
+  domi::TaskDef *task_def1 = model_task_def->add_task();
+  task_def1->set_stream_id(0);
+  task_def1->set_type(RT_MODEL_TASK_KERNEL);
+  domi::KernelDef *kernel_def = task_def1->mutable_kernel();
+  kernel_def->set_stub_func("stub_func");
+  kernel_def->set_args_size(64);
+  string args(64, '1');
+  kernel_def->set_args(args.data(), 64);
+  domi::KernelContext *context = kernel_def->mutable_context();
+  context->set_op_index(1);
+  context->set_kernel_type(2);    // ccKernelType::TE
+  uint16_t args_offset[9] = {0};
+  context->set_args_offset(args_offset, 9 * sizeof(uint16_t));
+
+  domi::TaskDef *task_def2 = model_task_def->add_task();
+  task_def2->set_stream_id(0);
+  task_def2->set_type(RT_MODEL_TASK_MEMCPY_ASYNC);
+  domi::MemcpyAsyncDef *memcpy_async = task_def2->mutable_memcpy_async();
+  memcpy_async->set_src(1024);
+  memcpy_async->set_dst(5120);
+  memcpy_async->set_dst_max(512);
+  memcpy_async->set_count(1);
+  memcpy_async->set_kind(RT_MEMCPY_DEVICE_TO_DEVICE);
+  memcpy_async->set_op_index(2);
+
+  EXPECT_EQ(model.Assign(ge_model), SUCCESS);
+  EXPECT_EQ(model.Init(), SUCCESS);
+
+  EXPECT_EQ(model.input_addrs_list_.size(), 1);
+  EXPECT_EQ(model.output_addrs_list_.size(), 1);
+  EXPECT_EQ(model.task_list_.size(), 2);
+
+  EXPECT_EQ(model.task_list_[0]->UpdateArgs(), SUCCESS);
+  EXPECT_EQ(model.task_list_[1]->UpdateArgs(), SUCCESS);
+
+  vector<string> out_shape_info;
+  model.GetModelAttr(out_shape_info);
+
+  vector<InputOutputDescInfo> input_descs;
+  vector<InputOutputDescInfo> output_descs;
+  EXPECT_EQ(model.GetInputOutputDescInfo(input_descs, output_descs), SUCCESS);
+
+  int32_t virtual_addr = 0;
+  const vector<void *> inputs = { &virtual_addr };
+  const vector<void *> outputs = { &virtual_addr  };
+  EXPECT_EQ(model.UpdateKnownNodeArgs(inputs, outputs), SUCCESS);
+}
+
+TEST_F(UtestDavinciModel, Init_variable_op) {
+  DavinciModel model(0, nullptr);
+  model.ge_model_ = make_shared<GeModel>();
+  model.runtime_param_.mem_base = (uint8_t *)0x08000000;
+  model.runtime_param_.mem_size = 5120000;
+  ComputeGraphPtr graph = make_shared<ComputeGraph>("default");
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  TensorUtils::SetSize(tensor, 512);
+
+  OpDescPtr var1 = CreateOpDesc("var1", VARIABLE);
+  var1->AddInputDesc(tensor);
+  var1->AddOutputDesc(tensor);
+  var1->SetInputOffset({1024});
+  var1->SetOutputOffset({1024});
+  AttrUtils::SetBool(var1, VAR_ATTR_VAR_IS_BROADCAST, true);
+  graph->AddNode(var1);
+
+  OpDescPtr var2 = CreateOpDesc(NODE_NAME_GLOBAL_STEP, VARIABLE);
+  var2->AddInputDesc(tensor);
+  var2->AddOutputDesc(tensor);
+  var2->SetInputOffset({1024});
+  var2->SetOutputOffset({1024});
+  graph->AddNode(var2);
+
+  EXPECT_EQ(model.InitNodes(graph), SUCCESS);
+
+  EXPECT_EQ(model.ReturnNoOutput(1), PARAM_INVALID);
+  EXPECT_NE(model.SyncVarData(), SUCCESS);
+}
+
+TEST_F(UtestDavinciModel, InitRealSizeAndShapeInfo_succ1) {
+  DavinciModel model(0, nullptr);
+  model.ge_model_ = make_shared<GeModel>();
+  ComputeGraphPtr graph = make_shared<ComputeGraph>("default");
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  OpDescPtr op_output = CreateOpDesc("output_ascend_mbatch_batch_1", NETOUTPUT);
+  op_output->AddInputDesc(tensor);
+  op_output->SetInputOffset({1024});
+  NodePtr node_output = graph->AddNode(op_output);
+  EXPECT_EQ(model.InitRealSizeAndShapeInfo(graph, node_output), SUCCESS);
+}
+
+TEST_F(UtestDavinciModel, InitRealSizeAndShapeInfo_succ2) {
+  DavinciModel model(0, nullptr);
+  ComputeGraphPtr graph = std::make_shared<ComputeGraph>("test_graph");
+
+  OpDescPtr data1 = CreateOpDesc("data1", DATA);
+  GeTensorDesc shape_desc(GeShape({4,3,224,224}), FORMAT_NCHW, DT_FLOAT);
+  data1->AddInputDesc(shape_desc);
+  data1->AddOutputDesc(shape_desc);
+  NodePtr data1_node = graph->AddNode(data1);
+
+  OpDescPtr case_node = CreateOpDesc("case1", CASE);
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  case_node->AddInputDesc(tensor);
+  case_node->AddOutputDesc(tensor);
+  NodePtr case1_node = graph->AddNode(case_node);
+
+  OpDescPtr output = CreateOpDesc("output1", NETOUTPUT);
+  output->AddInputDesc(tensor);
+  output->SetSrcName( { "case1" } );
+  output->SetSrcIndex( { 0 } );
+  NodePtr output_node = graph->AddNode(output);
+
+  GraphUtils::AddEdge(data1_node->GetOutDataAnchor(0), case1_node->GetInDataAnchor(0));
+  GraphUtils::AddEdge(case1_node->GetOutDataAnchor(0), output_node->GetInDataAnchor(0));
+
+  (void)AttrUtils::SetStr(output_node->GetOpDesc(), ATTR_ALL_GEARS_INFO, "1;2;4;8");
+  (void)AttrUtils::SetBool(case_node, ATTR_INSERT_BY_MBATCH, true);
+
+  model.is_getnext_sink_dynamic_ = false;
+  model.is_online_infer_dynamic_ = true;
+  auto ret = model.InitRealSizeAndShapeInfo(graph, output_node);
+  // GetGearAndRealOutShapeInfo without ATTR_NAME_DYNAMIC_OUTPUT_DIMS
+  EXPECT_EQ(ret, SUCCESS);
+  vector<string> dynamic_output_dims = {"0,0,1,1,0,2,2,0,4,3,0,8"};
+  (void)AttrUtils::SetListStr(output_node->GetOpDesc(), ATTR_NAME_DYNAMIC_OUTPUT_DIMS, dynamic_output_dims);
+  ret = model.InitRealSizeAndShapeInfo(graph, output_node);
+  EXPECT_EQ(ret, SUCCESS);
+}
+
+TEST_F(UtestDavinciModel, InitRealSizeAndShapeInfo_succ3) {
+  DavinciModel model(0, nullptr);
+  ComputeGraphPtr graph = std::make_shared<ComputeGraph>("test_graph");
+
+  OpDescPtr data1 = CreateOpDesc("data1", DATA);
+  GeTensorDesc shape_desc(GeShape({4,3,224,224}), FORMAT_NCHW, DT_FLOAT);
+  data1->AddInputDesc(shape_desc);
+  data1->AddOutputDesc(shape_desc);
+  NodePtr data1_node = graph->AddNode(data1);
+
+  OpDescPtr shape_node = CreateOpDesc("ascend_mbatch_get_dynamic_dims_node", GETDYNAMICDIMS);
+  GeTensorDesc in_tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  GeTensorDesc out_tensor(GeShape({4,3}), FORMAT_NCHW, DT_FLOAT);
+  shape_node->AddInputDesc(in_tensor);
+  shape_node->AddOutputDesc(out_tensor);
+  NodePtr get_dynamic_dims_node = graph->AddNode(shape_node);
+
+  OpDescPtr output = CreateOpDesc("output1", NETOUTPUT);
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  output->AddInputDesc(tensor);
+  output->SetSrcName( { "data1", "ascend_mbatch_get_dynamic_dims_node" } );
+  output->SetSrcIndex( { 0, 1 } );
+  NodePtr output_node = graph->AddNode(output);
+  GraphUtils::AddEdge(data1_node->GetOutDataAnchor(0), output_node->GetInDataAnchor(0));
+  GraphUtils::AddEdge(get_dynamic_dims_node->GetOutDataAnchor(0), output_node->GetInDataAnchor(1));
+
+  (void)AttrUtils::SetStr(output_node->GetOpDesc(), ATTR_ALL_GEARS_INFO, "1,3;;4,3;,3");
+
+  model.is_getnext_sink_dynamic_ = true;
+  model.is_online_infer_dynamic_ = false;
+  auto ret = model.InitRealSizeAndShapeInfo(graph, output_node);
+  EXPECT_EQ(ret, SUCCESS);
+  model.runtime_param_.mem_base = (uint8_t *)0x08000000;
+  model.runtime_param_.mem_size = 4;
+  ret = model.InitRealSizeAndShapeInfo(graph, output_node);
+  EXPECT_EQ(ret, SUCCESS);
+}
+
+TEST_F(UtestDavinciModel, init_data_aipp_info) {
+  DavinciModel model(0, nullptr);
+  model.ge_model_ = make_shared<GeModel>();   // for CustAICPUKernelStore::GetCustAICPUKernelStore()
+  model.runtime_param_.mem_base = (uint8_t *)0x08000000;
+  model.runtime_param_.mem_size = 5120000;
+  ComputeGraphPtr graph = make_shared<ComputeGraph>("default");
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  TensorUtils::SetSize(tensor, 512);
+
+  OpDescPtr op_desc = CreateOpDesc("data", DATA);
+  op_desc->AddInputDesc(tensor);
+  op_desc->AddOutputDesc(tensor);
+  op_desc->SetInputOffset({1024});
+  op_desc->SetOutputOffset({1024});
+  NodePtr node = graph->AddNode(op_desc);
+
+  GeAttrValue::NAMED_ATTRS aipp_attr;
+  aipp_attr.SetAttr("aipp_mode", GeAttrValue::CreateFrom<GeAttrValue::INT>(domi::AippOpParams::dynamic));
+  aipp_attr.SetAttr("related_input_rank", GeAttrValue::CreateFrom<GeAttrValue::INT>(0));
+  aipp_attr.SetAttr("max_src_image_size", GeAttrValue::CreateFrom<GeAttrValue::INT>(2048));
+  aipp_attr.SetAttr("support_rotation", GeAttrValue::CreateFrom<GeAttrValue::INT>(1));
+  EXPECT_TRUE(AttrUtils::SetNamedAttrs(op_desc, ATTR_NAME_AIPP, aipp_attr));
+
+  AippConfigInfo aipp_info;
+  EXPECT_EQ(model.GetAippInfo(0, aipp_info), ACL_ERROR_GE_AIPP_NOT_EXIST);
+  EXPECT_EQ(model.InitNodes(graph), SUCCESS);
+  EXPECT_EQ(model.GetAippInfo(0, aipp_info), SUCCESS);
+  EXPECT_EQ(aipp_info.aipp_mode, domi::AippOpParams::dynamic);
+
+  EXPECT_EQ(model.input_addrs_list_.size(), 1);
+  EXPECT_EQ(model.output_addrs_list_.size(), 0);
+  EXPECT_EQ(model.op_list_.size(), 1);
+}
+
+TEST_F(UtestDavinciModel, init_data_aipp_static) {
+  DavinciModel model(0, nullptr);
+  model.ge_model_ = make_shared<GeModel>();   // for CustAICPUKernelStore::GetCustAICPUKernelStore()
+  model.runtime_param_.mem_base = (uint8_t *)0x08000000;
+  model.runtime_param_.mem_size = 5120000;
+  ComputeGraphPtr graph = make_shared<ComputeGraph>("default");
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  TensorUtils::SetSize(tensor, 512);
+
+  OpDescPtr op_desc = CreateOpDesc("data", DATA);
+  op_desc->AddInputDesc(tensor);
+  op_desc->AddOutputDesc(tensor);
+  op_desc->SetInputOffset({1024});
+  op_desc->SetOutputOffset({1024});
+  NodePtr node = graph->AddNode(op_desc);
+
+  AttrUtils::SetStr(op_desc, ATTR_DATA_RELATED_AIPP_MODE, "static_aipp");
+
+  InputAippType aipp_type;
+  size_t aipp_index = 0;
+  EXPECT_EQ(model.GetAippType(0, aipp_type, aipp_index), PARAM_INVALID);
+  EXPECT_EQ(model.InitNodes(graph), SUCCESS);
+  EXPECT_EQ(model.GetAippType(0, aipp_type, aipp_index), SUCCESS);
+  EXPECT_EQ(aipp_type, DATA_WITH_STATIC_AIPP);
+  EXPECT_EQ(aipp_index, 0xFFFFFFFFu);
+
+  EXPECT_EQ(model.input_addrs_list_.size(), 1);
+  EXPECT_EQ(model.output_addrs_list_.size(), 0);
+  EXPECT_EQ(model.op_list_.size(), 1);
+}
+
+TEST_F(UtestDavinciModel, init_data_aipp_dynamic) {
+  DavinciModel model(0, nullptr);
+  model.ge_model_ = make_shared<GeModel>();   // for CustAICPUKernelStore::GetCustAICPUKernelStore()
+  model.runtime_param_.mem_base = (uint8_t *)0x08000000;
+  model.runtime_param_.mem_size = 5120000;
+  ComputeGraphPtr graph = make_shared<ComputeGraph>("default");
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  TensorUtils::SetSize(tensor, 512);
+
+  OpDescPtr op_desc = CreateOpDesc("data", DATA);
+  op_desc->AddInputDesc(tensor);
+  op_desc->AddOutputDesc(tensor);
+  op_desc->SetInputOffset({1024});
+  op_desc->SetOutputOffset({1024});
+  NodePtr node = graph->AddNode(op_desc);   // op_index 0
+  AttrUtils::SetStr(op_desc, ATTR_DATA_RELATED_AIPP_MODE, "dynamic_aipp");
+  AttrUtils::SetStr(op_desc, ATTR_DATA_AIPP_DATA_NAME_MAP, "releated_aipp");
+
+  InputAippType aipp_type;
+  size_t aipp_index = 0;
+  EXPECT_EQ(model.GetAippType(0, aipp_type, aipp_index), PARAM_INVALID);
+  EXPECT_EQ(model.InitNodes(graph), SUCCESS);
+  EXPECT_EQ(model.GetAippType(0, aipp_type, aipp_index), SUCCESS);
+
+  EXPECT_EQ(model.input_addrs_list_.size(), 1);
+  EXPECT_EQ(model.output_addrs_list_.size(), 0);
+  EXPECT_EQ(model.op_list_.size(), 1);
+}
+
+TEST_F(UtestDavinciModel, init_data_aipp_releated) {
+  DavinciModel model(0, nullptr);
+  model.ge_model_ = make_shared<GeModel>();   // for CustAICPUKernelStore::GetCustAICPUKernelStore()
+  model.runtime_param_.mem_base = (uint8_t *)0x08000000;
+  model.runtime_param_.mem_size = 5120000;
+  ComputeGraphPtr graph = make_shared<ComputeGraph>("default");
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  TensorUtils::SetSize(tensor, 512);
+
+  {
+    OpDescPtr op_desc = CreateOpDesc("data", DATA);
+    op_desc->AddInputDesc(tensor);
+    op_desc->AddOutputDesc(tensor);
+    op_desc->SetInputOffset({1024});
+    op_desc->SetOutputOffset({1024});
+    NodePtr node = graph->AddNode(op_desc);   // op_index 0
+    AttrUtils::SetStr(op_desc, ATTR_DATA_RELATED_AIPP_MODE, "dynamic_aipp");
+    AttrUtils::SetStr(op_desc, ATTR_DATA_AIPP_DATA_NAME_MAP, "releated_aipp");
+  }
+  {
+    OpDescPtr op_desc = CreateOpDesc("releated_aipp", DATA);
+    op_desc->AddInputDesc(tensor);
+    op_desc->AddOutputDesc(tensor);
+    op_desc->SetInputOffset({1024});
+    op_desc->SetOutputOffset({1024});
+    NodePtr node = graph->AddNode(op_desc);   // op_index 1
+  }
+
+  InputAippType aipp_type;
+  size_t aipp_index = 0;
+  EXPECT_EQ(model.GetAippType(0, aipp_type, aipp_index), PARAM_INVALID);
+  EXPECT_EQ(model.InitNodes(graph), SUCCESS);
+  EXPECT_EQ(model.GetAippType(0, aipp_type, aipp_index), SUCCESS);
+  EXPECT_EQ(aipp_type, DATA_WITH_DYNAMIC_AIPP);
+  EXPECT_EQ(aipp_index, 1);
+
+  EXPECT_EQ(model.input_addrs_list_.size(), 2);
+  EXPECT_EQ(model.output_addrs_list_.size(), 0);
+  EXPECT_EQ(model.op_list_.size(), 2);
+}
+
+TEST_F(UtestDavinciModel, init_data_aipp_dynamic_conf) {
+  DavinciModel model(0, nullptr);
+  model.ge_model_ = make_shared<GeModel>();   // for CustAICPUKernelStore::GetCustAICPUKernelStore()
+  model.runtime_param_.mem_base = (uint8_t *)0x08000000;
+  model.runtime_param_.mem_size = 5120000;
+  ComputeGraphPtr graph = make_shared<ComputeGraph>("default");
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  TensorUtils::SetSize(tensor, 512);
+
+  OpDescPtr op_desc = CreateOpDesc("data", DATA);
+  op_desc->AddInputDesc(tensor);
+  op_desc->AddOutputDesc(tensor);
+  op_desc->SetInputOffset({1024});
+  op_desc->SetOutputOffset({1024});
+  NodePtr node = graph->AddNode(op_desc);   // op_index 0
+  AttrUtils::SetStr(op_desc, ATTR_DATA_RELATED_AIPP_MODE, "dynamic_aipp_conf");
+
+  InputAippType aipp_type;
+  size_t aipp_index = 0;
+  EXPECT_EQ(model.GetAippType(0, aipp_type, aipp_index), PARAM_INVALID);
+  EXPECT_EQ(model.InitNodes(graph), SUCCESS);
+  EXPECT_EQ(model.GetAippType(0, aipp_type, aipp_index), SUCCESS);
+  EXPECT_EQ(aipp_type, DYNAMIC_AIPP_NODE);
+  EXPECT_EQ(aipp_index, 0xFFFFFFFFU);
+
+  EXPECT_EQ(model.input_addrs_list_.size(), 1);
+  EXPECT_EQ(model.output_addrs_list_.size(), 0);
+  EXPECT_EQ(model.op_list_.size(), 1);
+}
+
+TEST_F(UtestDavinciModel, init_data_aipp_dynamic_invalid) {
+  DavinciModel model(0, nullptr);
+  model.ge_model_ = make_shared<GeModel>();   // for CustAICPUKernelStore::GetCustAICPUKernelStore()
+  model.runtime_param_.mem_base = (uint8_t *)0x08000000;
+  model.runtime_param_.mem_size = 5120000;
+  ComputeGraphPtr graph = make_shared<ComputeGraph>("default");
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  TensorUtils::SetSize(tensor, 512);
+
+  OpDescPtr op_desc = CreateOpDesc("data", DATA);
+  op_desc->AddInputDesc(tensor);
+  op_desc->AddOutputDesc(tensor);
+  op_desc->SetInputOffset({1024});
+  op_desc->SetOutputOffset({1024});
+  NodePtr node = graph->AddNode(op_desc);   // op_index 0
+  AttrUtils::SetStr(op_desc, ATTR_DATA_RELATED_AIPP_MODE, "dynamic_aipp_invalid");
+
+  InputAippType aipp_type;
+  size_t aipp_index = 0;
+  EXPECT_EQ(model.GetAippType(0, aipp_type, aipp_index), PARAM_INVALID);
+  EXPECT_EQ(model.InitNodes(graph), ACL_ERROR_GE_AIPP_MODE_INVALID);
+
+  EXPECT_EQ(model.input_addrs_list_.size(), 1);
+  EXPECT_EQ(model.output_addrs_list_.size(), 0);
+  EXPECT_EQ(model.op_list_.size(), 1);
+}
+
+TEST_F(UtestDavinciModel, init_data_aipp_input_info_empty) {
+  DavinciModel model(0, nullptr);
+  model.ge_model_ = make_shared<GeModel>();   // for CustAICPUKernelStore::GetCustAICPUKernelStore()
+  model.runtime_param_.mem_base = (uint8_t *)0x08000000;
+  model.runtime_param_.mem_size = 5120000;
+  ComputeGraphPtr graph = make_shared<ComputeGraph>("default");
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  TensorUtils::SetSize(tensor, 512);
+
+  OpDescPtr op_desc = CreateOpDesc("data", DATA);
+  op_desc->AddInputDesc(tensor);
+  op_desc->AddOutputDesc(tensor);
+  op_desc->SetInputOffset({1024});
+  op_desc->SetOutputOffset({1024});
+  NodePtr node = graph->AddNode(op_desc);   // op_index 0
+
+  vector<string> inputs = {};
+  AttrUtils::SetListStr(op_desc, ATTR_NAME_AIPP_INPUTS, inputs);
+  vector<string> outputs = {};
+  AttrUtils::SetListStr(op_desc, ATTR_NAME_AIPP_OUTPUTS, outputs);
+
+  OriginInputInfo orig_input_info;
+  EXPECT_EQ(model.GetOrigInputInfo(0, orig_input_info), ACL_ERROR_GE_AIPP_NOT_EXIST);
+  EXPECT_EQ(model.InitNodes(graph), SUCCESS);
+  EXPECT_EQ(model.GetOrigInputInfo(0, orig_input_info), SUCCESS);
+
+  EXPECT_EQ(model.input_addrs_list_.size(), 1);
+  EXPECT_EQ(model.output_addrs_list_.size(), 0);
+  EXPECT_EQ(model.op_list_.size(), 1);
+}
+
+TEST_F(UtestDavinciModel, init_data_aipp_input_info_normal) {
+  DavinciModel model(0, nullptr);
+  model.ge_model_ = make_shared<GeModel>();   // for CustAICPUKernelStore::GetCustAICPUKernelStore()
+  model.runtime_param_.mem_base = (uint8_t *)0x08000000;
+  model.runtime_param_.mem_size = 5120000;
+  ComputeGraphPtr graph = make_shared<ComputeGraph>("default");
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  TensorUtils::SetSize(tensor, 512);
+
+  OpDescPtr op_desc = CreateOpDesc("data", DATA);
+  op_desc->AddInputDesc(tensor);
+  op_desc->AddOutputDesc(tensor);
+  op_desc->SetInputOffset({1024});
+  op_desc->SetOutputOffset({1024});
+  NodePtr node = graph->AddNode(op_desc);   // op_index 0
+
+  vector<string> inputs = { "NCHW:DT_FLOAT:TensorName:TensorSize:3:1,2,8" };
+  AttrUtils::SetListStr(op_desc, ATTR_NAME_AIPP_INPUTS, inputs);
+  vector<string> outputs = { "NCHW:DT_FLOAT:TensorName:TensorSize:3:1,2,8" };
+  AttrUtils::SetListStr(op_desc, ATTR_NAME_AIPP_OUTPUTS, outputs);
+
+  OriginInputInfo orig_input_info;
+  EXPECT_EQ(model.GetOrigInputInfo(0, orig_input_info), ACL_ERROR_GE_AIPP_NOT_EXIST);
+  EXPECT_EQ(model.InitNodes(graph), SUCCESS);
+  EXPECT_EQ(model.GetOrigInputInfo(0, orig_input_info), SUCCESS);
+
+  EXPECT_EQ(model.input_addrs_list_.size(), 1);
+  EXPECT_EQ(model.output_addrs_list_.size(), 0);
+  EXPECT_EQ(model.op_list_.size(), 1);
+}
+
+TEST_F(UtestDavinciModel, init_data_aipp_input_info_invalid) {
+  DavinciModel model(0, nullptr);
+  model.ge_model_ = make_shared<GeModel>();   // for CustAICPUKernelStore::GetCustAICPUKernelStore()
+  model.runtime_param_.mem_base = (uint8_t *)0x08000000;
+  model.runtime_param_.mem_size = 5120000;
+  ComputeGraphPtr graph = make_shared<ComputeGraph>("default");
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  TensorUtils::SetSize(tensor, 512);
+
+  OpDescPtr op_desc = CreateOpDesc("data", DATA);
+  op_desc->AddInputDesc(tensor);
+  op_desc->AddOutputDesc(tensor);
+  op_desc->SetInputOffset({1024});
+  op_desc->SetOutputOffset({1024});
+  NodePtr node = graph->AddNode(op_desc);   // op_index 0
+
+  vector<string> inputs = { "NCHW:DT_FLOAT:TensorName" };     // Invalid
+  AttrUtils::SetListStr(op_desc, ATTR_NAME_AIPP_INPUTS, inputs);
+  vector<string> outputs = { "NCHW:DT_FLOAT:TensorName:TensorSize:3:1,2,8" };
+  AttrUtils::SetListStr(op_desc, ATTR_NAME_AIPP_OUTPUTS, outputs);
+
+  OriginInputInfo orig_input_info;
+  EXPECT_EQ(model.GetOrigInputInfo(0, orig_input_info), ACL_ERROR_GE_AIPP_NOT_EXIST);
+  EXPECT_EQ(model.InitNodes(graph), ACL_ERROR_GE_AIPP_MODE_INVALID);
+  EXPECT_EQ(model.GetOrigInputInfo(0, orig_input_info), ACL_ERROR_GE_AIPP_NOT_EXIST);
+
+  EXPECT_EQ(model.input_addrs_list_.size(), 1);
+  EXPECT_EQ(model.output_addrs_list_.size(), 0);
+  EXPECT_EQ(model.op_list_.size(), 1);
+}
+
+TEST_F(UtestDavinciModel, init_data_aipp_input_dims_normal) {
+  DavinciModel model(0, nullptr);
+  model.ge_model_ = make_shared<GeModel>();   // for CustAICPUKernelStore::GetCustAICPUKernelStore()
+  model.runtime_param_.mem_base = (uint8_t *)0x08000000;
+  model.runtime_param_.mem_size = 5120000;
+  ComputeGraphPtr graph = make_shared<ComputeGraph>("default");
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  TensorUtils::SetSize(tensor, 512);
+
+  OpDescPtr op_desc = CreateOpDesc("data", DATA);
+  op_desc->AddInputDesc(tensor);
+  op_desc->AddOutputDesc(tensor);
+  op_desc->SetInputOffset({1024});
+  op_desc->SetOutputOffset({1024});
+  NodePtr node = graph->AddNode(op_desc);   // op_index 0
+
+  vector<string> inputs = { "NCHW:DT_FLOAT:TensorName:TensorSize:3:1,2,8" };
+  AttrUtils::SetListStr(op_desc, ATTR_NAME_AIPP_INPUTS, inputs);
+  vector<string> outputs = { "NCHW:DT_FLOAT:TensorName:TensorSize:3:1,2,8" };
+  AttrUtils::SetListStr(op_desc, ATTR_NAME_AIPP_OUTPUTS, outputs);
+
+  vector<InputOutputDims> input_dims;
+  vector<InputOutputDims> output_dims;
+  EXPECT_EQ(model.GetAllAippInputOutputDims(0, input_dims, output_dims), ACL_ERROR_GE_AIPP_NOT_EXIST);
+  EXPECT_EQ(model.InitNodes(graph), SUCCESS);
+  EXPECT_EQ(model.GetAllAippInputOutputDims(0, input_dims, output_dims), SUCCESS);
+  EXPECT_EQ(input_dims.size(), 1);
+  EXPECT_EQ(output_dims.size(), 1);
+
+  EXPECT_EQ(model.input_addrs_list_.size(), 1);
+  EXPECT_EQ(model.output_addrs_list_.size(), 0);
+  EXPECT_EQ(model.op_list_.size(), 1);
+}
+}  // namespace ge
diff --git a/tests/ut/ge/graph/load/hccl_task_info_unittest.cc b/tests/ut/ge/graph/load/hccl_task_info_unittest.cc
new file mode 100644
index 00000000..5c056007
--- /dev/null
+++ b/tests/ut/ge/graph/load/hccl_task_info_unittest.cc
@@ -0,0 +1,134 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#define private public
+#define protected public
+
+#include "graph/load/new_model_manager/davinci_model.h"
+#include "graph/load/new_model_manager/task_info/hccl_task_info.h"
+
+namespace ge {
+class UtestHcclTaskInfo : public testing::Test {
+ protected:
+  void SetUp() {}
+
+  void TearDown() {}
+};
+
+
+// test success GetTaskID
+TEST_F(UtestHcclTaskInfo, success_get_task_id) {
+  domi::ModelTaskDef model_task_def;
+  domi::TaskDef *task = model_task_def.add_task();
+  task->set_type(RT_MODEL_TASK_KERNEL);
+  TaskInfoPtr task_info = TaskInfoFactory::Instance().Create(static_cast<rtModelTaskType_t>(task->type()));
+
+  EXPECT_EQ(task_info->GetTaskID(), 0);
+
+  HcclTaskInfo hccl_task_info;
+  EXPECT_EQ(hccl_task_info.GetTaskID(), 0);
+}
+
+// test init EventRecordTaskInfo
+TEST_F(UtestHcclTaskInfo, success_create_stream) {
+  DavinciModel model(0, nullptr);
+
+  HcclTaskInfo hccl_task_info;
+  EXPECT_EQ(hccl_task_info.CreateStream(3, &model, 0), SUCCESS);
+}
+
+// test hccl_Distribute
+TEST_F(UtestHcclTaskInfo, success_distribute7) {
+  DavinciModel model(0, nullptr);
+
+  domi::ModelTaskDef model_task_def;
+  domi::TaskDef *task7 = model_task_def.add_task();
+  task7->set_type(RT_MODEL_TASK_HCCL);
+  TaskInfoPtr task_info7 = TaskInfoFactory::Instance().Create(static_cast<rtModelTaskType_t>(task7->type()));
+  Status ret = task_info7->Init(task7[0], &model);
+  EXPECT_EQ(FAILED, ret);
+
+  std::vector<TaskInfoPtr> task_list;
+  task_list.push_back(task_info7);
+  model.task_list_ = task_list;
+
+  EXPECT_EQ(task_info7->Release(), SUCCESS);
+}
+
+// test hccl_Distribute
+TEST_F(UtestHcclTaskInfo, success_distribute7_with_hccl_type) {
+  DavinciModel model(0, nullptr);
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+
+  domi::TaskDef task_def;
+  HcclTaskInfo hccl_task_info;
+  EXPECT_EQ(hccl_task_info.Init(task_def, nullptr), PARAM_INVALID);
+
+
+  domi::KernelHcclDef *kernel_hccl_def = task_def.mutable_kernel_hccl();
+  kernel_hccl_def->set_op_index(0);
+  kernel_hccl_def->set_hccl_type("HcomBroadcast");
+  model.op_list_[0] = std::make_shared<OpDesc>("FrameworkOp", "FrameworkOp");
+  EXPECT_EQ(hccl_task_info.Init(task_def, &model), SUCCESS);
+
+  task_def.clear_kernel_hccl();
+}
+
+// test hccl_GetPrivateDefByTaskDef
+TEST_F(UtestHcclTaskInfo, success_hccl_get_private_def_by_task_def) {
+  DavinciModel model(0, nullptr);
+
+  domi::ModelTaskDef model_task_def;
+  domi::TaskDef *task7 = model_task_def.add_task();
+  task7->set_type(RT_MODEL_TASK_HCCL);
+  // for SetStream
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_.push_back(stream);
+  // for GetPrivateDefByTaskDef
+  task7->set_ops_kernel_store_ptr(10);
+  std::string value = "hccl_task";
+  task7->set_private_def(value);
+
+  TaskInfoPtr task_info7 = TaskInfoFactory::Instance().Create(static_cast<rtModelTaskType_t>(task7->type()));
+  // for Distribute
+  EXPECT_EQ(task_info7->Init(task7[0], &model), PARAM_INVALID);
+
+  EXPECT_EQ(task_info7->Release(), SUCCESS);
+}
+
+// test hccl_task_TransToGETaskInfo
+TEST_F(UtestHcclTaskInfo, success_hccl_trans_to_ge_task_info) {
+  DavinciModel model(0, nullptr);
+
+  domi::ModelTaskDef model_task_def;
+  domi::TaskDef *task7 = model_task_def.add_task();
+  // for type
+  task7->set_type(RT_MODEL_TASK_HCCL);
+  TaskInfoPtr task_info7 = TaskInfoFactory::Instance().Create(static_cast<rtModelTaskType_t>(task7->type()));
+
+  GETaskInfo ge_task;
+  HcclTaskInfo hccl_task_info;
+  hccl_task_info.TransToGETaskInfo(ge_task);
+
+  EXPECT_EQ(task_info7->Release(), SUCCESS);
+}
+
+}  // namespace ge
diff --git a/tests/ut/ge/graph/load/kernel_ex_task_info_unittest.cc b/tests/ut/ge/graph/load/kernel_ex_task_info_unittest.cc
new file mode 100644
index 00000000..443d2975
--- /dev/null
+++ b/tests/ut/ge/graph/load/kernel_ex_task_info_unittest.cc
@@ -0,0 +1,140 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#define private public
+#define protected public
+
+#include "graph/load/new_model_manager/davinci_model.h"
+
+#include "graph/load/new_model_manager/task_info/kernel_ex_task_info.h"
+#include "cce/aicpu_engine_struct.h"
+
+namespace ge {
+extern OpDescPtr CreateOpDesc(string name, string type);
+
+class UtestKernelExTaskInfo : public testing::Test {
+ protected:
+  void SetUp() {}
+
+  void TearDown() {}
+};
+
+// test kernel_ex_task_Release
+TEST_F(UtestKernelExTaskInfo, success_kernel_ex_task_init) {
+  domi::TaskDef task_def;
+  KernelExTaskInfo kernel_ex_task_info;
+  EXPECT_EQ(kernel_ex_task_info.Init(task_def, nullptr), PARAM_INVALID);
+
+  DavinciModel model(0, nullptr);
+  EXPECT_EQ(kernel_ex_task_info.Init(task_def, &model), FAILED);
+
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_.push_back(stream);
+  domi::KernelExDef *kernel_ex_def = task_def.mutable_kernel_ex();
+  kernel_ex_def->set_op_index(1);
+  model.op_list_[0] = CreateOpDesc("FrameworkOp", "FrameworkOp");
+  EXPECT_EQ(kernel_ex_task_info.Init(task_def, &model), INTERNAL_ERROR);
+
+  kernel_ex_def->clear_op_index();
+  kernel_ex_def->set_op_index(0);
+  EXPECT_EQ(kernel_ex_task_info.Init(task_def, &model), FAILED);
+
+  kernel_ex_def->set_task_info("KernelEx");
+  kernel_ex_def->set_task_info_size(1);
+  EXPECT_EQ(kernel_ex_task_info.Init(task_def, &model), FAILED);
+
+
+  constexpr uint32_t arg_size = sizeof(STR_FWK_OP_KERNEL);
+  string value1(arg_size, 'a');
+  kernel_ex_def->set_args_size(arg_size);
+  kernel_ex_def->set_args(value1);
+  EXPECT_EQ(kernel_ex_task_info.Init(task_def, &model), FAILED);
+
+
+  task_def.clear_kernel_ex();
+}
+
+// test kernel_ex_task_Release
+TEST_F(UtestKernelExTaskInfo, success_kernel_ex_task_release) {
+  KernelExTaskInfo kernel_ex_task_info;
+  EXPECT_EQ(kernel_ex_task_info.Release(), SUCCESS);
+
+  kernel_ex_task_info.kernel_buf_ = nullptr;
+  rtMalloc(&kernel_ex_task_info.input_output_addr_, 64, RT_MEMORY_HBM);
+  EXPECT_EQ(kernel_ex_task_info.Release(), SUCCESS);
+
+  kernel_ex_task_info.input_output_addr_ = nullptr;
+  rtMalloc(&kernel_ex_task_info.kernel_buf_, 64, RT_MEMORY_HBM);
+  EXPECT_EQ(kernel_ex_task_info.Release(), SUCCESS);
+
+  rtMalloc(&kernel_ex_task_info.kernel_buf_, 64, RT_MEMORY_HBM);
+  rtMalloc(&kernel_ex_task_info.input_output_addr_, 64, RT_MEMORY_HBM);
+  EXPECT_EQ(kernel_ex_task_info.Release(), SUCCESS);
+}
+
+// test kernel_ex_task_Release
+TEST_F(UtestKernelExTaskInfo, success_kernel_ex_task_info_copy) {
+  DavinciModel model(0, nullptr);
+  model.runtime_param_.mem_base = (uint8_t *)0x12345;
+  model.runtime_param_.mem_size = 100332000;
+
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_.push_back(stream);
+
+  domi::TaskDef task_def;
+  KernelExTaskInfo kernel_ex_task_info;
+
+  domi::KernelExDef *kernel_ex_def = task_def.mutable_kernel_ex();
+  kernel_ex_def->set_task_info_size(150);
+  kernel_ex_def->set_op_index(0);
+  model.op_list_[0] = CreateOpDesc("FrameworkOp", "FrameworkOp");
+
+  EXPECT_EQ(kernel_ex_task_info.Init(task_def, &model), FAILED);  // workspace empty.
+
+  model.op_list_[0]->SetWorkspace({100331008});   // offset
+  model.op_list_[0]->SetWorkspaceBytes({0});      // length
+  EXPECT_EQ(kernel_ex_task_info.Init(task_def, &model), FAILED);  // workspace addr is null.
+
+  model.op_list_[0]->SetWorkspace({100331008});   // offset
+  model.op_list_[0]->SetWorkspaceBytes({10});     // length
+  EXPECT_EQ(kernel_ex_task_info.Init(task_def, &model), FAILED);  // workspace addr is small.
+
+  model.op_list_[0]->SetWorkspace({100331008});   // offset
+  model.op_list_[0]->SetWorkspaceBytes({150});    // length
+  EXPECT_EQ(kernel_ex_task_info.Init(task_def, &model), SUCCESS);
+
+  task_def.clear_kernel_ex();
+  model.runtime_param_.mem_base = nullptr;
+}
+
+TEST_F(UtestKernelExTaskInfo, kernel_ex_task_info_calculate_args) {
+  DavinciModel model(0, nullptr);
+  domi::TaskDef task_def;
+  domi::KernelExDef *kernel_ex_def = task_def.mutable_kernel_ex();
+  kernel_ex_def->set_op_index(0);
+  model.op_list_[0] = CreateOpDesc("FrameworkOp", "FrameworkOp");
+
+  AttrUtils::SetStr(model.op_list_[0], ATTR_DYNAMIC_SHAPE_FIXED_ADDR, "Hello Mr Tree");
+
+  KernelExTaskInfo kernel_ex_task_info;
+  EXPECT_EQ(kernel_ex_task_info.CalculateArgs(task_def, &model), FAILED);
+}
+
+}  // namespace ge
diff --git a/tests/ut/ge/graph/load/kernel_task_info_unittest.cc b/tests/ut/ge/graph/load/kernel_task_info_unittest.cc
new file mode 100644
index 00000000..fe886b49
--- /dev/null
+++ b/tests/ut/ge/graph/load/kernel_task_info_unittest.cc
@@ -0,0 +1,1198 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#define private public
+#define protected public
+
+#include "graph/load/new_model_manager/davinci_model.h"
+#include "graph/load/new_model_manager/task_info/kernel_task_info.h"
+#include "graph/load/new_model_manager/task_info/hccl_task_info.h"
+
+namespace ge {
+extern OpDescPtr CreateOpDesc(string name, string type);
+
+class UtestKernelTaskInfo : public testing::Test {
+ protected:
+  void SetUp() {}
+
+  void TearDown() {}
+};
+
+// test KernelTaskInfo Init.
+TEST_F(UtestKernelTaskInfo, success_kernel_taskInfo_not_te) {
+  DavinciModel model(0, nullptr);
+  domi::ModelTaskDef model_task_def;
+  domi::TaskDef *task = model_task_def.add_task();
+  task->set_type(RT_MODEL_TASK_KERNEL);
+  TaskInfoPtr task_info = TaskInfoFactory::Instance().Create(static_cast<rtModelTaskType_t>(task->type()));
+
+  task->stream_id_ = 0;
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+
+  domi::KernelDef *kernel_def = task->mutable_kernel();
+  domi::KernelContext *ctx = kernel_def->mutable_context();
+  model.op_list_[0] = CreateOpDesc("relu", RELU);
+  ctx->set_op_index(0);
+
+  EXPECT_EQ(task_info->Init(*task, &model), FAILED);
+
+  kernel_def->set_block_dim(10);
+  kernel_def->set_args("args111111", 10);
+  kernel_def->set_args_size(10);
+
+  ctx->set_kernel_type(0);
+  EXPECT_EQ(task_info->Init(*task, &model), INTERNAL_ERROR);
+
+  task_info->Release();
+}
+
+TEST_F(UtestKernelTaskInfo, success_init_kernel_task_info_fail) {
+  DavinciModel model(0, nullptr);
+  KernelTaskInfo kernel_task_info;
+  domi::TaskDef task_def;
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *ctx = kernel_def->mutable_context();
+
+  model.op_list_[0] = CreateOpDesc("relu", RELU);
+  ctx->set_op_index(0);
+
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+
+  // Failed by rtGetFunctionByName.
+  EXPECT_EQ(kernel_task_info.Init(task_def, &model), FAILED);
+}
+
+// test InitTVMTask failed
+TEST_F(UtestKernelTaskInfo, init_tvm_task_fail) {
+  DavinciModel model(0, nullptr);
+  domi::TaskDef task_def;
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+
+  EXPECT_EQ(kernel_task_info.InitTVMTask(0, *kernel_def), PARAM_INVALID);
+  task_def.clear_kernel();
+}
+
+// test InitTVMTask with kernel_type is TE
+TEST_F(UtestKernelTaskInfo, init_tvm_task_info_with_te_kernel_type) {
+  DavinciModel model(0, nullptr);
+  domi::TaskDef task_def;
+
+  // DavinciModel is nullptr
+  KernelTaskInfo kernel_task_info;
+  EXPECT_EQ(kernel_task_info.Init(task_def, nullptr), PARAM_INVALID);
+
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("", "");
+
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  rtSmDesc_t l2CtrlInfo;
+  l2CtrlInfo.data[0].L2_mirror_addr = 1024;
+
+  kernel_def->set_args("args111111", 10);
+  kernel_def->set_args_size(10);
+  kernel_def->set_sm_desc(&l2CtrlInfo, sizeof(rtSmDesc_t));
+  kernel_def->set_flowtable("fl", 2);
+  kernel_def->set_block_dim(10);
+
+  domi::KernelContext *ctx = kernel_def->mutable_context();
+  ctx->set_kernel_type(2);
+  ctx->set_op_index(4);
+  ctx->set_args_offset("\0\0"); // args_offset = 0
+  EXPECT_EQ(kernel_task_info.Init(task_def, &model), PARAM_INVALID);
+
+  ctx->clear_args_offset();
+  ctx->set_args_offset("args111111", 10);
+  EXPECT_EQ(kernel_task_info.Init(task_def, &model), PARAM_INVALID);
+
+  ctx->clear_op_index();
+  ctx->set_op_index(0);
+  EXPECT_EQ(kernel_task_info.Init(task_def, &model), FAILED);
+
+  EXPECT_EQ(kernel_task_info.Release(), SUCCESS);
+
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+// test InitAICPUCustomTask with kernel_type is CUSTOMIZED
+TEST_F(UtestKernelTaskInfo, init_kernel_task_info_with_customized_kernel_type) {
+  DavinciModel model(0, nullptr);
+
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("", "");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+
+  rtSmDesc_t l2CtrlInfo;
+  l2CtrlInfo.data[0].L2_mirror_addr = 1024;
+
+  kernel_def->set_args("args111111", 10);
+  kernel_def->set_args_size(10);
+  kernel_def->set_sm_desc(&l2CtrlInfo, sizeof(rtSmDesc_t));
+  kernel_def->set_flowtable("fl", 2);
+  kernel_def->set_block_dim(10);
+
+  domi::KernelContext *ctx = kernel_def->mutable_context();
+  ctx->set_kernel_type(3);
+  ctx->set_op_index(4);
+  ctx->set_args_offset("\0\0"); // args_offset = 0
+  EXPECT_EQ(kernel_task_info.Init(task_def, &model), PARAM_INVALID);
+
+  ctx->clear_args_offset();
+  ctx->set_args_offset("args111111", 10);
+  EXPECT_EQ(kernel_task_info.Init(task_def, &model), PARAM_INVALID);
+
+  ctx->clear_args_offset();
+  ctx->set_op_index(0);
+
+  const char task[] = "opattr";
+  AttrUtils::SetBytes(model.op_list_[0], ATTR_NAME_OPATTR, Buffer::CopyFrom((uint8_t *)task, sizeof(task)));
+  EXPECT_EQ(kernel_task_info.Init(task_def, &model), PARAM_INVALID);
+
+  EXPECT_EQ(kernel_task_info.Release(), SUCCESS);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+// test InitAICPUCustomTask failed
+TEST_F(UtestKernelTaskInfo, init_aicpu_custom_task_failed) {
+  DavinciModel model(0, nullptr);
+
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("", "");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *context = kernel_def->mutable_context();
+  context->set_args_offset("\0\0");
+  kernel_task_info.davinci_model_ = &model;
+
+  EXPECT_EQ(kernel_task_info.InitAICPUCustomTask(0, *kernel_def), PARAM_INVALID);
+  EXPECT_EQ(kernel_task_info.Release(), SUCCESS);
+
+  context->clear_args_offset();
+  context->set_args_offset("args111111", 10);
+  // KernelTaskInfo::StoreInputOutputTensor   -> SUCCESS
+  EXPECT_EQ(kernel_task_info.InitAICPUCustomTask(0, *kernel_def), FAILED);
+  EXPECT_EQ(kernel_task_info.Release(), SUCCESS);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+// test InitAICPUCustomTask failed
+TEST_F(UtestKernelTaskInfo, init_aicpu_custom_task_failed2) {
+  DavinciModel model(0, nullptr);
+
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("", "");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *context = kernel_def->mutable_context();
+  kernel_task_info.davinci_model_ = &model;
+
+  context->set_args_offset("\0\0");
+  // KernelTaskInfo::StoreInputOutputTensor   -> SUCCESS
+  // AttrUtils::GetBytes  -> true
+  EXPECT_EQ(kernel_task_info.InitAICPUCustomTask(0, *kernel_def), PARAM_INVALID);
+  EXPECT_EQ(kernel_task_info.Release(), SUCCESS);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+// test InitAICPUCustomTask failed
+TEST_F(UtestKernelTaskInfo, init_aicpu_custom_task_failed3) {
+  DavinciModel model(0, nullptr);
+
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("", "");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *context = kernel_def->mutable_context();
+  kernel_task_info.davinci_model_ = &model;
+
+  context->set_args_offset("\0\0");
+  // KernelTaskInfo::StoreInputOutputTensor   -> SUCCESS
+  EXPECT_EQ(kernel_task_info.InitAICPUCustomTask(0, *kernel_def), PARAM_INVALID);
+  EXPECT_EQ(kernel_task_info.Release(), SUCCESS);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+// test InitAICPUCustomTask failed
+TEST_F(UtestKernelTaskInfo, init_aicpu_custom_task_failed4) {
+  DavinciModel model(0, nullptr);
+
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("", "");
+
+  const char task[] = "opattr";
+  AttrUtils::SetBytes(model.op_list_[0], ATTR_NAME_OPATTR, Buffer::CopyFrom((uint8_t *)task, sizeof(task)));
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *context = kernel_def->mutable_context();
+  kernel_task_info.davinci_model_ = &model;
+
+  context->set_args_offset("args111111", 10);
+  // KernelTaskInfo::StoreInputOutputTensor   -> SUCCESS
+  // rtMalloc RT_ERROR_NONE
+  EXPECT_EQ(kernel_task_info.InitAICPUCustomTask(0, *kernel_def), FAILED);
+  EXPECT_EQ(kernel_task_info.Release(), SUCCESS);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+// test InitAICPUCustomTask failed
+TEST_F(UtestKernelTaskInfo, init_aicpu_custom_task_failed5) {
+  DavinciModel model(0, nullptr);
+
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("", "");
+
+  const char task[] = "opattr";
+  AttrUtils::SetBytes(model.op_list_[0], ATTR_NAME_OPATTR, Buffer::CopyFrom((uint8_t *)task, sizeof(task)));
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *context = kernel_def->mutable_context();
+  kernel_task_info.davinci_model_ = &model;
+
+  context->set_args_offset("args111111", 10);
+  // KernelTaskInfo::StoreInputOutputTensor   -> SUCCESS
+  // rtMalloc RT_ERROR_NONE
+  // rtMemcpy RT_ERROR_INVALID_VALIUE
+  EXPECT_EQ(kernel_task_info.InitAICPUCustomTask(0, *kernel_def), FAILED);
+  EXPECT_EQ(kernel_task_info.Release(), SUCCESS);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+// test InitAICPUCustomTask failed
+TEST_F(UtestKernelTaskInfo, init_aicpu_custom_task_failed6) {
+  DavinciModel model(0, nullptr);
+
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("", "");
+
+  const char task[] = "opattr";
+  AttrUtils::SetBytes(model.op_list_[0], ATTR_NAME_OPATTR, Buffer::CopyFrom((uint8_t *)task, sizeof(task)));
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *context = kernel_def->mutable_context();
+  kernel_task_info.davinci_model_ = &model;
+
+  context->set_args_offset("args111111", 10);
+  // KernelTaskInfo::StoreInputOutputTensor   -> SUCCESS
+  // rtMalloc RT_ERROR_NONE
+  // rtMemcpy RT_ERROR_NONE
+  EXPECT_EQ(kernel_task_info.InitAICPUCustomTask(0, *kernel_def), FAILED);
+  EXPECT_EQ(kernel_task_info.Release(), SUCCESS);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+TEST_F(UtestKernelTaskInfo, init_kernel_taskInfo_with_aicpu_kernel_type) {
+  DavinciModel model(0, nullptr);
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("FrameworkOp", "FrameworkOp");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+
+  task_def.set_type(RT_MODEL_TASK_KERNEL);
+  string args;
+  args.append(100, '1');
+  kernel_def->set_so_name("libDvpp.so");
+  kernel_def->set_kernel_name("DvppResize");
+  kernel_def->set_args(args.data(), 100);
+  kernel_def->set_args_size(100);
+
+  domi::KernelContext *ctx = kernel_def->mutable_context();
+  ctx->set_kernel_type(6);
+  ctx->set_op_index(0);
+
+  // ModelUtils::GetInputDataAddrs  -> ok
+  // ModelUtils::GetOutputDataAddrs -> ok
+  // rtMalloc -> RT_ERROR_NONE
+  // rtMemcpy -> RT_ERROR_NONE
+  EXPECT_EQ(kernel_task_info.Init(task_def, &model), SUCCESS);
+
+  EXPECT_EQ(kernel_task_info.Distribute(), SUCCESS);
+  EXPECT_EQ(kernel_task_info.Release(), SUCCESS);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+TEST_F(UtestKernelTaskInfo, init_kernel_taskInfo_with_aicpu_kernel_type_fail) {
+  DavinciModel model(0, nullptr);
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("FrameworkOp", "FrameworkOp");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+
+  task_def.set_type(RT_MODEL_TASK_KERNEL);
+  string args;
+  args.append(100, '1');
+  kernel_def->set_so_name("libDvpp.so");
+  kernel_def->set_kernel_name("DvppResize");
+  kernel_def->set_args(args.data(), 100);
+  kernel_def->set_args_size(100);
+
+  domi::KernelContext *ctx = kernel_def->mutable_context();
+  ctx->set_kernel_type(6);
+  ctx->set_op_index(0);
+
+  // ModelUtils::GetInputDataAddrs  -> ok
+  // ModelUtils::GetOutputDataAddrs -> ok
+  // rtMalloc -> RT_ERROR_NONE
+  // rtMemcpy -> RT_ERROR_INVALID_VALUE
+  EXPECT_EQ(kernel_task_info.Init(task_def, &model), SUCCESS);
+
+  EXPECT_EQ(kernel_task_info.Distribute(), SUCCESS);
+  EXPECT_EQ(kernel_task_info.Release(), SUCCESS);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+TEST_F(UtestKernelTaskInfo, init_kernel_taskInfo_with_aicpu_kernel_type_fail2) {
+  DavinciModel model(0, nullptr);
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("FrameworkOp", "FrameworkOp");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+
+  task_def.set_type(RT_MODEL_TASK_KERNEL);
+  string args;
+  args.append(100, '1');
+  kernel_def->set_so_name("libDvpp.so");
+  kernel_def->set_kernel_name("DvppResize");
+  kernel_def->set_args(args.data(), 100);
+  kernel_def->set_args_size(100);
+
+  domi::KernelContext *ctx = kernel_def->mutable_context();
+  ctx->set_kernel_type(6);
+  ctx->set_op_index(0);
+
+  // ModelUtils::GetInputDataAddrs  -> ok
+  // ModelUtils::GetOutputDataAddrs -> ok
+  // rtMalloc -> RT_ERROR_INVALID_VALUE
+  // rtMemcpy -> RT_ERROR_NONE
+  EXPECT_EQ(kernel_task_info.Init(task_def, &model), SUCCESS);
+
+  EXPECT_EQ(kernel_task_info.Distribute(), SUCCESS);
+  EXPECT_EQ(kernel_task_info.Release(), SUCCESS);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+// test StoreInputOutputTensor failed
+TEST_F(UtestKernelTaskInfo, store_input_output_tensor_fail) {
+  std::vector<void *> input_data_addrs;
+  std::vector<void *> output_data_addrs;
+  std::vector<::tagCcAICPUTensor> input_descs;
+  std::vector<::tagCcAICPUTensor> output_descs;
+
+  KernelTaskInfo kernel_task_info;
+  // rtMalloc -> RT_ERROR_INVALID_VALUE
+  EXPECT_EQ(kernel_task_info.StoreInputOutputTensor(input_data_addrs, output_data_addrs, input_descs, output_descs), SUCCESS);
+}
+
+
+TEST_F(UtestKernelTaskInfo, store_input_output_tensor_fail2) {
+  std::vector<void *> input_data_addrs;
+  std::vector<void *> output_data_addrs;
+  std::vector<::tagCcAICPUTensor> input_descs;
+  std::vector<::tagCcAICPUTensor> output_descs;
+
+  KernelTaskInfo kernel_task_info;
+  // rtMalloc -> RT_ERROR_INVALID_VALUE
+  EXPECT_EQ(kernel_task_info.StoreInputOutputTensor(input_data_addrs, output_data_addrs, input_descs, output_descs), SUCCESS);
+}
+
+// test InitCceTask success
+TEST_F(UtestKernelTaskInfo, kernel_task_info_init_cce_task) {
+  DavinciModel model(0, nullptr);
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("FrameworkOp", "FrameworkOp");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  kernel_task_info.davinci_model_ = &model;
+
+  kernel_def->set_flowtable("InitCceTask");
+  domi::KernelContext *context = kernel_def->mutable_context();
+  context->set_is_flowtable(true);
+
+  rtSmDesc_t l2CtrlInfo;
+  l2CtrlInfo.data[0].L2_mirror_addr = 1024;
+  kernel_def->set_sm_desc(&l2CtrlInfo, sizeof(rtSmDesc_t));
+
+  model.runtime_param_.logic_mem_base = 0;
+  model.runtime_param_.mem_size = 0;
+  model.runtime_param_.logic_weight_base = 0;
+  model.runtime_param_.weight_size = 0;
+  model.runtime_param_.logic_var_base = 0;
+  model.runtime_param_.var_size = 0;
+
+  // KernelTaskInfo::UpdateCceArgs -> SUCCESS
+  // KernelTaskInfo::UpdateCceArgs -> SUCCESS
+  // rtMalloc -> RT_ERROR_NONE
+  // rtMemcpy -> RT_ERROR_NONE
+  // rtMemAllocManaged  -> RT_ERROR_NONE
+  EXPECT_EQ(kernel_task_info.InitCceTask(*kernel_def), INTERNAL_ERROR);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+TEST_F(UtestKernelTaskInfo, kernel_taskInfo_init_cce_task_failed1) {
+  DavinciModel model(0, nullptr);
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  EXPECT_EQ(kernel_task_info.InitCceTask(*kernel_def), INTERNAL_ERROR);
+
+  task_def.clear_kernel();
+}
+
+TEST_F(UtestKernelTaskInfo, kernel_taskInfo_init_cce_task_failed2) {
+  DavinciModel model(0, nullptr);
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("", "");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  // KernelTaskInfo::SetContext  -> SUCCESS
+
+  domi::KernelContext *context = kernel_def->mutable_context();
+  context->set_is_flowtable(true);
+
+  EXPECT_EQ(kernel_task_info.InitCceTask(*kernel_def), INTERNAL_ERROR);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+TEST_F(UtestKernelTaskInfo, kernel_taskInfo_init_cce_task_failed3) {
+  DavinciModel model(0, nullptr);
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("", "");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  // KernelTaskInfo::SetContext  -> SUCCESS
+
+  kernel_def->set_flowtable("InitCceTask");
+  domi::KernelContext *context = kernel_def->mutable_context();
+  context->set_is_flowtable(true);
+
+  // KernelTaskInfo::UpdateCceArgs  -> CCE_FAILED
+  EXPECT_EQ(kernel_task_info.InitCceTask(*kernel_def), INTERNAL_ERROR);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+TEST_F(UtestKernelTaskInfo, kernel_taskInfo_init_cce_task_failed4) {
+  DavinciModel model(0, nullptr);
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("", "");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  // KernelTaskInfo::SetContext  -> SUCCESS
+
+  kernel_def->set_flowtable("InitCceTask");
+  domi::KernelContext *context = kernel_def->mutable_context();
+  context->set_is_flowtable(true);
+
+  // KernelTaskInfo::UpdateCceArgs  -> SUCCESS
+  // KernelTaskInfo::SetFlowtable  -> RT_FAILED
+  EXPECT_EQ(kernel_task_info.InitCceTask(*kernel_def), INTERNAL_ERROR);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+TEST_F(UtestKernelTaskInfo, kernel_taskInfo_init_cce_task_failed5) {
+  DavinciModel model(0, nullptr);
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("", "");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  // KernelTaskInfo::SetContext  -> SUCCESS
+
+  kernel_def->set_flowtable("InitCceTask");
+  domi::KernelContext *context = kernel_def->mutable_context();
+  context->set_is_flowtable(true);
+
+  // KernelTaskInfo::UpdateCceArgs  -> SUCCESS
+  // KernelTaskInfo::SetFlowtable  -> SUCCESS
+  // rtMalloc  -> RT_ERROR_INVALID_VALUE
+  EXPECT_EQ(kernel_task_info.InitCceTask(*kernel_def), INTERNAL_ERROR);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+TEST_F(UtestKernelTaskInfo, kernel_taskInfo_init_cce_task_failed6) {
+  DavinciModel model(0, nullptr);
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("", "");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  // KernelTaskInfo::SetContext  -> SUCCESS
+
+  kernel_def->set_flowtable("InitCceTask");
+  domi::KernelContext *context = kernel_def->mutable_context();
+  context->set_is_flowtable(true);
+
+  // KernelTaskInfo::UpdateCceArgs  -> SUCCESS
+  // KernelTaskInfo::SetFlowtable  -> SUCCESS
+  // rtMalloc  -> RT_ERROR_NONE
+  // rtMemcpy  -> RT_ERROR_INVALID_VALUE
+  EXPECT_EQ(kernel_task_info.InitCceTask(*kernel_def), INTERNAL_ERROR);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+TEST_F(UtestKernelTaskInfo, kernel_taskInfo_init_cce_task_failed7) {
+  DavinciModel model(0, nullptr);
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("", "");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  // KernelTaskInfo::SetContext  -> SUCCESS
+
+  kernel_def->set_flowtable("InitCceTask");
+  domi::KernelContext *context = kernel_def->mutable_context();
+  context->set_is_flowtable(true);
+
+  // KernelTaskInfo::UpdateCceArgs  -> SUCCESS
+  // KernelTaskInfo::SetFlowtable  -> SUCCESS
+  rtSmDesc_t l2CtrlInfo;
+  l2CtrlInfo.data[0].L2_mirror_addr = 1024;
+  kernel_def->set_sm_desc(&l2CtrlInfo, sizeof(rtSmDesc_t));
+
+  // rtMalloc  -> RT_ERROR_NONE
+  // rtMemcpy  -> RT_ERROR_NONE
+  // rtMemAllocManaged -> RT_ERROR_INVALID_VALUE
+  EXPECT_EQ(kernel_task_info.InitCceTask(*kernel_def), INTERNAL_ERROR);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+// test SetContext success
+TEST_F(UtestKernelTaskInfo, success_kernel_taskInfo_init_set_context) {
+  DavinciModel model(0, nullptr);
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *context = kernel_def->mutable_context();
+  context->set_op_id(1);
+  context->set_kernel_func_id(1);
+  context->set_is_flowtable(true);
+  context->set_args_count(1);
+  context->set_args_offset("args111111", 10);
+
+  EXPECT_EQ(kernel_task_info.SetContext(*kernel_def), SUCCESS);
+
+  EXPECT_EQ(kernel_task_info.Release(), SUCCESS);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+// test SetContext failed
+TEST_F(UtestKernelTaskInfo, kernel_taskInfo_init_set_context_failed1) {
+  DavinciModel model(0, nullptr);
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *context = kernel_def->mutable_context();
+  context->set_op_id(1);
+  context->set_kernel_func_id(1);
+  context->set_is_flowtable(true);
+  context->set_args_count(0);
+
+  EXPECT_EQ(kernel_task_info.SetContext(*kernel_def), INTERNAL_ERROR);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+TEST_F(UtestKernelTaskInfo, kernel_taskInfo_init_set_context_failed2) {
+  DavinciModel model(0, nullptr);
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *context = kernel_def->mutable_context();
+  context->set_op_id(1);
+  context->set_kernel_func_id(1);
+  context->set_is_flowtable(true);
+  context->set_args_count(5);
+  context->set_args_offset("\0\0");  // args_offset = 0
+
+  EXPECT_EQ(kernel_task_info.SetContext(*kernel_def), PARAM_INVALID);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+// test UpdateCceArgs success
+TEST_F(UtestKernelTaskInfo, kernel_task_info_update_cce_args) {
+  DavinciModel model(0, nullptr);
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("FrameworkOp", "FrameworkOp");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *context = kernel_def->mutable_context();
+
+  string flowtable("InitCceTask");
+  string sm_desc("args");
+
+  uint8_t test = 2;
+  model.mem_base_ = &test;
+  model.runtime_param_.logic_mem_base = 0;
+
+  model.weights_mem_base_ = &test;
+  model.runtime_param_.logic_weight_base = 0;
+
+  uint8_t test1 = 16;
+  model.var_mem_base_ = &test1;
+  model.runtime_param_.logic_var_base = 0;
+
+  context->set_is_flowtable(true);
+  // KernelTaskInfo::CceUpdateKernelArgs ->SUCCESS
+  EXPECT_EQ(kernel_task_info.UpdateCceArgs(sm_desc, flowtable, *kernel_def), FAILED);
+
+
+  context->clear_is_flowtable();
+  context->set_is_flowtable(false);
+  // KernelTaskInfo::CceUpdateKernelArgs ->SUCCESS
+  EXPECT_EQ(kernel_task_info.UpdateCceArgs(sm_desc, flowtable, *kernel_def), FAILED);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+
+  model.mem_base_ = nullptr;
+  model.weights_mem_base_ = nullptr;
+  model.var_mem_base_ = nullptr;
+}
+
+TEST_F(UtestKernelTaskInfo, kernel_task_info_update_cce_args_failed1) {
+  DavinciModel model(0, nullptr);
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("FrameworkOp", "FrameworkOp");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *context = kernel_def->mutable_context();
+
+  string flowtable("InitCceTask");
+  string sm_desc("args");
+
+  uint8_t test = 2;
+  model.mem_base_ = &test;
+  model.runtime_param_.logic_mem_base = 0;
+
+  uint8_t test1 = 10;
+  model.weights_mem_base_ = &test1;
+  model.runtime_param_.logic_weight_base = 0;
+
+  model.var_mem_base_ = &test1;
+  model.runtime_param_.logic_var_base = 0;
+
+  context->set_is_flowtable(true);
+  // KernelTaskInfo::CceUpdateKernelArgs -> FAILED
+  EXPECT_EQ(kernel_task_info.UpdateCceArgs(sm_desc, flowtable, *kernel_def), FAILED);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+
+  model.mem_base_ = nullptr;
+  model.weights_mem_base_ = nullptr;
+  model.var_mem_base_ = nullptr;
+}
+
+// test SetFlowtable
+TEST_F(UtestKernelTaskInfo, kernel_task_info_set_flowtable) {
+  DavinciModel model(0, nullptr);
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("FrameworkOp", "FrameworkOp");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *context = kernel_def->mutable_context();
+
+  string flowtable("InitCceTask");
+  context->set_is_flowtable(false);
+  EXPECT_EQ(kernel_task_info.SetFlowtable(flowtable, *kernel_def), SUCCESS);
+
+
+  context->clear_is_flowtable();
+  context->set_is_flowtable(true);
+  // rtMalloc ->RT_ERROR_NONE
+  // rtMemcpy ->RT_ERROR_NONE
+  kernel_def->set_args("args111111", 10);
+  context->set_args_offset("\0\0");
+  EXPECT_EQ(kernel_task_info.SetFlowtable(flowtable, *kernel_def), SUCCESS);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+TEST_F(UtestKernelTaskInfo, kernel_task_info_set_flowtable_failed1) {
+  DavinciModel model(0, nullptr);
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("FrameworkOp", "FrameworkOp");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *context = kernel_def->mutable_context();
+
+  string flowtable("SetFlowtable");
+  context->set_is_flowtable(true);
+
+  // rtMalloc -> RT_ERROR_INVALID_VALUE
+  EXPECT_EQ(kernel_task_info.SetFlowtable(flowtable, *kernel_def), FAILED);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+TEST_F(UtestKernelTaskInfo, kernel_task_info_set_flowtable_failed2) {
+  DavinciModel model(0, nullptr);
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("FrameworkOp", "FrameworkOp");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *context = kernel_def->mutable_context();
+
+  string flowtable("SetFlowtable");
+  context->set_is_flowtable(true);
+  // rtMalloc ->RT_ERROR_NONE
+  // rtMemcpy ->RT_ERROR_INVALID_VALUE
+  EXPECT_EQ(kernel_task_info.SetFlowtable(flowtable, *kernel_def), FAILED);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+TEST_F(UtestKernelTaskInfo, kernel_task_info_set_flowtable_failed3) {
+  DavinciModel model(0, nullptr);
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_ = { stream };
+  model.op_list_[0] = CreateOpDesc("FrameworkOp", "FrameworkOp");
+
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *context = kernel_def->mutable_context();
+
+  string flowtable("SetFlowtable");
+  context->set_is_flowtable(true);
+  // rtMalloc ->RT_ERROR_NONE
+  // rtMemcpy ->RT_ERROR_NONE
+  kernel_def->set_args("args", 4);
+  context->set_args_offset("args111111", 10);
+  EXPECT_EQ(kernel_task_info.SetFlowtable(flowtable, *kernel_def), FAILED);
+
+  kernel_def->clear_context();
+  task_def.clear_kernel();
+}
+
+TEST_F(UtestKernelTaskInfo, distribute_failed) {
+  KernelTaskInfo kernel_task_info;
+  DavinciModel model(0, nullptr);
+
+  domi::TaskDef task_def;
+
+  // Failed for SetStream
+  EXPECT_EQ(kernel_task_info.Init(task_def, &model), FAILED);
+
+  // rtKernelLaunchWithFlag -> RT_ERROR_INVALID_VALUE
+  EXPECT_EQ(kernel_task_info.Distribute(), SUCCESS);
+}
+
+TEST_F(UtestKernelTaskInfo, distribute_success) {
+  KernelTaskInfo kernel_task_info;
+  DavinciModel model(0, nullptr);
+  model.op_list_[0] = CreateOpDesc("FrameworkOp", "FrameworkOp");
+
+  domi::TaskDef task_def;
+  // rtModelGetTaskId -> RT_ERROR_INVALID_VALUE
+  rtModel_t rt_model_handle = (rtModel_t *)0x12345678;
+  model.rt_model_handle_ = rt_model_handle;
+
+  // Failed for SetStream
+  EXPECT_EQ(kernel_task_info.Init(task_def, &model), FAILED);
+
+  // rtKernelLaunchWithFlag -> RT_ERROR_INVALID_VALUE
+  EXPECT_EQ(kernel_task_info.Distribute(), SUCCESS);
+  model.rt_model_handle_ = nullptr;
+}
+
+// test success DistributeDumpTask
+TEST_F(UtestKernelTaskInfo, success_distribute_dump_task) {
+  DavinciModel model(0, nullptr);
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+
+  kernel_def->set_stub_func("kerneltaskinfo");
+  kernel_def->set_block_dim(10);
+  kernel_def->set_args("args111111", 10);
+  kernel_def->set_args_size(10);
+  rtSmDesc_t l2CtrlInfo;
+  l2CtrlInfo.data[0].L2_mirror_addr = 1024;
+  kernel_def->set_sm_desc((void *)&l2CtrlInfo, sizeof(rtSmDesc_t));
+
+  // for SetStream
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  std::vector<rtStream_t> stream_list = { stream };
+  EXPECT_EQ(kernel_task_info.SetStream(0, stream_list), SUCCESS);
+
+  EXPECT_EQ(kernel_task_info.Release(), SUCCESS);
+
+  rtStreamDestroy(stream);
+  task_def.clear_kernel();
+}
+
+// test success GetTaskID
+TEST_F(UtestKernelTaskInfo, success_get_task_id) {
+  domi::ModelTaskDef model_task_def;
+  domi::TaskDef *task = model_task_def.add_task();
+  task->set_type(RT_MODEL_TASK_KERNEL);
+  TaskInfoPtr task_info = TaskInfoFactory::Instance().Create(static_cast<rtModelTaskType_t>(task->type()));
+
+  EXPECT_EQ(task_info->GetTaskID(), 0);
+
+  KernelTaskInfo kernel_task_info;
+  EXPECT_EQ(kernel_task_info.GetTaskID(), 0);
+
+  HcclTaskInfo hccl_task_info;
+  EXPECT_EQ(hccl_task_info.GetTaskID(), 0);
+}
+
+// test StoreInputOutputTensor success
+TEST_F(UtestKernelTaskInfo, success_store_input_output_tensor) {
+  DavinciModel model(0, nullptr);
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+
+  std::vector<void *> input_data_addrs;
+  std::vector<void *> output_data_addrs;
+  std::vector<::tagCcAICPUTensor> input_descs;
+  std::vector<::tagCcAICPUTensor> output_descs;
+
+  int test = 1;
+  int *addr = &test;
+  void *input = addr;
+  void *output = addr;
+  input_data_addrs.push_back(input);
+  output_data_addrs.push_back(output);
+
+  tagCcAICPUTensor input_desc;
+  tagCcAICPUTensor output_desc;
+  input_descs.push_back(input_desc);
+  output_descs.push_back(output_desc);
+
+  EXPECT_EQ(kernel_task_info.StoreInputOutputTensor(input_data_addrs, output_data_addrs, input_descs, output_descs), SUCCESS);
+
+  EXPECT_EQ(kernel_task_info.Release(), SUCCESS);
+}
+
+// test KernelTaskInfo release fail
+TEST_F(UtestKernelTaskInfo, fail_release) {
+  DavinciModel model(0, nullptr);
+  domi::TaskDef task_def;
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+
+  std::vector<void *> input_data_addrs;
+  std::vector<void *> output_data_addrs;
+  std::vector<::tagCcAICPUTensor> input_descs;
+  std::vector<::tagCcAICPUTensor> output_descs;
+
+  int test = 1;
+  int *addr = &test;
+  void *input = addr;
+  void *output = addr;
+  input_data_addrs.push_back(input);
+  output_data_addrs.push_back(output);
+
+  tagCcAICPUTensor input_desc;
+  tagCcAICPUTensor output_desc;
+  input_descs.push_back(input_desc);
+  output_descs.push_back(output_desc);
+
+  EXPECT_EQ(kernel_task_info.StoreInputOutputTensor(input_data_addrs, output_data_addrs, input_descs, output_descs), SUCCESS);
+
+  // rtMemFreeManaged -> RT_ERROR_INVALID_VALUE
+  EXPECT_EQ(kernel_task_info.Release(), SUCCESS);
+}
+
+// test KernelTaskInfo release fail
+TEST_F(UtestKernelTaskInfo, update_l2data_success) {
+  DavinciModel model(0, nullptr);
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+  domi::KernelDef kernel_def;
+
+  EXPECT_EQ(kernel_task_info.UpdateL2Data(kernel_def), SUCCESS);
+}
+
+// test fusion_end_task Init
+TEST_F(UtestKernelTaskInfo, kernel_task_info_init_success) {
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+
+  DavinciModel model(0, nullptr);
+  auto model_def = MakeShared<domi::ModelTaskDef>();
+
+  model.model_id_ = 1;
+  model.name_ = "test";
+  model.version_ = 0x01;
+
+  model.stream_list_ = { stream };
+  model.ge_model_ = MakeShared<GeModel>();
+  model.ge_model_->SetModelTaskDef(model_def);
+
+  auto op_desc = CreateOpDesc("data", DATA);
+  op_desc->SetInputOffset({1});
+  op_desc->SetOutputOffset({100});
+
+  GeTensorDesc descin(GeShape({1, 1, 1, 1}), FORMAT_NCHW, DT_FLOAT);
+  TensorUtils::SetSize(descin, 4);
+  op_desc->AddInputDesc(descin);
+  GeTensorDesc descout(GeShape({1, 1, 1, 1}), FORMAT_NCHW, DT_FLOAT16);
+  TensorUtils::SetSize(descout, 32);
+  op_desc->AddOutputDesc(descout);
+  op_desc->SetId(0);
+
+  model.op_list_[0] = op_desc;
+
+  domi::TaskDef task_def;
+  task_def.set_stream_id(0);
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *ctx = kernel_def->mutable_context();
+  ctx->set_op_index(0);
+  vector<string> original_op_names = { "conv", "add" };
+  AttrUtils::GetListStr(op_desc, ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES, original_op_names);
+
+  KernelTaskInfo kernel_task_info;
+  EXPECT_EQ(kernel_task_info.Init(task_def, &model), FAILED);
+}
+
+TEST_F(UtestKernelTaskInfo, kernel_task_info_calculate_args_te) {
+  DavinciModel model(0, nullptr);
+  domi::TaskDef task_def;
+
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *ctx = kernel_def->mutable_context();
+  ctx->set_kernel_type(2);
+
+  KernelTaskInfo kernel_task_info;
+  EXPECT_EQ(kernel_task_info.CalculateArgs(task_def, &model), SUCCESS);
+}
+
+TEST_F(UtestKernelTaskInfo, kernel_task_info_calculate_args_aicpu) {
+  DavinciModel model(0, nullptr);
+  domi::TaskDef task_def;
+
+  domi::KernelDef *kernel_def = task_def.mutable_kernel();
+  domi::KernelContext *ctx = kernel_def->mutable_context();
+  ctx->set_kernel_type(6);
+
+  KernelTaskInfo kernel_task_info;
+  EXPECT_EQ(kernel_task_info.CalculateArgs(task_def, &model), SUCCESS);
+}
+
+TEST_F(UtestKernelTaskInfo, kernel_task_info_update_args_te) {
+  DavinciModel model(0, nullptr);
+
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.kernel_type_ = ccKernelType::TE;
+  kernel_task_info.davinci_model_ = &model;
+  EXPECT_EQ(kernel_task_info.UpdateArgs(), SUCCESS);
+}
+
+TEST_F(UtestKernelTaskInfo, kernel_task_info_update_args_aicpu) {
+  DavinciModel model(0, nullptr);
+
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.kernel_type_ = ccKernelType::TE;
+  kernel_task_info.davinci_model_ = &model;
+  kernel_task_info.args_size_ = 120;
+  kernel_task_info.args_addr = std::unique_ptr<uint8_t[]>(new (std::nothrow) uint8_t[kernel_task_info.args_size_]);
+  kernel_task_info.io_addrs_ = { (void*)0x12345678, (void*)0x22345678 };
+  rtMalloc(&kernel_task_info.args_, kernel_task_info.args_size_, RT_MEMORY_HBM);
+
+  EXPECT_EQ(kernel_task_info.UpdateArgs(), SUCCESS);
+}
+
+
+TEST_F(UtestKernelTaskInfo, kernel_task_info_super_kernel_info) {
+  DavinciModel model(0, nullptr);
+
+  KernelTaskInfo kernel_task_info;
+  kernel_task_info.davinci_model_ = &model;
+
+  EXPECT_EQ(kernel_task_info.SaveSuperKernelInfo(), SUCCESS);
+
+  kernel_task_info.UpdateSKTTaskId();
+
+  EXPECT_EQ(kernel_task_info.SKTFinalize(), SUCCESS);
+}
+
+}  // namespace ge
diff --git a/tests/ut/ge/graph/load/memcpy_addr_async_task_info_unittest.cc b/tests/ut/ge/graph/load/memcpy_addr_async_task_info_unittest.cc
new file mode 100644
index 00000000..9348d49e
--- /dev/null
+++ b/tests/ut/ge/graph/load/memcpy_addr_async_task_info_unittest.cc
@@ -0,0 +1,138 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#define private public
+#define protected public
+
+#include "graph/load/new_model_manager/davinci_model.h"
+#include "graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.h"
+
+namespace ge {
+class UtestMemcpyAddrAsyncTaskInfo : public testing::Test {
+ protected:
+  void SetUp() {}
+
+  void TearDown() {}
+};
+
+extern OpDescPtr CreateOpDesc(string name, string type);
+
+TEST_F(UtestMemcpyAddrAsyncTaskInfo, success_memcpy_addr_async_task_init) {
+  DavinciModel model(0, nullptr);
+  domi::TaskDef task_def;
+  task_def.set_stream_id(0);
+
+  domi::MemcpyAsyncDef *memcpy_async = task_def.mutable_memcpy_async();
+  memcpy_async->set_dst(10);
+  memcpy_async->set_dst_max(512);
+  memcpy_async->set_src(10);
+  memcpy_async->set_count(1);
+  memcpy_async->set_kind(RT_MEMCPY_ADDR_DEVICE_TO_DEVICE);
+  memcpy_async->set_op_index(6);
+
+  model.runtime_param_.logic_mem_base = 0x8003000;
+  model.runtime_param_.logic_weight_base = 0x8008000;
+  model.runtime_param_.logic_var_base = 0x800e000;
+  model.runtime_param_.mem_size = 0x5000;
+  model.runtime_param_.weight_size = 0x6000;
+  model.runtime_param_.var_size = 0x1000;
+
+  // DavinciModel is null
+  MemcpyAddrAsyncTaskInfo memcpy_addr_async_task_info;
+  EXPECT_EQ(memcpy_addr_async_task_info.Init(task_def, nullptr), PARAM_INVALID);
+
+  // SetStream failed.
+  EXPECT_EQ(memcpy_addr_async_task_info.Init(task_def, &model), FAILED);
+
+  // GetOpByIndex src failed
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_.push_back(stream);
+  EXPECT_EQ(memcpy_addr_async_task_info.Init(task_def, &model), INTERNAL_ERROR);
+
+  // GetRuntimeAddress src failed.
+  model.op_list_[6] = CreateOpDesc("memcpyaddrasync", MEMCPYADDRASYNC);
+  EXPECT_EQ(memcpy_addr_async_task_info.Init(task_def, &model), PARAM_INVALID);
+
+  // GetRuntimeAddress dst failed.
+  memcpy_async->set_src(0x08003000);
+  EXPECT_EQ(memcpy_addr_async_task_info.Init(task_def, &model), PARAM_INVALID);
+
+  memcpy_async->set_dst(0x08008000);
+  EXPECT_EQ(memcpy_addr_async_task_info.Init(task_def, &model), SUCCESS);
+
+  task_def.clear_memcpy_async();
+}
+
+TEST_F(UtestMemcpyAddrAsyncTaskInfo, success_memcpy_async_task_init_failed) {
+  DavinciModel model(0, nullptr);
+  domi::TaskDef task_def;
+
+  task_def.set_stream_id(0);
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_.push_back(stream);
+
+  domi::MemcpyAsyncDef *memcpy_async = task_def.mutable_memcpy_async();
+  memcpy_async->set_dst(10);
+  memcpy_async->set_dst_max(512);
+  memcpy_async->set_src(10);
+  memcpy_async->set_count(1);
+  memcpy_async->set_kind(RT_MEMCPY_ADDR_DEVICE_TO_DEVICE);
+  memcpy_async->set_op_index(6);
+
+  model.runtime_param_.logic_mem_base = 0x8003000;
+  model.runtime_param_.logic_weight_base = 0x8008000;
+  model.runtime_param_.logic_var_base = 0x800e000;
+  model.runtime_param_.mem_size = 0x5000;
+  model.runtime_param_.weight_size = 0x6000;
+  model.runtime_param_.var_size = 0x1000;
+
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  model.op_list_[6] = CreateOpDesc("memcpyasync", MEMCPYADDRASYNC);
+  model.op_list_[6]->AddInputDesc(tensor);
+  model.op_list_[6]->AddOutputDesc(tensor);
+  model.op_list_[6]->SetInputOffset({1024});
+  model.op_list_[6]->SetOutputOffset({5120});
+
+  // DavinciModel is null
+  MemcpyAddrAsyncTaskInfo memcpy_addr_async_task_info;
+  EXPECT_EQ(memcpy_addr_async_task_info.Init(task_def, &model), PARAM_INVALID);
+
+  task_def.clear_memcpy_async();
+}
+
+TEST_F(UtestMemcpyAddrAsyncTaskInfo, success_memcpy_async_calculate_args) {
+  DavinciModel model(0, nullptr);
+  domi::TaskDef task_def;
+
+  domi::MemcpyAsyncDef *memcpy_async = task_def.mutable_memcpy_async();
+  memcpy_async->set_dst(0x08003000);
+  memcpy_async->set_dst_max(512);
+  memcpy_async->set_src(0x08008000);
+  memcpy_async->set_count(1);
+  memcpy_async->set_kind(RT_MEMCPY_DEVICE_TO_DEVICE);
+  memcpy_async->set_op_index(0);
+
+  // DavinciModel is null
+  MemcpyAddrAsyncTaskInfo memcpy_addr_async_task_info;
+  EXPECT_EQ(memcpy_addr_async_task_info.CalculateArgs(task_def, &model), SUCCESS);
+}
+
+}  // namespace ge
diff --git a/tests/ut/ge/graph/load/memcpy_async_task_info_unittest.cc b/tests/ut/ge/graph/load/memcpy_async_task_info_unittest.cc
new file mode 100644
index 00000000..8769ec39
--- /dev/null
+++ b/tests/ut/ge/graph/load/memcpy_async_task_info_unittest.cc
@@ -0,0 +1,273 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#define private public
+#define protected public
+
+#include "graph/load/new_model_manager/davinci_model.h"
+#include "graph/load/new_model_manager/task_info/memcpy_async_task_info.h"
+
+
+namespace ge {
+class UtestMemcpyAsyncTaskInfo : public testing::Test {
+ protected:
+  void SetUp() {}
+
+  void TearDown() {}
+};
+
+OpDescPtr CreateOpDesc(string name = "", string type = "") {
+  auto op_desc = std::make_shared<OpDesc>(name, type);
+  op_desc->SetStreamId(0);
+  op_desc->SetId(0);
+
+  AttrUtils::SetFloat(op_desc, ATTR_NAME_ALPHA, 0);
+  AttrUtils::SetFloat(op_desc, ATTR_NAME_BETA, 0);
+
+  op_desc->SetWorkspace({});
+  op_desc->SetWorkspaceBytes({});
+  op_desc->SetInputOffset({});
+  op_desc->SetOutputOffset({});
+
+  AttrUtils::SetListStr(op_desc, ATTR_NAME_WEIGHT_NAME, {});
+  AttrUtils::SetInt(op_desc, POOLING_ATTR_MODE, 0);
+  AttrUtils::SetInt(op_desc, POOLING_ATTR_PAD_MODE, 0);
+  AttrUtils::SetInt(op_desc, POOLING_ATTR_DATA_MODE, 0);
+  AttrUtils::SetInt(op_desc, POOLING_ATTR_CEIL_MODE, 0);
+  AttrUtils::SetInt(op_desc, POOLING_ATTR_NAN_OPT, 0);
+  AttrUtils::SetListInt(op_desc, POOLING_ATTR_WINDOW, {});
+  AttrUtils::SetListInt(op_desc, POOLING_ATTR_PAD, {});
+  AttrUtils::SetListInt(op_desc, POOLING_ATTR_STRIDE, {});
+  AttrUtils::SetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, {1, 1});
+  AttrUtils::SetInt(op_desc, ATTR_NAME_STREAM_SWITCH_COND, 0);
+  return op_desc;
+}
+
+TEST_F(UtestMemcpyAsyncTaskInfo, success_memcpy_async_task_init) {
+  DavinciModel model(0, nullptr);
+  domi::TaskDef task_def;
+  task_def.set_stream_id(0);
+
+  domi::MemcpyAsyncDef *memcpy_async = task_def.mutable_memcpy_async();
+  memcpy_async->set_dst(10);
+  memcpy_async->set_dst_max(512);
+  memcpy_async->set_src(10);
+  memcpy_async->set_count(1);
+  memcpy_async->set_kind(RT_MEMCPY_DEVICE_TO_DEVICE);
+  memcpy_async->set_op_index(6);
+
+  model.runtime_param_.logic_mem_base = 0x8003000;
+  model.runtime_param_.logic_weight_base = 0x8008000;
+  model.runtime_param_.logic_var_base = 0x800e000;
+  model.runtime_param_.mem_size = 0x5000;
+  model.runtime_param_.weight_size = 0x6000;
+  model.runtime_param_.var_size = 0x1000;
+
+  MemcpyAsyncTaskInfo memcpy_async_task_info;
+
+  // GetOpByIndex src failed
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_.push_back(stream);
+  EXPECT_EQ(memcpy_async_task_info.Init(task_def, &model), INTERNAL_ERROR);
+
+  model.op_list_[6] = CreateOpDesc("memcpyasync", MEMCPYASYNC);
+  memcpy_async->set_src(0x08008000);
+  EXPECT_EQ(memcpy_async_task_info.Init(task_def, &model), PARAM_INVALID);
+
+  // set OpDesc attr
+  std::vector<int64_t> memory_type = { RT_MEMORY_TS_4G };
+  AttrUtils::SetListInt(model.op_list_[6], ATTR_NAME_OUTPUT_MEM_TYPE_LIST, memory_type);
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  model.op_list_[6]->AddInputDesc(tensor);
+  model.op_list_[6]->AddOutputDesc(tensor);
+  memcpy_async->set_dst_max(0);
+  EXPECT_EQ(memcpy_async_task_info.Init(task_def, &model), FAILED);
+
+  memcpy_async->set_dst_max(0);
+  model.op_list_[6]->SetInputOffset({1024});
+  model.op_list_[6]->SetOutputOffset({5120});
+  EXPECT_EQ(memcpy_async_task_info.Init(task_def, &model), FAILED);
+
+
+  task_def.clear_memcpy_async();
+}
+
+TEST_F(UtestMemcpyAsyncTaskInfo, success_memcpy_async_task_init_failed) {
+  DavinciModel model(0, nullptr);
+  domi::TaskDef task_def;
+  task_def.set_stream_id(0);
+
+  domi::MemcpyAsyncDef *memcpy_async = task_def.mutable_memcpy_async();
+  memcpy_async->set_dst(10);
+  memcpy_async->set_dst_max(512);
+  memcpy_async->set_src(10);
+  memcpy_async->set_count(1);
+  memcpy_async->set_kind(RT_MEMCPY_DEVICE_TO_DEVICE);
+  memcpy_async->set_op_index(6);
+
+  model.runtime_param_.logic_mem_base = 0x8003000;
+  model.runtime_param_.logic_weight_base = 0x8008000;
+  model.runtime_param_.logic_var_base = 0x800e000;
+  model.runtime_param_.mem_size = 0x5000;
+  model.runtime_param_.weight_size = 0x6000;
+  model.runtime_param_.var_size = 0x1000;
+
+
+  // DavinciModel is null
+  MemcpyAsyncTaskInfo memcpy_async_task_info;
+  EXPECT_EQ(memcpy_async_task_info.Init(task_def, nullptr), PARAM_INVALID);
+
+  // SetStream failed
+  EXPECT_EQ(memcpy_async_task_info.Init(task_def, nullptr), PARAM_INVALID);
+
+  // GetOpByIndex failed
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_.push_back(stream);
+  EXPECT_EQ(memcpy_async_task_info.Init(task_def, &model), INTERNAL_ERROR);
+
+  model.op_list_[6] = CreateOpDesc("memcpyasync", MEMCPYASYNC);
+  EXPECT_EQ(memcpy_async_task_info.Init(task_def, &model), PARAM_INVALID);
+  memcpy_async->set_src(0x08008000);
+
+  EXPECT_EQ(memcpy_async_task_info.Init(task_def, &model), PARAM_INVALID);
+  memcpy_async->set_dst(0x08003000);
+
+  // set OpDesc attr
+  std::vector<int64_t> memory_type = { RT_MEMORY_TS_4G };
+  AttrUtils::SetListInt(model.op_list_[6], ATTR_NAME_OUTPUT_MEM_TYPE_LIST, memory_type);
+  memcpy_async->set_dst_max(0);
+  EXPECT_EQ(memcpy_async_task_info.Init(task_def, nullptr), PARAM_INVALID);
+  memcpy_async->set_dst_max(512);
+
+
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  model.op_list_[6]->AddInputDesc(tensor);
+  model.op_list_[6]->AddOutputDesc(tensor);
+  model.op_list_[6]->SetInputOffset({1024});
+  model.op_list_[6]->SetOutputOffset({5120});
+  EXPECT_EQ(memcpy_async_task_info.Init(task_def, &model), SUCCESS);
+
+  memcpy_async->set_dst(0x08009000);
+  EXPECT_EQ(memcpy_async_task_info.Init(task_def, &model), SUCCESS);
+
+  task_def.clear_memcpy_async();
+}
+
+TEST_F(UtestMemcpyAsyncTaskInfo, success_memcpy_async_task_distribute) {
+  DavinciModel model(0, nullptr);
+  model.SetKnownNode(true);
+  domi::TaskDef task_def;
+  task_def.set_stream_id(0);
+
+  domi::MemcpyAsyncDef *memcpy_async = task_def.mutable_memcpy_async();
+  memcpy_async->set_dst(10);
+  memcpy_async->set_dst_max(512);
+  memcpy_async->set_src(10);
+  memcpy_async->set_count(1);
+  memcpy_async->set_kind(RT_MEMCPY_DEVICE_TO_DEVICE);
+  memcpy_async->set_op_index(6);
+
+  model.runtime_param_.logic_mem_base = 0x8003000;
+  model.runtime_param_.logic_weight_base = 0x8008000;
+  model.runtime_param_.logic_var_base = 0x800e000;
+  model.runtime_param_.mem_size = 0x5000;
+  model.runtime_param_.weight_size = 0x6000;
+  model.runtime_param_.var_size = 0x1000;
+
+  MemcpyAsyncTaskInfo memcpy_async_task_info;
+
+  // GetOpByIndex src failed
+  rtStream_t stream = nullptr;
+  rtStreamCreate(&stream, 0);
+  model.stream_list_.push_back(stream);
+  EXPECT_EQ(memcpy_async_task_info.Init(task_def, &model), INTERNAL_ERROR);
+
+  model.op_list_[6] = CreateOpDesc("memcpyasync", MEMCPYASYNC);
+  memcpy_async->set_src(0x08008000);
+  EXPECT_EQ(memcpy_async_task_info.Init(task_def, &model), PARAM_INVALID);
+
+  // set OpDesc attr
+  AttrUtils::SetStr(model.op_list_[6], ATTR_DYNAMIC_SHAPE_FIXED_ADDR, "Hello Mr Tree");
+  GeTensorDesc tensor(GeShape(), FORMAT_NCHW, DT_FLOAT);
+  model.op_list_[6]->AddInputDesc(tensor);
+  model.op_list_[6]->AddOutputDesc(tensor);
+  memcpy_async->set_dst_max(0);
+  EXPECT_EQ(memcpy_async_task_info.Init(task_def, &model), SUCCESS);
+
+  memcpy_async->set_dst_max(0);
+  model.op_list_[6]->SetInputOffset({1024});
+  model.op_list_[6]->SetOutputOffset({5120});
+  EXPECT_EQ(memcpy_async_task_info.Init(task_def, &model), SUCCESS);
+
+
+  task_def.clear_memcpy_async();
+}
+
+TEST_F(UtestMemcpyAsyncTaskInfo, success_distribute) {
+  DavinciModel model(0, nullptr);
+  model.ge_model_ = MakeShared<GeModel>();
+
+  auto model_task_def = MakeShared<domi::ModelTaskDef>();
+  domi::TaskDef *task_def = model_task_def->add_task();
+  task_def->set_type(RT_MODEL_TASK_MEMCPY_ASYNC);
+  domi::KernelDef *kernel_def = task_def->mutable_kernel();
+  domi::KernelContext *ctx = kernel_def->mutable_context();
+  ctx->set_op_index(0);
+  model.op_list_[0] = CreateOpDesc("memcpyasync", MEMCPYASYNC);
+  TaskInfoPtr task_info = TaskInfoFactory::Instance().Create(static_cast<rtModelTaskType_t>(task_def->type()));
+
+  model.task_list_ = { task_info };
+  model.ge_model_->SetModelTaskDef(model_task_def);
+
+  EXPECT_EQ(model.DistributeTask(), SUCCESS);
+  EXPECT_EQ(task_info->Distribute(), SUCCESS);
+  task_info->Release();
+}
+
+TEST_F(UtestMemcpyAsyncTaskInfo, success_memcpy_async_calculate_args) {
+  DavinciModel model(0, nullptr);
+  domi::TaskDef task_def;
+
+  domi::MemcpyAsyncDef *memcpy_async = task_def.mutable_memcpy_async();
+  memcpy_async->set_dst(0x08003000);
+  memcpy_async->set_dst_max(512);
+  memcpy_async->set_src(0x08008000);
+  memcpy_async->set_count(1);
+  memcpy_async->set_kind(RT_MEMCPY_DEVICE_TO_DEVICE);
+  memcpy_async->set_op_index(0);
+
+  model.op_list_[0] = CreateOpDesc("memcpyasync", MEMCPYASYNC);
+  AttrUtils::SetStr(model.op_list_[0], ATTR_DYNAMIC_SHAPE_FIXED_ADDR, "Hello Mr Tree");
+
+  // DavinciModel is null
+  MemcpyAsyncTaskInfo memcpy_async_task_info;
+  EXPECT_EQ(memcpy_async_task_info.CalculateArgs(task_def, &model), SUCCESS);
+}
+
+TEST_F(UtestMemcpyAsyncTaskInfo, memcpy_async_update_args) {
+  DavinciModel model(0, nullptr);
+
+  MemcpyAsyncTaskInfo memcpy_async_task_info;
+  memcpy_async_task_info.davinci_model_ = &model;
+
+  EXPECT_EQ(memcpy_async_task_info.UpdateArgs(), SUCCESS);
+}
+
+}  // namespace ge
diff --git a/tests/ut/ge/graph/load/model_helper_unittest.cc b/tests/ut/ge/graph/load/model_helper_unittest.cc
new file mode 100644
index 00000000..455285bf
--- /dev/null
+++ b/tests/ut/ge/graph/load/model_helper_unittest.cc
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#define private public
+#define protected public
+#include "framework/common/helper/model_helper.h"
+#include "ge/model/ge_model.h"
+#undef private
+#undef protected
+
+#include "proto/task.pb.h"
+
+using namespace std;
+
+namespace ge {
+class UtestModelHelper : public testing::Test {
+ protected:
+  void SetUp() override {}
+
+  void TearDown() override {}
+};
+
+TEST_F(UtestModelHelper, save_size_to_modeldef_failed)
+{
+  GeModelPtr ge_model = ge::MakeShared<ge::GeModel>();
+  ModelHelper model_helper;
+  EXPECT_EQ(ACL_ERROR_GE_MEMORY_ALLOCATION, model_helper.SaveSizeToModelDef(ge_model));
+}
+
+TEST_F(UtestModelHelper, save_size_to_modeldef)
+{
+  GeModelPtr ge_model = ge::MakeShared<ge::GeModel>();
+  std::shared_ptr<domi::ModelTaskDef> task = ge::MakeShared<domi::ModelTaskDef>();
+  ge_model->SetModelTaskDef(task);
+  ModelHelper model_helper;
+  EXPECT_EQ(SUCCESS, model_helper.SaveSizeToModelDef(ge_model));
+}
+}  // namespace ge
diff --git a/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc b/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc
index 44642f93..00069930 100644
--- a/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc
+++ b/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc
@@ -254,6 +254,17 @@ TEST_F(UtestModelManagerDavinciModel, eventlist_success) {
   delete model;
 }
 
+// test Shrink
+TEST_F(UtestModelManagerDavinciModel, shrink_success) {
+  DavinciModel model(0, g_label_call_back);
+  OpDescPtr op_desc_ptr = make_shared<OpDesc>("Cast", "Cast");
+  void *addr = nullptr;
+  rtMalloc(&addr, 128, RT_MEMORY_HBM);
+  model.saved_task_addrs_.emplace(op_desc_ptr, addr);
+  model.Shrink();
+  EXPECT_EQ(model.saved_task_addrs_.isEmpty(), true);
+}
+
 // test rtMalloc
 TEST_F(UtestModelManagerDavinciModel, failed_reset_device) {
   DavinciModel model(0, g_label_call_back);
diff --git a/tests/ut/ge/graph/load/new_model_manager_model_manager_aicpu_unittest.cc b/tests/ut/ge/graph/load/new_model_manager_model_manager_aicpu_unittest.cc
new file mode 100644
index 00000000..43e094b5
--- /dev/null
+++ b/tests/ut/ge/graph/load/new_model_manager_model_manager_aicpu_unittest.cc
@@ -0,0 +1,102 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cce/compiler_stub.h>
+#include <gtest/gtest.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "common/debug/log.h"
+#include "common/l2_cache_optimize.h"
+#include "common/model_parser/base.h"
+#include "common/properties_manager.h"
+#include "common/types.h"
+
+#define private public
+#define protected public
+#include "common/helper/om_file_helper.h"
+#include "common/op/ge_op_utils.h"
+#include "graph/load/graph_loader.h"
+#include "graph/load/new_model_manager/davinci_model.h"
+#include "graph/load/new_model_manager/davinci_model_parser.h"
+#include "graph/load/new_model_manager/model_manager.h"
+//#include "new_op_test_utils.h"
+#undef private
+#undef protected
+
+using namespace std;
+using namespace testing;
+
+namespace ge {
+
+const static std::string ENC_KEY = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";
+
+class UtestModelManagerModelManagerAicpu : public testing::Test {
+ protected:
+  void SetUp() {}
+
+  void TearDown() {}
+};
+
+TEST_F(UtestModelManagerModelManagerAicpu, checkAicpuOptype) {
+  ModelManager model_manager;
+  uint32_t model_id = 0;
+  std::vector<std::string> aicpu_op_list;
+  std::vector<std::string> aicpu_tf_list;
+  aicpu_tf_list.emplace_back("FrameworkOp");
+  aicpu_tf_list.emplace_back("Unique");
+
+  model_manager.LaunchKernelCheckAicpuOp(aicpu_op_list, aicpu_tf_list);
+  // Load allow listener is null
+  // EXPECT_EQ(ge::FAILED, mm.LoadModelOffline(model_id, data, nullptr, nullptr));
+}
+
+TEST_F(UtestModelManagerModelManagerAicpu, DestroyAicpuKernel) {
+  ModelManager model_manager;
+  uint32_t model_id = 0;
+  std::vector<std::string> aicpu_op_list;
+  std::vector<std::string> aicpu_tf_list;
+  aicpu_tf_list.emplace_back("FrameworkOp");
+  aicpu_tf_list.emplace_back("Unique");
+
+  model_manager.DestroyAicpuKernel(0,0,0);
+  // Load allow listener is null
+  // EXPECT_EQ(ge::FAILED, mm.LoadModelOffline(model_id, data, nullptr, nullptr));
+}
+
+// test GenSessionId
+TEST_F(UtestModelManagerModelManagerAicpu, gen_session_id) {
+  ModelManager manager;
+  uint64_t session_id;
+  manager.GenSessionId(session_id);
+
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  uint64_t timestamp = static_cast<uint64_t>(tv.tv_sec * 1000000);
+
+  const uint64_t kSessionTimeMask = 0xfffffff000000000; // 不比us
+  const uint64_t kSessionPidMask  = 0x000000000000ff00;
+  const uint64_t kSessionBiasMask = 0x00000000000000ff;
+
+  uint32_t pid = getpid();
+
+  EXPECT_EQ(1, kSessionBiasMask & session_id);
+  EXPECT_EQ(pid<<8 & kSessionPidMask, kSessionPidMask & session_id);
+  //EXPECT_EQ(timestamp<<16 & kSessionTimeMask, kSessionTimeMask & session_id);
+}
+
+
+}  // namespace ge
diff --git a/tests/ut/ge/graph/load/new_op_test_utils.h b/tests/ut/ge/graph/load/new_op_test_utils.h
index 325a3f1f..4cbc78ac 100644
--- a/tests/ut/ge/graph/load/new_op_test_utils.h
+++ b/tests/ut/ge/graph/load/new_op_test_utils.h
@@ -154,7 +154,7 @@ class OmeTestOpUtils {
     if (model->HasAttr(MODEL_ATTR_TASKS)) {
        ge::Buffer task_buffer;
        GE_CHK_BOOL_RET_STATUS(ge::AttrUtils::GetZeroCopyBytes(model, MODEL_ATTR_TASKS, task_buffer), FAILED,
-		              "Get bytes failed.");
+		                          "Get bytes failed.");
        std::shared_ptr<ModelTaskDef> task = ge::MakeShared<ModelTaskDef>();
        GE_CHECK_NOTNULL(task);
        GE_IF_BOOL_EXEC(task_buffer.GetData() == nullptr, GELOGE(FAILED, "Get data fail"); return FAILED);
diff --git a/tests/ut/ge/graph/passes/folding_kernel/broadcast_args_kernel_unittest.cc b/tests/ut/ge/graph/passes/folding_kernel/broadcast_args_kernel_unittest.cc
index 7990a117..3a9f758b 100644
--- a/tests/ut/ge/graph/passes/folding_kernel/broadcast_args_kernel_unittest.cc
+++ b/tests/ut/ge/graph/passes/folding_kernel/broadcast_args_kernel_unittest.cc
@@ -52,7 +52,6 @@
 
 using namespace testing;
 using namespace ge;
-using namespace cce;
 using namespace ge::test;
 
 #define TEST_OPERATOR(op_, input_shapes, output_shapes)                                                 \
diff --git a/tests/ut/ge/graph/passes/folding_kernel/broadcast_gradient_args_kernel_unittest.cc b/tests/ut/ge/graph/passes/folding_kernel/broadcast_gradient_args_kernel_unittest.cc
index e8d15291..bb021589 100644
--- a/tests/ut/ge/graph/passes/folding_kernel/broadcast_gradient_args_kernel_unittest.cc
+++ b/tests/ut/ge/graph/passes/folding_kernel/broadcast_gradient_args_kernel_unittest.cc
@@ -52,7 +52,6 @@
 
 using namespace testing;
 using namespace ge;
-using namespace cce;
 
 class UtestBroadcastGradientArgsKernel : public testing::Test {
  protected:
diff --git a/tests/ut/ge/graph/passes/folding_kernel/empty_kernel_unittest.cc b/tests/ut/ge/graph/passes/folding_kernel/empty_kernel_unittest.cc
index 7705f986..65faad20 100644
--- a/tests/ut/ge/graph/passes/folding_kernel/empty_kernel_unittest.cc
+++ b/tests/ut/ge/graph/passes/folding_kernel/empty_kernel_unittest.cc
@@ -53,7 +53,6 @@
 
 using namespace testing;
 using namespace ge;
-using namespace cce;
 using namespace ge::test;
 
 class UtestEmptyKernel : public testing::Test {
diff --git a/tests/ut/ge/graph/passes/fuse_data_nodes_with_common_input_pass_unittest.cc b/tests/ut/ge/graph/passes/fuse_data_nodes_with_common_input_pass_unittest.cc
new file mode 100644
index 00000000..8c3469c8
--- /dev/null
+++ b/tests/ut/ge/graph/passes/fuse_data_nodes_with_common_input_pass_unittest.cc
@@ -0,0 +1,182 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "graph/passes/fuse_data_nodes_with_common_input_pass.h"
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+#include <map>
+
+#include "inc/pass_manager.h"
+#include "common/ge_inner_error_codes.h"
+#include "graph_builder_utils.h"
+#include "graph/utils/tensor_utils.h"
+#include "graph/utils/op_desc_utils.h"
+#include "graph/utils/type_utils.h"
+#include "graph/utils/node_utils.h"
+
+namespace ge {
+
+class UtestFuseDataNodesWithCommonInputPass : public testing::Test {
+protected:
+  void SetUp() {}
+  void TearDown() {}
+
+public:
+  NodePtr MakeNode(const ComputeGraphPtr &graph, uint32_t in_num, uint32_t out_num, string name, string type) {
+    GeTensorDesc test_desc(GeShape(), FORMAT_NCHW, DT_FLOAT);
+    auto op_desc = std::make_shared<OpDesc>(name, type);
+    for (auto i = 0; i < in_num; ++i) {
+      op_desc->AddInputDesc(test_desc);
+    }
+    for (auto i = 0; i < out_num; ++i) {
+      op_desc->AddOutputDesc(test_desc);
+    }
+    return graph->AddNode(op_desc);
+  }
+};
+
+/// graph with subgraph
+///       const
+///       | | |
+///        case
+///          |
+///       netoutput
+///        ...
+///      data0      data1       data2
+TEST_F(UtestFuseDataNodesWithCommonInputPass, graph_with_subgraph1) {
+  PassManager pass_manager;
+  pass_manager.AddPass("FuseDataNodesWithCommonInputPass", new (std::nothrow) FuseDataNodesWithCommonInputPass);
+  ComputeGraphPtr parent_graph = std::make_shared<ComputeGraph>("parent_graph");
+  auto parent_const = MakeNode(parent_graph, 0, 1, "parent_const", "Const");
+  auto parent_case = MakeNode(parent_graph, 3, 1, "parent_case", "Case");
+  auto parent_output = MakeNode(parent_graph, 1, 0, "parent_output", "NetOutput");
+
+  GeTensorDesc tensor_desc(GeShape({1,3,224,224}), FORMAT_NCHW, DT_FLOAT);
+
+  parent_const->GetOpDesc()->UpdateOutputDesc(0, tensor_desc);
+  parent_case->GetOpDesc()->UpdateInputDesc(0, tensor_desc);
+  parent_case->GetOpDesc()->UpdateInputDesc(1, tensor_desc);
+  parent_case->GetOpDesc()->UpdateInputDesc(2, tensor_desc);
+  parent_case->GetOpDesc()->UpdateOutputDesc(0, tensor_desc);
+
+  GraphUtils::AddEdge(parent_const->GetOutDataAnchor(0), parent_case->GetInDataAnchor(0));
+  GraphUtils::AddEdge(parent_const->GetOutDataAnchor(0), parent_case->GetInDataAnchor(1));
+  GraphUtils::AddEdge(parent_const->GetOutDataAnchor(0), parent_case->GetInDataAnchor(2));
+  GraphUtils::AddEdge(parent_case->GetOutDataAnchor(0), parent_output->GetInDataAnchor(0));
+
+  auto case_node = parent_graph->FindNode("parent_case");
+  EXPECT_NE(case_node, nullptr);
+  size_t input_data_node_num = case_node->GetInDataNodes().size();
+  EXPECT_EQ(input_data_node_num, 3);
+
+  ComputeGraphPtr sub_graph = std::make_shared<ComputeGraph>("sub_graph");
+  auto data0 = MakeNode(sub_graph, 1, 1, "data0", "Data");
+  data0->GetOpDesc()->UpdateInputDesc(0, tensor_desc);
+  data0->GetOpDesc()->UpdateOutputDesc(0, tensor_desc);
+  auto data1 = MakeNode(sub_graph, 1, 1, "data1", "Data");
+  data1->GetOpDesc()->UpdateInputDesc(0, tensor_desc);
+  data1->GetOpDesc()->UpdateOutputDesc(0, tensor_desc);
+  auto data2 = MakeNode(sub_graph, 1, 1, "data2", "Data");
+  data2->GetOpDesc()->UpdateInputDesc(0, tensor_desc);
+  data2->GetOpDesc()->UpdateOutputDesc(0, tensor_desc);
+  (void)AttrUtils::SetInt(data0->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, 0);
+  (void)AttrUtils::SetInt(data1->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, 1);
+  (void)AttrUtils::SetInt(data2->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, 2);
+
+  sub_graph->SetParentNode(parent_case);
+  sub_graph->SetParentGraph(parent_graph);
+  parent_graph->AddSubgraph(sub_graph->GetName(), sub_graph);
+  size_t sub_graph_num = parent_graph->GetAllSubgraphs().size();
+  EXPECT_EQ(sub_graph_num, 1);
+
+  auto data1_node = sub_graph->FindNode("data1");
+  EXPECT_NE(data1_node, nullptr);
+  auto data2_node = sub_graph->FindNode("data2");
+  EXPECT_NE(data2_node, nullptr);
+
+  EXPECT_EQ(pass_manager.Run(parent_graph), SUCCESS);
+
+  // after pass, data1 and data2 are fused to data0
+  data1_node = sub_graph->FindNode("data1");
+  EXPECT_EQ(data1_node, nullptr);
+  data2_node = sub_graph->FindNode("data2");
+  EXPECT_EQ(data2_node, nullptr);
+}
+
+/// graph with subgraph
+///            const
+///          /       \
+///        cast1  cast1
+///          \      /
+///             case
+///              |
+///           netoutput
+///        ...
+///       data1       data2
+///          \         /
+///            add
+TEST_F(UtestFuseDataNodesWithCommonInputPass, graph_with_subgraph2) {
+  PassManager pass_manager;
+  pass_manager.AddPass("FuseDataNodesWithCommonInputPass", new (std::nothrow) FuseDataNodesWithCommonInputPass);
+  ComputeGraphPtr parent_graph = std::make_shared<ComputeGraph>("parent_graph");
+  auto parent_const = MakeNode(parent_graph, 0, 1, "parent_const", "Const");
+  auto parent_cast1 = MakeNode(parent_graph, 1, 1, "parent_cast1", "Cast");
+  auto parent_case = MakeNode(parent_graph, 2, 1, "parent_case", "Case");
+  auto parent_output = MakeNode(parent_graph, 1, 0, "parent_output", "NetOutput");
+
+  GeTensorDesc tensor_desc(GeShape({1,3,224,224}), FORMAT_NCHW, DT_FLOAT);
+
+  parent_const->GetOpDesc()->UpdateOutputDesc(0, tensor_desc);
+  parent_cast1->GetOpDesc()->UpdateInputDesc(0, tensor_desc);
+  parent_cast1->GetOpDesc()->UpdateOutputDesc(0, tensor_desc);
+  parent_case->GetOpDesc()->UpdateInputDesc(0, tensor_desc);
+  parent_case->GetOpDesc()->UpdateInputDesc(1, tensor_desc);
+  parent_case->GetOpDesc()->UpdateOutputDesc(0, tensor_desc);
+
+  GraphUtils::AddEdge(parent_const->GetOutDataAnchor(0), parent_cast1->GetInDataAnchor(0));
+  GraphUtils::AddEdge(parent_cast1->GetOutDataAnchor(0), parent_case->GetInDataAnchor(0));
+  GraphUtils::AddEdge(parent_const->GetOutDataAnchor(0), parent_cast1->GetInDataAnchor(0));
+  GraphUtils::AddEdge(parent_cast1->GetOutDataAnchor(0), parent_case->GetInDataAnchor(1));
+  GraphUtils::AddEdge(parent_case->GetOutDataAnchor(0), parent_output->GetInDataAnchor(0));
+
+  ComputeGraphPtr sub_graph = std::make_shared<ComputeGraph>("sub_graph");
+  auto data0 = MakeNode(sub_graph, 1, 1, "data0", "Data");
+  data0->GetOpDesc()->UpdateInputDesc(0, tensor_desc);
+  data0->GetOpDesc()->UpdateOutputDesc(0, tensor_desc);
+  auto data1 = MakeNode(sub_graph, 1, 1, "data1", "Data");
+  data1->GetOpDesc()->UpdateInputDesc(0, tensor_desc);
+  data1->GetOpDesc()->UpdateOutputDesc(0, tensor_desc);
+  (void)AttrUtils::SetInt(data0->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, 0);
+  (void)AttrUtils::SetInt(data1->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, 1);
+
+  sub_graph->SetParentNode(parent_case);
+  sub_graph->SetParentGraph(parent_graph);
+  parent_graph->AddSubgraph(sub_graph->GetName(), sub_graph);
+
+  size_t sub_graph_num = parent_graph->GetAllSubgraphs().size();
+  EXPECT_EQ(sub_graph_num, 1);
+  auto data1_node = sub_graph->FindNode("data1");
+  EXPECT_NE(data1_node, nullptr);
+
+  EXPECT_EQ(pass_manager.Run(parent_graph), SUCCESS);
+
+  // after pass, data1 is fused to data0
+  data1_node = sub_graph->FindNode("data1");
+  EXPECT_EQ(data1_node, nullptr);
+}
+}  // namespace ge
diff --git a/tests/ut/ge/graph/passes/multi_batch_clone_pass_unittest.cc b/tests/ut/ge/graph/passes/multi_batch_clone_pass_unittest.cc
new file mode 100644
index 00000000..1b75a613
--- /dev/null
+++ b/tests/ut/ge/graph/passes/multi_batch_clone_pass_unittest.cc
@@ -0,0 +1,250 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "graph/passes/multi_batch_clone_pass.h"
+
+#include <gtest/gtest.h>
+#include <set>
+#include <string>
+
+#include "inc/pass_manager.h"
+#include "graph/utils/tensor_utils.h"
+#include "graph/common/local_context.h"
+#include "graph/passes/multi_batch_pass.h"
+#include "graph/preprocess/multi_batch_copy_graph.h"
+#include "graph/preprocess/insert_op/util_insert_aipp_op.h"
+#include "framework/omg/omg_inner_types.h"
+#include "register/op_registry.h"
+
+
+namespace ge{
+class UtestMultiBatchClonePass : public testing::Test {
+protected:
+  void SetUp() {
+    SetLocalOmgContext(domi::GetContext());
+    GetLocalOmgContext().dynamic_image_size.clear();
+    GetLocalOmgContext().dynamic_batch_size.clear();
+  }
+  void TearDown() {
+    GetLocalOmgContext().dynamic_image_size.clear();
+    GetLocalOmgContext().dynamic_batch_size.clear();
+    GetLocalOmgContext().dynamic_node_type.clear();
+  }
+
+public:
+  NodePtr MakeNode(const ComputeGraphPtr &graph, uint32_t in_num, uint32_t out_num, string name, string type) {
+    GeTensorDesc test_desc(GeShape(), FORMAT_NCHW, DT_FLOAT);
+    auto op_desc = std::make_shared<OpDesc>(name, type);
+    for (auto i = 0; i < in_num; ++i) {
+      op_desc->AddInputDesc(test_desc);
+    }
+    for (auto i = 0; i < out_num; ++i) {
+      op_desc->AddOutputDesc(test_desc);
+    }
+    return graph->AddNode(op_desc);
+  }
+
+  NodePtr MakeConstNode(const ComputeGraphPtr &graph) {
+    static uint32_t index = 0;
+    GeTensorDesc test_desc(GeShape(), FORMAT_NCHW, DT_FLOAT);
+    auto op_desc = std::make_shared<OpDesc>("dynamic_const_" + std::to_string(index++), "Const");
+    op_desc->AddOutputDesc(test_desc);
+    return graph->AddNode(op_desc);
+  }
+
+  void make_original_graph(const ComputeGraphPtr &graph) {
+    auto conv2d_node = MakeNode(graph, 3, 1, "conv1", "Conv2D");
+    {
+      auto data1 = MakeNode(graph, 1, 1, "data", "Data");
+      GeTensorDesc tensor_desc(GeShape({-1,3,224,224}), FORMAT_NCHW, DT_FLOAT);
+      data1->GetOpDesc()->UpdateInputDesc(0, tensor_desc);
+      data1->GetOpDesc()->UpdateOutputDesc(0, tensor_desc);
+      AttrUtils::SetInt(data1->GetOpDesc(), ATTR_NAME_INDEX, 0);
+      GetLocalOmgContext().user_input_dims = {std::make_pair(data1->GetOpDesc()->GetName(), vector<int64_t>{-1,3,224,224})};
+
+      GraphUtils::AddEdge(data1->GetOutDataAnchor(0), conv2d_node->GetInDataAnchor(0));
+      auto const1 = MakeConstNode(graph);
+      GraphUtils::AddEdge(const1->GetOutDataAnchor(0), conv2d_node->GetInDataAnchor(1));
+      auto const2 = MakeConstNode(graph);
+      GraphUtils::AddEdge(const2->GetOutDataAnchor(0), conv2d_node->GetInDataAnchor(2));
+    }
+
+    auto bn_conv1 = MakeNode(graph, 4, 1, "bn_conv1", "BNInference");
+    {
+      GraphUtils::AddEdge(conv2d_node->GetOutDataAnchor(0), bn_conv1->GetInDataAnchor(0));
+      auto const1 = MakeConstNode(graph);
+      GraphUtils::AddEdge(const1->GetOutDataAnchor(0), bn_conv1->GetInDataAnchor(1));
+      auto const2 = MakeConstNode(graph);
+      GraphUtils::AddEdge(const2->GetOutDataAnchor(0), bn_conv1->GetInDataAnchor(2));
+      auto const3= MakeConstNode(graph);
+      GraphUtils::AddEdge(const3->GetOutDataAnchor(0), bn_conv1->GetInDataAnchor(3));
+    }
+
+    auto scale_conv1 = MakeNode(graph, 4, 1, "scale1", "Scale");
+    {
+      GraphUtils::AddEdge(bn_conv1->GetOutDataAnchor(0), scale_conv1->GetInDataAnchor(0));
+      auto const1 = MakeConstNode(graph);
+      GraphUtils::AddEdge(const1->GetOutDataAnchor(0), scale_conv1->GetInDataAnchor(1));
+      auto const2 = MakeConstNode(graph);
+      GraphUtils::AddEdge(const2->GetOutDataAnchor(0), scale_conv1->GetInDataAnchor(2));
+    }
+
+    auto output_node = MakeNode(graph, 1, 0, "output1", "NetOutput");
+    GraphUtils::AddEdge(scale_conv1->GetOutDataAnchor(0), output_node->GetInDataAnchor(0));
+  }
+
+  void GraphWithJustData(const ComputeGraphPtr &graph) {
+    auto conv2d_node = MakeNode(graph, 3, 1, "conv1", "Conv2D");
+    {
+      auto data1 = MakeNode(graph, 1, 1, "data", "Data");
+      GeTensorDesc tensor_desc(GeShape({-1,3,224,224}), FORMAT_NCHW, DT_FLOAT);
+      data1->GetOpDesc()->UpdateInputDesc(0, tensor_desc);
+      data1->GetOpDesc()->UpdateOutputDesc(0, tensor_desc);
+      AttrUtils::SetInt(data1->GetOpDesc(), ATTR_NAME_INDEX, 0);
+      GetLocalOmgContext().user_input_dims = {std::make_pair(data1->GetOpDesc()->GetName(), vector<int64_t>{-1,3,224,224})};
+
+      GraphUtils::AddEdge(data1->GetOutDataAnchor(0), conv2d_node->GetInDataAnchor(0));
+      auto const1 = MakeConstNode(graph);
+      GraphUtils::AddEdge(const1->GetOutDataAnchor(0), conv2d_node->GetInDataAnchor(1));
+      auto const2 = MakeConstNode(graph);
+      GraphUtils::AddEdge(const2->GetOutDataAnchor(0), conv2d_node->GetInDataAnchor(2));
+    }
+
+    auto output_node = MakeNode(graph, 1, 0, "output1", "NetOutput");
+    GraphUtils::AddEdge(conv2d_node->GetOutDataAnchor(0), output_node->GetInDataAnchor(0));
+  }
+
+  void GraphWithGetNextNosink(const ComputeGraphPtr &graph) {
+    auto conv2d_node = MakeNode(graph, 3, 1, "conv1", "Conv2D");
+    {
+      auto data1 = MakeNode(graph, 1, 1, "IteratorGetNext_data", "Data");
+      GeTensorDesc tensor_desc(GeShape({-1,3,224,224}), FORMAT_NCHW, DT_FLOAT);
+      data1->GetOpDesc()->UpdateInputDesc(0, tensor_desc);
+      data1->GetOpDesc()->UpdateOutputDesc(0, tensor_desc);
+      AttrUtils::SetInt(data1->GetOpDesc(), ATTR_NAME_INDEX, 0);
+      GetLocalOmgContext().user_input_dims = {std::make_pair(data1->GetOpDesc()->GetName(), vector<int64_t>{-1,3,224,224})};
+
+      GraphUtils::AddEdge(data1->GetOutDataAnchor(0), conv2d_node->GetInDataAnchor(0));
+      auto const1 = MakeConstNode(graph);
+      GraphUtils::AddEdge(const1->GetOutDataAnchor(0), conv2d_node->GetInDataAnchor(1));
+      auto const2 = MakeConstNode(graph);
+      GraphUtils::AddEdge(const2->GetOutDataAnchor(0), conv2d_node->GetInDataAnchor(2));
+    }
+
+    auto output_node = MakeNode(graph, 1, 0, "output1", "NetOutput");
+    GraphUtils::AddEdge(conv2d_node->GetOutDataAnchor(0), output_node->GetInDataAnchor(0));
+  }
+
+  // getnext has one data and has one out of shape
+  void GraphWithGetNextSink(const ComputeGraphPtr &graph) {
+    auto conv2d_node = MakeNode(graph, 3, 1, "conv1", "Conv2D");
+    {
+      auto data1 = MakeNode(graph, 1, 2, "data", "IteratorV2");
+      GeTensorDesc tensor_desc(GeShape({-1,3,224,224}), FORMAT_NCHW, DT_FLOAT);
+      GeTensorDesc shape_desc(GeShape({4,3,224,224}), FORMAT_NCHW, DT_FLOAT);
+      data1->GetOpDesc()->UpdateOutputDesc(0, tensor_desc);
+      data1->GetOpDesc()->UpdateOutputDesc(1, shape_desc);
+      AttrUtils::SetInt(data1->GetOpDesc(), ATTR_NAME_INDEX, 0);
+      GetLocalOmgContext().user_input_dims = {std::make_pair(data1->GetOpDesc()->GetName(), vector<int64_t>{-1,3,224,224})};
+
+      GraphUtils::AddEdge(data1->GetOutDataAnchor(0), conv2d_node->GetInDataAnchor(0));
+      auto identity = MakeNode(graph, 1, 0, "identity", "Identity");
+      GraphUtils::AddEdge(data1->GetOutDataAnchor(1), identity->GetInDataAnchor(0));
+      auto const1 = MakeConstNode(graph);
+      GraphUtils::AddEdge(const1->GetOutDataAnchor(0), conv2d_node->GetInDataAnchor(1));
+      auto const2 = MakeConstNode(graph);
+      GraphUtils::AddEdge(const2->GetOutDataAnchor(0), conv2d_node->GetInDataAnchor(2));
+    }
+
+    auto output_node = MakeNode(graph, 1, 0, "output1", "NetOutput");
+    GraphUtils::AddEdge(conv2d_node->GetOutDataAnchor(0), output_node->GetInDataAnchor(0));
+  }
+};
+
+// graph is nullptr
+TEST_F(UtestMultiBatchClonePass, graph_nullptr) {
+  PassManager pass_manager;
+  pass_manager.AddPass("MultiBatchClonePass", new (std::nothrow) MultiBatchClonePass);
+  ComputeGraphPtr graph;
+  EXPECT_EQ(pass_manager.Run(graph), PARAM_INVALID);
+}
+
+// graph with subgraph
+TEST_F(UtestMultiBatchClonePass, graph_with_subgraph) {
+  PassManager pass_manager;
+  pass_manager.AddPass("MultiBatchClonePass", new (std::nothrow) MultiBatchClonePass);
+  ComputeGraphPtr graph = std::make_shared<ComputeGraph>("test_graph");
+  make_original_graph(graph);
+  EXPECT_EQ(pass_manager.Run(graph), SUCCESS);
+
+  ComputeGraphPtr owner = std::make_shared<ComputeGraph>("test_owner");
+  auto func_node = MakeNode(owner, 3, 1, "test_if", "If");
+  graph->SetParentNode(func_node);
+  graph->SetParentGraph(owner);
+  owner->AddSubgraph(graph->GetName(), graph);
+  size_t sub_graph_num = owner->GetAllSubgraphs().size();
+  EXPECT_EQ(sub_graph_num, 1);
+  EXPECT_EQ(pass_manager.Run(graph), SUCCESS);
+}
+
+//graph is uncompute graph, not need to do multi batch
+TEST_F(UtestMultiBatchClonePass, uncompute_graph) {
+  MultiBatchClonePass multi_batch_clone;
+  ComputeGraphPtr graph = std::make_shared<ComputeGraph>("test_graph");
+  make_original_graph(graph);
+  GetLocalOmgContext().need_multi_batch = false;
+  EXPECT_EQ(multi_batch_clone.Run(graph), SUCCESS);
+}
+
+
+//compute_graph with data from DATA
+TEST_F(UtestMultiBatchClonePass, compute_graph_with_data) {
+  MultiBatchClonePass multi_batch_clone;
+  ComputeGraphPtr graph = std::make_shared<ComputeGraph>("test_graph");
+  GraphWithJustData(graph);
+  GetLocalOmgContext().need_multi_batch = true;
+  EXPECT_EQ(multi_batch_clone.Run(graph), SUCCESS);
+  GetLocalOmgContext().dynamic_node_type = DATA;
+  GetLocalOmgContext().dynamic_dims = "1;2;4;8";
+  EXPECT_EQ(multi_batch_clone.Run(graph), SUCCESS);
+  EXPECT_EQ(GetLocalOmgContext().data_nodes.size(), 1);
+}
+
+//compute_graph with data from GetNext_nosink
+TEST_F(UtestMultiBatchClonePass, compute_graph_with_getnext_nosink) {
+  MultiBatchClonePass multi_batch_clone;
+  ComputeGraphPtr graph = std::make_shared<ComputeGraph>("test_graph");
+  GraphWithGetNextNosink(graph);
+  GetLocalOmgContext().need_multi_batch = true;
+  GetLocalOmgContext().dynamic_node_type = GETNEXT;
+  GetLocalOmgContext().dynamic_dims = "1;2;4;8";
+  EXPECT_EQ(multi_batch_clone.Run(graph), SUCCESS);
+  EXPECT_EQ(GetLocalOmgContext().getnext_nosink_nodes.size(), 1);
+}
+
+//compute_graph with data from GetNext_nosink
+TEST_F(UtestMultiBatchClonePass, compute_graph_with_getnext_sink) {
+  MultiBatchClonePass multi_batch_clone;
+  ComputeGraphPtr graph = std::make_shared<ComputeGraph>("test_graph");
+  GraphWithGetNextSink(graph);
+  GetLocalOmgContext().need_multi_batch = true;
+  GetLocalOmgContext().dynamic_node_type = GETNEXT;
+  GetLocalOmgContext().dynamic_dims = "1;2;4;8";
+  EXPECT_EQ(multi_batch_clone.Run(graph), SUCCESS);
+  EXPECT_EQ(GetLocalOmgContext().getnext_nosink_nodes.size(), 0);
+}
+
+}
diff --git a/tests/ut/ge/graph/passes/variable_op_pass_unittest.cc b/tests/ut/ge/graph/passes/variable_op_pass_unittest.cc
index 8058279f..b51908e2 100644
--- a/tests/ut/ge/graph/passes/variable_op_pass_unittest.cc
+++ b/tests/ut/ge/graph/passes/variable_op_pass_unittest.cc
@@ -38,6 +38,7 @@
 #include "graph/manager/graph_mem_allocator.h"
 #include "graph/manager/graph_var_manager.h"
 #include "graph_builder_utils.h"
+#include "cce/dnn.h"
 #include "cce/dnn_struct_base.hpp"
 #include "common/formats/format_transfers/format_transfer_nchw_nc1hwc0.h"
 #include "common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.h"
diff --git a/tests/ut/ge/graph_ir/ge_operator_factory_unittest.cc b/tests/ut/ge/graph_ir/ge_operator_factory_unittest.cc
index 64f76515..97be491a 100644
--- a/tests/ut/ge/graph_ir/ge_operator_factory_unittest.cc
+++ b/tests/ut/ge/graph_ir/ge_operator_factory_unittest.cc
@@ -84,7 +84,7 @@ TEST(UtestGeOperatorFactory, register_func) {
   status = OperatorFactoryImpl::RegisterVerifyFunc("ABC", nullptr);
   EXPECT_EQ(GRAPH_SUCCESS, status);
 }
-
+/*
 TEST(UtestGeOperatorFactory, get_ops_type_list_fail) {
   auto operator_creators_temp = OperatorFactoryImpl::operator_creators_;
   OperatorFactoryImpl::operator_creators_ = nullptr;
@@ -92,4 +92,5 @@ TEST(UtestGeOperatorFactory, get_ops_type_list_fail) {
   graphStatus status = OperatorFactoryImpl::GetOpsTypeList(all_ops);
   EXPECT_EQ(GRAPH_FAILED, status);
   OperatorFactoryImpl::operator_creators_ = operator_creators_temp;
-}
\ No newline at end of file
+}
+*/
diff --git a/tests/ut/ge/profiling/ge_profiling_manager_unittest.cc b/tests/ut/ge/profiling/ge_profiling_manager_unittest.cc
index 5027c988..ef19b516 100644
--- a/tests/ut/ge/profiling/ge_profiling_manager_unittest.cc
+++ b/tests/ut/ge/profiling/ge_profiling_manager_unittest.cc
@@ -37,121 +37,32 @@ class UtestGeProfilinganager : public testing::Test {
   void TearDown() override {}
 };
 
-class TestReporter : public Msprof::Engine::Reporter {
- public:
-  TestReporter() {}
-  ~TestReporter() {}
-
- public:
-  int Report(const Msprof::Engine::ReporterData *data) { return 0; }
-
-  int Flush() { return 0; }
-};
-
-class TestPluginIntf : public Msprof::Engine::PluginIntf {
- public:
-  TestPluginIntf() {}
-  ~TestPluginIntf() {}
-
- public:
-  int Init(const Msprof::Engine::Reporter *reporter) { return 0; }
-
-  int UnInit() { return 0; }
-};
-
 TEST_F(UtestGeProfilinganager, init_success) {
   setenv("PROFILING_MODE", "true", true);
   Options options;
   options.device_id = 0;
   options.job_id = "0";
-  string profiling_config;
-
-  ProfilingManager::Instance().SetProfilingConfig(profiling_config);
-
-  Status ret = ProfilingManager::Instance().Init(options);
-  EXPECT_EQ(ret, ge::SUCCESS);
-}
-
-TEST_F(UtestGeProfilinganager, start_profiling_success) {
-  int32_t iter_num = 1;
-
-  setenv("PROFILING_MODE", "true", true);
-  setenv("PROFILING_OPTIONS", "training_trace", true);
-  Options options;
-  string profiling_config;
-
-  ProfilingManager::Instance().SetProfilingConfig(profiling_config);
-
-  Status ret = ProfilingManager::Instance().Init(options);
-  EXPECT_EQ(ret, ge::SUCCESS);
-  ret = ProfilingManager::Instance().StartProfiling(iter_num, 0);
-  EXPECT_EQ(ret, ge::SUCCESS);
+  options.profiling_mode = "1";
+  options.profiling_options = R"({"result_path":"/data/profiling","training_trace":"on","task_trace":"on","aicpu_trace":"on","fp_point":"Data_0","bp_point":"addn","ai_core_metrics":"ResourceConflictRatio"})";
 
-  setenv("PROFILING_OPTIONS", "op_trance", true);
-  ret = ProfilingManager::Instance().Init(options);
-  EXPECT_EQ(ret, ge::SUCCESS);
-  ret = ProfilingManager::Instance().StartProfiling(iter_num, 0);
-  EXPECT_EQ(ret, ge::SUCCESS);
-}
-
-TEST_F(UtestGeProfilinganager, stop_profiling_success) {
-  int32_t iter_num = 1;
-  Options options;
 
-  TestReporter test_reporter;
+  struct MsprofGeOptions prof_conf = {{ 0 }};
 
-  string profiling_config;
-  ProfilingManager::Instance().SetProfilingConfig(profiling_config);
-
-  Status ret = 0;
-  setenv("PROFILING_OPTIONS", "op_trance", true);
-  ret = ProfilingManager::Instance().Init(options);
-  EXPECT_EQ(ret, ge::SUCCESS);
-  ret = ProfilingManager::Instance().StartProfiling(iter_num, 0);
+  Status ret = ProfilingManager::Instance().InitFromOptions(options, prof_conf);
   EXPECT_EQ(ret, ge::SUCCESS);
-  ProfilingManager::Instance().StopProfiling();
-}
-
-TEST_F(UtestGeProfilinganager, plugin_impl_success) {
-  PluginImpl plugin_Impl("FMK");
-  TestReporter test_reporter;
-  Msprof::Engine::Reporter *reporter_ptr = &test_reporter;
-  plugin_Impl.Init(reporter_ptr);
-  plugin_Impl.UnInit();
-}
-
-TEST_F(UtestGeProfilinganager, profiling_engine_impl_success) {
-  ProfilingEngineImpl profiling_engine_impl;
-
-  Msprof::Engine::PluginIntf *plugin_ptr = new TestPluginIntf();
-  profiling_engine_impl.ReleasePlugin(plugin_ptr);
-
-  Msprof::Engine::PluginIntf *ptr = profiling_engine_impl.CreatePlugin();
-  delete ptr;
-  ptr = nullptr;
-}
-
-TEST_F(UtestGeProfilinganager, set_profilng_cfg_success) {
-  string profiling_config = "profiling_mode: true";
-  ProfilingManager::Instance().SetProfilingConfig(profiling_config);
 }
 
-TEST_F(UtestGeProfilinganager, init_from_cfg_success0) {
-  Options options;
-  string profiling_config =
-      "{\"startCfg\":[{\"deviceID\":\"0\",\"features\":[{\"name\":\"op_trace\",\"conf\":\"2\"}]}]}";
-  ProfilingManager::Instance().SetProfilingConfig(profiling_config);
+TEST_F(UtestGeProfilinganager, ParseOptions) {
+setenv("PROFILING_MODE", "true", true);
+Options options;
+options.device_id = 0;
+options.job_id = "0";
+options.profiling_mode = "1";
+options.profiling_options = R"({"result_path":"/data/profiling","training_trace":"on","task_trace":"on","aicpu_trace":"on","fp_point":"Data_0","bp_point":"addn","ai_core_metrics":"ResourceConflictRatio"})";
 
-  Status ret = ProfilingManager::Instance().Init(options);
-  EXPECT_EQ(ret, ge::SUCCESS);
-}
 
-TEST_F(UtestGeProfilinganager, init_from_cfg_success1) {
-  Options options;
-  string profiling_config =
-      "{\"startCfg\":[{\"deviceID\":\"0\",\"features\":[{\"name\":\"test_trace\"}],\"jobID\":\"1231231231\"}]}";
-  ProfilingManager::Instance().SetProfilingConfig(profiling_config);
+struct MsprofGeOptions prof_conf = {{ 0 }};
 
-  Status ret = ProfilingManager::Instance().Init(options);
-  EXPECT_EQ(ret, ge::SUCCESS);
+Status ret = ProfilingManager::Instance().ParseOptions(options.profiling_options);
+EXPECT_EQ(ret, ge::SUCCESS);
 }
diff --git a/tests/ut/ge/session/omg_omg_unittest.cc b/tests/ut/ge/session/omg_omg_unittest.cc
new file mode 100644
index 00000000..b9c7f1ec
--- /dev/null
+++ b/tests/ut/ge/session/omg_omg_unittest.cc
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "common/ge/ge_util.h"
+#include "proto/ge_ir.pb.h"
+#include "inc/framework/omg/omg.h"
+
+
+using namespace std;
+
+namespace ge {
+class UtestOmg : public testing::Test {
+ protected:
+  void SetUp() override {}
+
+  void TearDown() override {}
+};
+
+TEST_F(UtestOmg, display_model_info_failed) {
+  ge::proto::ModelDef model_def;
+  PrintModelInfo(&model_def);
+}
+
+TEST_F(UtestOmg, display_model_info_success) {
+  ge::proto::ModelDef model_def;
+  auto attrs = model_def.mutable_attr();
+  ge::proto::AttrDef *attr_def_soc = &(*attrs)["soc_version"];
+  attr_def_soc->set_s("Ascend310");
+  ge::proto::AttrDef *attr_def = &(*attrs)["om_info_list"];
+  attr_def->mutable_list()->add_i(1);
+  attr_def->mutable_list()->add_i(2);
+  attr_def->mutable_list()->add_i(3);
+  attr_def->mutable_list()->add_i(4);
+  attr_def->mutable_list()->add_i(5);
+  PrintModelInfo(&model_def);
+}
+}  // namespace ge
diff --git a/tests/ut/ge/single_op/single_op_model_unittest.cc b/tests/ut/ge/single_op/single_op_model_unittest.cc
index 7543b212..b6b97d89 100644
--- a/tests/ut/ge/single_op/single_op_model_unittest.cc
+++ b/tests/ut/ge/single_op/single_op_model_unittest.cc
@@ -17,7 +17,7 @@
 #include <gtest/gtest.h>
 #include <vector>
 
-#include "cce/taskdown_common.hpp"
+//#include "cce/taskdown_common.hpp"
 #include "graph/load/new_model_manager/model_utils.h"
 #include "graph/utils/graph_utils.h"
 #include "runtime/rt.h"
diff --git a/tests/ut/ge/single_op/stream_resource_unittest.cc b/tests/ut/ge/single_op/stream_resource_unittest.cc
index 8cc137dc..b7306815 100644
--- a/tests/ut/ge/single_op/stream_resource_unittest.cc
+++ b/tests/ut/ge/single_op/stream_resource_unittest.cc
@@ -58,6 +58,7 @@ TEST_F(UtestStreamResource, test_malloc_memory) {
   ASSERT_NE(res.MallocMemory(purpose, 100), nullptr);
 }
 
+/*
 TEST_F(UtestStreamResource, test_do_malloc_memory) {
   size_t max_allocated = 0;
   vector<uint8_t *> allocated;
@@ -83,3 +84,4 @@ TEST_F(UtestStreamResource, test_do_malloc_memory) {
       rtFree(res);
   }
 }
+*/
diff --git a/third_party/fwkacllib/inc/aicpu/aicpu_schedule/aicpu_op_type_list.h b/third_party/fwkacllib/inc/aicpu/aicpu_schedule/aicpu_op_type_list.h
new file mode 100644
index 00000000..8d16467c
--- /dev/null
+++ b/third_party/fwkacllib/inc/aicpu/aicpu_schedule/aicpu_op_type_list.h
@@ -0,0 +1,60 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef AICPU_OP_TYPE_LIST_H_
+#define AICPU_OP_TYPE_LIST_H_
+
+enum OpKernelType {
+  TF_KERNEL,
+  CPU_KERNEL
+};
+
+enum ReturnCode {
+  OP_TYPE_NOT_SUPPORT,
+  FORMAT_NOT_SUPPORT,
+  DTYPE_NOT_SUPPORT
+};
+
+#pragma pack(push, 1)
+//One byte alignment
+struct SysOpInfo {
+  uint64_t opLen;
+  uint64_t opType;
+  OpKernelType kernelsType;
+};
+
+struct OpParamInfo {
+  uint64_t num;
+  uint64_t dtypeList;
+  uint64_t formatList;
+};
+
+struct SysOpCheckInfo {
+  uint64_t opListNum;
+  uint64_t offSetLen;
+  uint64_t sysOpInfoList;
+  uint64_t opParamInfoList;
+};
+
+struct SysOpCheckResp {
+  uint64_t opListNum;
+  bool isWithoutJson;
+  uint64_t returnCodeList;
+  uint64_t sysOpInfoList;
+  uint64_t opParamInfoList;
+};
+#pragma pack(pop)
+#endif  // AICPU_OP_TYPE_LIST_H_
diff --git a/third_party/fwkacllib/inc/cce/aicpu_engine.h b/third_party/fwkacllib/inc/cce/aicpu_engine.h
index 8bf0bdb6..b83731a8 100644
--- a/third_party/fwkacllib/inc/cce/aicpu_engine.h
+++ b/third_party/fwkacllib/inc/cce/aicpu_engine.h
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #ifndef AICPU_ENGINE_H__
 #define AICPU_ENGINE_H__
 
diff --git a/third_party/fwkacllib/inc/cce/aicpu_engine_struct.h b/third_party/fwkacllib/inc/cce/aicpu_engine_struct.h
index a5f43be9..8c0c1847 100644
--- a/third_party/fwkacllib/inc/cce/aicpu_engine_struct.h
+++ b/third_party/fwkacllib/inc/cce/aicpu_engine_struct.h
@@ -33,18 +33,22 @@ typedef enum {
   FMK_KERNEL_TYPE_RESERVED
 } FwkkernelType_t;
 
+#pragma pack(push, 1)
 typedef struct {
   uint32_t fwkKernelType;  // FwkkernelType_t
   union {
     ::aicpu::FWKAdapter::FWKOperateParam fwk_kernel;
   } fwkKernelBase;
-} __attribute__((packed)) STR_FWK_OP_KERNEL;
+} STR_FWK_OP_KERNEL;
+#pragma pack(pop)
 
+#pragma pack(push, 1)
 struct SessionInfo {
   uint64_t sessionId;
   uint64_t kernelId;
   bool sessFlag;
-} __attribute__((packed));
+};
+#pragma pack(pop)
 
 #ifdef __cplusplus
 }
diff --git a/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h b/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h
index 79d94023..50b39d91 100644
--- a/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h
+++ b/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h
@@ -70,6 +70,7 @@ enum FWKExtUpdateAddrType {
   FWK_ADPT_UPDATE_INPUT_OUTPUT
 };
 
+#pragma pack(push, 1)
 // API Parameter Structure
 struct StrFWKKernel {
   FWKOperateType opType;
@@ -89,31 +90,39 @@ struct StrFWKKernel {
 
   uint64_t extInfoLen;         // extend info total length
   uint64_t extInfoAddr;        // extend info addr, ExtInfo structure
-} __attribute__((packed));
+};
+#pragma pack(pop)
 
 typedef StrFWKKernel FWKOperateParam;
 
 // Extent info ShapeAndType
 const uint32_t kMaxShapeDims = 8;
+#pragma pack(push, 1)
 struct ShapeAndType {
   int32_t type;
   int64_t dims[kMaxShapeDims];
-} __attribute__((packed));
+};
+#pragma pack(pop)
 
 // Extend info structure for extInfoAddr
 const uint32_t kExtInfoHeadSize = 8;
+
+#pragma pack(push, 1)
 struct ExtInfo {
   int32_t  infoType;    // extend type
   uint32_t infoLen;     // length for infoMsg
   char     infoMsg[0];  // extend value
-} __attribute__((packed));
+};
+#pragma pack(pop)
 
+#pragma pack(push, 1)
 struct ResultSummary {
   uint64_t shape_data_ptr;   // shape data addr, need convert to void*
   uint64_t shape_data_size;  // num of dims
   uint64_t raw_data_ptr;     // raw data addr,  need convert to void*
   uint64_t raw_data_size;    // size of raw data
-} __attribute__((packed));
+};
+#pragma pack(pop)
 }  // end  namespace FWKAdapter
 }  // namespace aicpu
 
diff --git a/third_party/fwkacllib/inc/hccl/base.h b/third_party/fwkacllib/inc/hccl/base.h
index 8194097e..9facd20c 100644
--- a/third_party/fwkacllib/inc/hccl/base.h
+++ b/third_party/fwkacllib/inc/hccl/base.h
@@ -22,7 +22,8 @@
 
 #ifndef HCCL_BASE_H_
 #define HCCL_BASE_H_
-
+#include <hccl/hccl_types.h>
+#include <string>
 #ifdef __cplusplus
 extern "C" {
 #endif // __cplusplus
@@ -95,6 +96,33 @@ typedef void *rtStream_t;
 */
 typedef void *rtModel_t;
 
+struct HcomOperation {
+    std::string hcclType;
+    void *inputPtr;
+    void *outputPtr;
+    u64 count;
+    HcclDataType dataType;
+    HcclReduceOp opType;
+    u32 root;
+
+    HcomOperation()
+    {
+        inputPtr = nullptr;
+        outputPtr = nullptr;
+        count = 0;
+        dataType = HCCL_DATA_TYPE_RESERVED;
+        opType = HCCL_REDUCE_RESERVED;
+        root = 0;
+    }
+};
+
+struct HcomRemoteAccessAddrInfo {
+    u32 remotetRankID;
+    u64 remoteAddr;  // host embedding table address
+    u64 localAddr;  // device HBM address
+    u64 length;   // Memory Length in Bytes 
+};
+
 #ifdef __cplusplus
 }
 #endif // __cplusplus
diff --git a/third_party/fwkacllib/inc/hccl/hcom.h b/third_party/fwkacllib/inc/hccl/hcom.h
index de140b4b..b47887e5 100644
--- a/third_party/fwkacllib/inc/hccl/hcom.h
+++ b/third_party/fwkacllib/inc/hccl/hcom.h
@@ -24,6 +24,8 @@
 
 #include <hccl/base.h>
 #include <hccl/hccl_types.h>
+#include <functional>
+#include <vector>
 
 #ifdef __cplusplus
 extern "C" {
@@ -38,7 +40,7 @@ extern "C" {
  * @param rankSize A pointer identifying the rank number.
  * @return HcclResult 
  */
-HcclResult hcom_get_rank_size(const char *group, u32 *rankSize);
+HcclResult HcomGetRankSize(const char *group, u32 *rankSize);
 
 /**
  * @brief Get the rank number of this rank's server within the group.
@@ -47,7 +49,7 @@ HcclResult hcom_get_rank_size(const char *group, u32 *rankSize);
  * @param localRankSize A pointer identifying the rank number.
  * @return HcclResult 
  */
-HcclResult hcom_get_local_rank_size(const char *group, u32 *localRankSize);
+HcclResult HcomGetLocalRankSize(const char *group, u32 *localRankSize);
 
 /**
  * @brief Get the rank id of this rank.
@@ -56,7 +58,7 @@ HcclResult hcom_get_local_rank_size(const char *group, u32 *localRankSize);
  * @param rankId A pointer identifying the rank id.
  * @return HcclResult 
  */
-HcclResult hcom_get_rank_id(const char *group, u32 *rankId);
+HcclResult HcomGetRankId(const char *group, u32 *rankId);
 
 /**
  * @brief Get the local rank id of this rank's server within the group.
@@ -65,7 +67,7 @@ HcclResult hcom_get_rank_id(const char *group, u32 *rankId);
  * @param localRankId A pointer identifying the local rank id.
  * @return HcclResult 
  */
-HcclResult hcom_get_local_rank_id(const char *group, u32 *localRankId);
+HcclResult HcomGetLocalRankId(const char *group, u32 *localRankId);
 
 /**
  * @brief Get the world rank id according to the group rank id.
@@ -75,7 +77,7 @@ HcclResult hcom_get_local_rank_id(const char *group, u32 *localRankId);
  * @param worldRank A pointer identifying the world rank id.
  * @return HcclResult 
  */
-HcclResult hcom_get_world_rank_from_group_rank(const char *group, u32 groupRank, u32 *worldRank);
+HcclResult HcomGetWorldRankFromGroupRank(const char *group, u32 groupRank, u32 *worldRank);
 
 /**
  * @brief Get the group rank id according to the world rank id.
@@ -85,7 +87,7 @@ HcclResult hcom_get_world_rank_from_group_rank(const char *group, u32 groupRank,
  * @param groupRank A pointer identifying the group rank id.
  * @return HcclResult 
  */
-HcclResult hcom_get_group_rank_from_world_rank(u32 worldRank, const char *group, u32 *groupRank);
+HcclResult HcomGetGroupRankFromWorldRank(u32 worldRank, const char *group, u32 *groupRank);
 
 /**
  * @brief Create group.
@@ -95,7 +97,7 @@ HcclResult hcom_get_group_rank_from_world_rank(u32 worldRank, const char *group,
  * @param rankIds A list identifying the ranks in the group.
  * @return HcclResult 
  */
-HcclResult hcom_create_group(const char *group, u32 rankNum, u32 *rankIds);
+HcclResult HcomCreateGroup(const char *group, u32 rankNum, u32 *rankIds);
 
 /**
  * @brief Destroy group
@@ -103,27 +105,36 @@ HcclResult hcom_create_group(const char *group, u32 rankNum, u32 *rankIds);
  * @param group A string identifying the group name.
  * @return HcclResult 
  */
-HcclResult hcom_destroy_group(const char *group);
+HcclResult HcomDestroyGroup(const char *group);
 
 /**
  * @brief Set the gradient split strategy with in the group, according to gradient index.
  *
- * @param group A string identifying the group name.
- * @param segmentNum An integer(u32) identifying the segments number of gradients.
- * @param IdxList A list identifying the index of end gradient in each segment.
+ * @param void
  * @return HcclResult
  */
-extern HcclResult hcom_set_split_strategy_by_index(const char *group, u32 segmentNum, const u32 *IdxList);
+HcclResult HcomExecFinalize();
 
 /**
- * @brief Set the gradient split strategy with in the group, according to gradient data size.
+ * @brief Put collective communication operation into hcom executor.
  *
- * @param group A string identifying the group name.
- * @param segmentNum An integer(u32) identifying the segments number of gradients.
- * @param sizeList A list identifying the percent of each segment.
+ * @param opInfo information about collective communication operation.
+ * @param callback callback after collective communication operation.
+ * @return HcclResult
+ */
+HcclResult HcomExecEnqueueOperation(HcomOperation opInfo, std::function<void(HcclResult status)> callback);
+
+/**
+ * @brief Put remote access operation into hcom executor.
+ *
+ * @param remoteAccessType operation type (read or write).
+ * @param addrInfos address information about collective communication operation.
+ * @param callback callback after collective communication operation.
  * @return HcclResult
  */
-extern HcclResult hcom_set_split_strategy_by_size(const char *group, u32 segmentNum, const float *sizeList);
+HcclResult HcomExecEnqueueRemoteAccess(const std::string& remoteAccessType,
+                                       const std::vector<HcomRemoteAccessAddrInfo>& addrInfos,
+                                       std::function<void(HcclResult status)> callback);
 
 /**
  * @brief Register memories and init resources for remote access.
@@ -132,7 +143,7 @@ extern HcclResult hcom_set_split_strategy_by_size(const char *group, u32 segment
  * @param count number of remote memory addresses.
  * @return HcclResult
  */
-extern HcclResult hcom_remote_access_mem_register(const MemRegisterAddr* addrList, u32 count);
+extern HcclResult HcomRegRemoteAccessMem(const MemRegisterAddr* addrList, u32 count);
 
 #ifdef __cplusplus
 }
diff --git a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_linux.h b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_linux.h
index c74f95ac..005014ed 100644
--- a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_linux.h
+++ b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_linux.h
@@ -215,6 +215,10 @@ typedef struct {
 #define S_IWRITE S_IWUSR
 #endif
 
+#define mm_no_argument        no_argument
+#define mm_required_argument  required_argument
+#define mm_optional_argument  optional_argument
+
 #define M_FILE_RDONLY O_RDONLY
 #define M_FILE_WRONLY O_WRONLY
 #define M_FILE_RDWR O_RDWR
@@ -275,8 +279,9 @@ typedef struct {
 #define M_NAME_MAX MAX_FNAME
 
 #define M_F_OK F_OK
-#define M_R_OK R_OK
+#define M_X_OK X_OK
 #define M_W_OK W_OK
+#define M_R_OK R_OK
 
 #define MM_DT_DIR DT_DIR
 #define MM_DT_REG DT_REG
@@ -412,8 +417,12 @@ MMPA_FUNC_VISIBILITY VOID mmClosePipe(mmPipeHandle pipe[], UINT32 pipeCount);
 // Poll related interface
 MMPA_FUNC_VISIBILITY mmCompletionHandle mmCreateCompletionPort();
 MMPA_FUNC_VISIBILITY VOID mmCloseCompletionPort(mmCompletionHandle handle);
-MMPA_FUNC_VISIBILITY INT32 mmPoll(mmPollfd *fds, INT32 fdCount, INT32 timeout, mmCompletionHandle handleIOCP,
-                                    pmmPollData polledData, mmPollBack pollBack);
+MMPA_FUNC_VISIBILITY INT32 mmPoll(mmPollfd *fds,
+                                  INT32 fdCount,
+                                  INT32 timeout,
+                                  mmCompletionHandle handleIOCP,
+                                  pmmPollData polledData,
+                                  mmPollBack pollBack);
 MMPA_FUNC_VISIBILITY INT32 mmGetErrorCode();
 MMPA_FUNC_VISIBILITY CHAR *mmGetErrorFormatMessage(mmErrorMsg errnum, CHAR *buf, mmSize size);
 MMPA_FUNC_VISIBILITY INT32 mmGetTimeOfDay(mmTimeval *timeVal, mmTimezone *timeZone);
diff --git a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_win.h b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_win.h
index a5a22b4f..ecc86bf8 100644
--- a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_win.h
+++ b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_win.h
@@ -237,6 +237,11 @@ typedef struct {
 } mmThreadAttr;
 
 typedef VOID (*mmPf)(VOID);
+
+#define mm_no_argument        0
+#define mm_required_argument  1
+#define mm_optional_argument  2
+
 #define M_FILE_RDONLY GENERIC_READ
 #define M_FILE_WRONLY GENERIC_WRITE
 #define M_FILE_RDWR (GENERIC_READ | GENERIC_WRITE)
@@ -317,6 +322,7 @@ typedef VOID (*mmPf)(VOID);
 #define M_NAME_MAX  _MAX_FNAME
 
 #define M_F_OK 0
+#define M_X_OK 1
 #define M_W_OK 2
 #define M_R_OK 4
 
diff --git a/third_party/fwkacllib/inc/register/op_kernel_registry.h b/third_party/fwkacllib/inc/register/op_kernel_registry.h
new file mode 100644
index 00000000..5fed8960
--- /dev/null
+++ b/third_party/fwkacllib/inc/register/op_kernel_registry.h
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef INC_REGISTER_OP_KERNEL_REGISTRY_H_
+#define INC_REGISTER_OP_KERNEL_REGISTRY_H_
+#include <memory>
+#include <string>
+#include "register/register_types.h"
+#include "register.h"
+
+namespace ge {
+class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpKernelRegistry {
+ public:
+  using CreateFn = HostCpuOp* (*)();
+  ~OpKernelRegistry();
+
+  static OpKernelRegistry& GetInstance() {
+    static OpKernelRegistry instance;
+    return instance;
+  }
+
+  bool IsRegistered(const std::string &op_type);
+
+  void RegisterHostCpuOp(const std::string &op_type, CreateFn create_fn);
+
+  std::unique_ptr<HostCpuOp> CreateHostCpuOp(const std::string &op_type);
+
+ private:
+  OpKernelRegistry();
+  class OpKernelRegistryImpl;
+  /*lint -e148*/
+  std::unique_ptr<OpKernelRegistryImpl> impl_;
+};
+} // namespace ge
+
+#endif // INC_REGISTER_OP_KERNEL_REGISTRY_H_
diff --git a/third_party/fwkacllib/inc/register/op_registry.h b/third_party/fwkacllib/inc/register/op_registry.h
new file mode 100644
index 00000000..318eb3ba
--- /dev/null
+++ b/third_party/fwkacllib/inc/register/op_registry.h
@@ -0,0 +1,96 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef INC_REGISTER_OP_REGISTRY_H_
+#define INC_REGISTER_OP_REGISTRY_H_
+
+#include <limits.h>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "register/register.h"
+
+namespace domi {
+enum RemoveInputType {
+  OMG_MOVE_TYPE_DTYPE = 0,
+  OMG_MOVE_TYPE_VALUE,
+  OMG_MOVE_TYPE_SHAPE,
+  OMG_MOVE_TYPE_FORMAT,
+  OMG_MOVE_TYPE_AXIS,
+  OMG_MOVE_TYPE_SCALAR_VALUE,
+  OMG_REMOVE_TYPE_WITH_COND = 1000,
+  OMG_REMOVE_INPUT_WITH_ORIGINAL_TYPE,
+  OMG_INPUT_REORDER,
+};
+
+struct RemoveInputConfigure {
+  int inputIdx = INT_MAX;
+  std::string attrName;
+  RemoveInputType moveType;
+  bool attrValue = false;
+  std::string originalType;
+  std::vector<int> input_order;
+};
+
+class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpRegistry {
+ public:
+  static OpRegistry *Instance();
+
+  std::vector<OpRegistrationData> registrationDatas;
+
+  bool Register(const OpRegistrationData &reg_data);
+
+  domi::ImplyType GetImplyType(const std::string &op_type);
+
+  void GetOpTypeByImplyType(std::vector<std::string> &vec_op_type, const domi::ImplyType &imply_type);
+
+  domi::ParseParamFunc GetParseParamFunc(const std::string &op_type, const std::string &ori_type);
+
+  domi::ParseParamByOpFunc GetParseParamByOperatorFunc(const std::string &ori_type);
+
+  domi::FusionParseParamFunc GetFusionParseParamFunc(const std::string &op_type, const std::string &ori_type);
+
+  domi::FusionParseParamByOpFunc GetFusionParseParamByOpFunc(const std::string &op_type,
+                                                             const std::string &ori_type);
+
+  domi::ParseSubgraphFunc GetParseSubgraphPostFunc(const std::string &op_type);
+
+  Status GetParseSubgraphPostFunc(const std::string &op_type, domi::ParseSubgraphFuncV2 &parse_subgraph_func);
+
+  domi::ImplyType GetImplyTypeByOriOpType(const std::string &ori_optype);
+
+  const std::vector<RemoveInputConfigure> &GetRemoveInputConfigure(const std::string &ori_optype) const;
+
+  bool GetOmTypeByOriOpType(const std::string &ori_optype, std::string &om_type);
+
+  ParseOpToGraphFunc GetParseOpToGraphFunc(const std::string &op_type, const std::string &ori_type);
+
+ private:
+  std::unordered_map<std::string, domi::ImplyType> op_run_mode_map_;
+  std::unordered_map<std::string, ParseParamFunc> op_parse_params_fn_map_;
+  std::unordered_map<std::string, ParseParamByOpFunc> parse_params_by_op_func_map_;
+  std::unordered_map<std::string, FusionParseParamFunc> fusion_op_parse_params_fn_map_;
+  std::unordered_map<std::string, FusionParseParamByOpFunc> fusion_parse_params_by_op_fn_map_;
+  std::unordered_map<std::string, ParseSubgraphFunc> op_types_to_parse_subgraph_post_func_;
+  std::unordered_map<std::string, std::vector<RemoveInputConfigure>> remove_input_configure_map_;
+  std::unordered_map<std::string, std::string> origin_type_to_om_type_;
+  std::unordered_map<std::string, ParseOpToGraphFunc> parse_op_to_graph_fn_map_;
+  std::unordered_map<std::string, ParseSubgraphFuncV2> op_types_to_parse_subgraph_post_func_v2_;
+};
+}  // namespace domi
+#endif  // INC_REGISTER_OP_REGISTRY_H_
diff --git a/third_party/fwkacllib/inc/runtime/base.h b/third_party/fwkacllib/inc/runtime/base.h
index 85f16cc5..62a6dcd9 100644
--- a/third_party/fwkacllib/inc/runtime/base.h
+++ b/third_party/fwkacllib/inc/runtime/base.h
@@ -18,6 +18,7 @@
 #define __CCE_RUNTIME_BASE_H__
 
 #include <stdint.h>
+#include "toolchain/prof_callback.h"
 
 #if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 extern "C" {
@@ -80,10 +81,11 @@ typedef enum tagRtLimitType {
 } rtLimitType_t;
 
 typedef struct rtExceptionInfo {
-    uint32_t taskid;
-    uint32_t streamid;
-    uint32_t tid;
-    uint32_t deviceid;
+  uint32_t taskid;
+  uint32_t streamid;
+  uint32_t tid;
+  uint32_t deviceid;
+  uint32_t retcode;
 } rtExceptionInfo;
 
 typedef void (*rtErrorCallback)(rtExceptionType);
@@ -132,13 +134,13 @@ RTS_API rtError_t rtProfilerConfig(uint16_t type);
  * @ingroup profiling_base
  * @brief start rts profiler.
  */
-RTS_API rtError_t rtProfilerStart(uint64_t profConfig, int32_t numsDev, uint32_t* deviceList);
+RTS_API rtError_t rtProfilerStart(uint64_t profConfig, int32_t numsDev, uint32_t *deviceList);
 
 /**
  * @ingroup profiling_base
  * @brief stop rts profiler.
  */
-RTS_API rtError_t rtProfilerStop(uint64_t profConfig, int32_t numsDev, uint32_t* deviceList);
+RTS_API rtError_t rtProfilerStop(uint64_t profConfig, int32_t numsDev, uint32_t *deviceList);
 
 /**
  * @ingroup profiling_base
@@ -147,6 +149,12 @@ RTS_API rtError_t rtProfilerStop(uint64_t profConfig, int32_t numsDev, uint32_t*
 RTS_API rtError_t rtProfilerTrace(uint64_t id, bool notify, uint32_t flags, rtStream_t stream);
 
 /**
+ * @ingroup profiling_base
+ * @brief ts set profiling reporter callback.
+ */
+RTS_API rtError_t rtSetMsprofReporterCallback(MsprofReporterCallback callback);
+
+/**
  * @ingroup dvrt_base
  * @brief Returns the last error from a runtime call.
  */
@@ -186,6 +194,16 @@ RTS_API rtError_t rtRegDeviceStateCallback(const char *regName, rtDeviceStateCal
 
 /**
  * @ingroup dvrt_base
+ * @brief register callback for fail task 
+ * @param [in] uniName unique register name, can't be null
+ * @param [in] callback fail task callback function
+ * @param [out] NA
+ * @return RT_ERROR_NONE for ok
+ */
+RTS_API rtError_t rtRegTaskFailCallbackByModule(const char *moduleName, rtTaskFailCallback callback);
+
+/**
+ * @ingroup dvrt_base
  * @brief notify handle.
  */
 typedef void *rtNotify_t;
diff --git a/third_party/fwkacllib/inc/runtime/config.h b/third_party/fwkacllib/inc/runtime/config.h
index c471f128..c1316f13 100644
--- a/third_party/fwkacllib/inc/runtime/config.h
+++ b/third_party/fwkacllib/inc/runtime/config.h
@@ -42,6 +42,7 @@ typedef enum tagRtChipType {
   CHIP_MDC,
   CHIP_LHISI,
   CHIP_DC,
+  CHIP_CLOUD_V2,
   CHIP_END,
 } rtChipType_t;
 
@@ -62,6 +63,7 @@ typedef enum tagRtPlatformType {
   PLATFORM_LHISI_ES,
   PLATFORM_LHISI_CS,
   PLATFORM_DC,
+  PLATFORM_CLOUD_V2,
   PLATFORM_END,
 } rtPlatformType_t;
 
@@ -119,15 +121,9 @@ typedef struct tagRtMemoryConfig {
   uint32_t compilerSize;
 } rtMemoryConfig_t;
 
-typedef struct tagRtPlatformConfig { uint32_t platformConfig; } rtPlatformConfig_t;
-
-/**
- * @ingroup
- * @brief get platform
- * @param [in] platForm
- * @return platForm
- */
-RTS_API rtError_t rtGetPlatformConfig(rtPlatformConfig_t *platForm);
+typedef struct tagRtPlatformConfig {
+  uint32_t platformConfig;
+} rtPlatformConfig_t;
 
 /**
  * @ingroup
@@ -169,13 +165,6 @@ RTS_API rtError_t rtGetAiCoreMemoryRates(rtAiCoreMemoryRates_t *aiCoreMemoryRate
  */
 RTS_API rtError_t rtGetMemoryConfig(rtMemoryConfig_t *memoryConfig);
 
-/**
- * @ingroup
- * @brief set platform in gen ctx
- * @param [in] platForm
- * @return RT_ERROR_NONE for ok, errno for failed
- */
-RTS_API rtError_t rtSetPlatformType(rtPlatformType_t platformType);
 
 /**
  * @ingroup
diff --git a/third_party/fwkacllib/inc/runtime/context.h b/third_party/fwkacllib/inc/runtime/context.h
index 3346ff75..a42d380a 100644
--- a/third_party/fwkacllib/inc/runtime/context.h
+++ b/third_party/fwkacllib/inc/runtime/context.h
@@ -47,7 +47,7 @@ typedef struct tagRtGroupInfo {
   uint32_t aivectorNum;
   uint32_t sdmaNum;
   uint32_t activeStreamNum;
-  void*  extrPtr;
+  void *extrPtr;
 } rtGroupInfo_t;
 
 /**
diff --git a/third_party/fwkacllib/inc/runtime/dev.h b/third_party/fwkacllib/inc/runtime/dev.h
index c70a2372..ba407803 100644
--- a/third_party/fwkacllib/inc/runtime/dev.h
+++ b/third_party/fwkacllib/inc/runtime/dev.h
@@ -185,7 +185,7 @@ RTS_API rtError_t rtDisableP2P(uint32_t devIdDes, uint32_t phyIdSrc);
  * @return RT_ERROR_NONE for ok
  * @return RT_ERROR_INVALID_VALUE for error input
  */
-RTS_API rtError_t rtDeviceCanAccessPeer(int32_t* canAccessPeer, uint32_t device, uint32_t peerDevice);
+RTS_API rtError_t rtDeviceCanAccessPeer(int32_t *canAccessPeer, uint32_t device, uint32_t peerDevice);
 
 /**
  * @ingroup dvrt_dev
diff --git a/third_party/fwkacllib/inc/runtime/kernel.h b/third_party/fwkacllib/inc/runtime/kernel.h
index 98862ad4..43c06e67 100644
--- a/third_party/fwkacllib/inc/runtime/kernel.h
+++ b/third_party/fwkacllib/inc/runtime/kernel.h
@@ -387,7 +387,7 @@ typedef void *rtModel_t;
  * @return RT_ERROR_NONE for ok
  * @return RT_ERROR_INVALID_VALUE for error input
  */
- RTS_API rtError_t rtDumpAddrSet(rtModel_t model, void *addr, uint32_t dumpSize, uint32_t flag);
+RTS_API rtError_t rtDumpAddrSet(rtModel_t model, void *addr, uint32_t dumpSize, uint32_t flag);
 
 /**
  * @ingroup rt_kernel
diff --git a/third_party/fwkacllib/inc/runtime/mem.h b/third_party/fwkacllib/inc/runtime/mem.h
index f175cd45..d5b1b580 100644
--- a/third_party/fwkacllib/inc/runtime/mem.h
+++ b/third_party/fwkacllib/inc/runtime/mem.h
@@ -159,11 +159,11 @@ typedef struct rtAiCoreMemorySize {
  * @ingroup dvrt_mem
  * @brief memory type
  */
-typedef enum tagRtMemoryType { 
-    RT_MEMORY_TYPE_HOST = 1, 
-    RT_MEMORY_TYPE_DEVICE = 2 , 
-    RT_MEMORY_TYPE_SVM = 3,
-    RT_MEMORY_TYPE_DVPP = 4
+typedef enum tagRtMemoryType {
+  RT_MEMORY_TYPE_HOST = 1,
+  RT_MEMORY_TYPE_DEVICE = 2,
+  RT_MEMORY_TYPE_SVM = 3,
+  RT_MEMORY_TYPE_DVPP = 4
 } rtMemoryType_t;
 
 /**
@@ -179,23 +179,23 @@ typedef struct tagRtPointerAttributes {
 
 
 typedef struct rtMallocHostSharedMemoryIn {
-    const char* name;
-    const uint64_t size;
-    uint32_t flag;
+  const char *name;
+  const uint64_t size;
+  uint32_t flag;
 } rtMallocHostSharedMemoryIn;
 
 typedef struct rtMallocHostSharedMemoryOut {
-    int fd;
-    void* ptr;
-    void* devPtr;
+  int fd;
+  void *ptr;
+  void *devPtr;
 } rtMallocHostSharedMemoryOut;
 
 typedef struct rtFreeHostSharedMemoryIn {
-    const char* name;
-    const uint64_t size;
-    int fd;
-    void* ptr;
-    void* devPtr;
+  const char *name;
+  const uint64_t size;
+  int fd;
+  void *ptr;
+  void *devPtr;
 } rtFreeHostSharedMemoryIn;
 
 
@@ -267,7 +267,7 @@ RTS_API rtError_t rtFreeHost(void *hostPtr);
  */
 
 RTS_API rtError_t rtMallocHostSharedMemory(rtMallocHostSharedMemoryIn *in,
-    rtMallocHostSharedMemoryOut *out);
+                                           rtMallocHostSharedMemoryOut *out);
 
 /**
  * @ingroup dvrt_mem
diff --git a/third_party/fwkacllib/inc/runtime/stream.h b/third_party/fwkacllib/inc/runtime/stream.h
index 631c8083..b726fbd5 100644
--- a/third_party/fwkacllib/inc/runtime/stream.h
+++ b/third_party/fwkacllib/inc/runtime/stream.h
@@ -36,6 +36,7 @@ extern "C" {
 #define RT_STREAM_FORBIDDEN_DEFAULT (0x10)
 #define RT_STREAM_HEAD (0x20)
 #define RT_STREAM_PRIMARY_DEFAULT (0x40)
+#define RT_STREAM_PRIMARY_FIRST_DEFAULT (0x80)
 
 /**
  * @ingroup stream_type
diff --git a/third_party/fwkacllib/inc/soft_dp/ExternalSoftDp.h b/third_party/fwkacllib/inc/soft_dp/ExternalSoftDp.h
new file mode 100644
index 00000000..b642cbc8
--- /dev/null
+++ b/third_party/fwkacllib/inc/soft_dp/ExternalSoftDp.h
@@ -0,0 +1,52 @@
+/**
+* @file ExternalSoftDp.h
+*
+* Copyright (c) Huawei Technologies Co., Ltd. 2012-2018. All rights reserved.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+*/
+
+#ifndef EXTERNALSOFTDP_H
+#define EXTERNALSOFTDP_H
+
+#include <stdint.h>
+
+extern "C" {
+struct SoftDpProcsessInfo {
+  uint8_t* inputBuffer;
+  uint32_t inputBufferSize;
+
+  uint8_t* outputBuffer;
+  uint32_t outputBufferSize;
+
+  uint32_t outputWidth;
+  uint32_t outputHeight;
+
+  uint32_t reserved;
+};
+
+struct DpCropInfo {
+  uint32_t left;
+  uint32_t right;
+  uint32_t up;
+  uint32_t down;
+};
+
+/*
+ * @brief decode and resize interface
+ * @param [in] SoftDpProcsessInfo& softDpProcsessInfo : soft dp struct
+ * @return success: return 0, fail: return error number
+ */
+uint32_t DecodeAndResizeJpeg(SoftDpProcsessInfo& softDpProcsessInfo);
+
+/*
+ * @brief decode crop and resize interface
+ * @param [in] SoftDpProcsessInfo& softDpProcsessInfo : soft dp struct
+ * @param [in] const DpCropInfo& cropInfo: crop struct
+ * @return success: return 0, fail: return error number
+ */
+uint32_t DecodeAndCropAndResizeJpeg(SoftDpProcsessInfo& softDpProcsessInfo, const DpCropInfo& cropInfo);
+}
+#endif // EXTERNALSOFTDP_H
diff --git a/third_party/fwkacllib/inc/tdt/tsd_client.h b/third_party/fwkacllib/inc/tdt/tsd_client.h
index 6066a12e..665c8b82 100644
--- a/third_party/fwkacllib/inc/tdt/tsd_client.h
+++ b/third_party/fwkacllib/inc/tdt/tsd_client.h
@@ -23,6 +23,7 @@
 #include <mutex>
 #include "tdt/status.h"
 #include "tdt/data_common.h"
+#include "toolchain/prof_callback.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -37,7 +38,7 @@ extern "C" {
 * Used for the Framework process to communicate with the TSDDaemon process,
 * and notify TSD to complete the initialization of other processes
 *
-* @param phyDeviceId [IN] type #unsigned int. Physical device ID
+* @param logicDeviceId [IN] type #unsigned int. Logic device ID
 * @param rankSize [IN] type #unsigned int. The rankSize of the training.
 * The default value is 1. When rankSize is greater than 1,
 * HCCP will be pulled to perform set communication related operations.
@@ -49,7 +50,7 @@ extern "C" {
 * @li tsd_client.h: Header file where the interface declaration is located.
 * @li data_common.h: Header file where 'TDT_StatusT' defined
 */
-TDT_LIB_EXPORT TDT_StatusT TsdOpen(const uint32_t phyDeviceId, const uint32_t rankSize);
+TDT_LIB_EXPORT TDT_StatusT TsdOpen(const uint32_t logicDeviceId, const uint32_t rankSize);
 
 /**
 * @ingroup Close
@@ -67,7 +68,7 @@ TDT_LIB_EXPORT TDT_StatusT TsdOpen(const uint32_t phyDeviceId, const uint32_t ra
 * @li tsd_client.h: Header file where the interface declaration is located.
 * @li data_common.h: Header file where 'TDT_StatusT' defined
 */
-TDT_LIB_EXPORT TDT_StatusT TsdClose(const uint32_t phyDeviceId);
+TDT_LIB_EXPORT TDT_StatusT TsdClose(const uint32_t logicDeviceId);
 
 /**
 * @ingroup UpdateProfilingMode
@@ -85,7 +86,26 @@ TDT_LIB_EXPORT TDT_StatusT TsdClose(const uint32_t phyDeviceId);
 * @li tsd_client.h: Header file where the interface declaration is located.
 * @li data_common.h: Header file where 'TDT_StatusT' defined
 */
-TDT_LIB_EXPORT TDT_StatusT UpdateProfilingMode(const uint32_t phyDeviceId, const uint32_t flag);
+TDT_LIB_EXPORT TDT_StatusT UpdateProfilingMode(const uint32_t logicDeviceId, const uint32_t flag);
+
+/**
+* @ingroup TsdSetMsprofReporterCallback
+* @brief 用于推理场景下设置aicpu的profilng的callback函数
+*
+* @par Function
+* 设置offline模式下aicpu_sd进程的profiling的callback函数
+*
+* @param callback [IN] type #MsprofReporterCallback. 回调函数
+* @retval TDT_OK Success
+* @retval OtherValues Failure
+*
+* @par Dependency
+* @li libtsdclient.so: Library to which the interface belongs.
+* @li tsd_client.h: Header file where the interface declaration is located.
+* @li data_common.h: Header file where 'TDT_StatusT' defined
+* @li prof_callback.h: Headerfile where 'MsprofReporterCallback' defined
+*/
+TDT_LIB_EXPORT TDT_StatusT TsdSetMsprofReporterCallback(MsprofReporterCallback callback);
 
 /**
 * @ingroup CreateCmdParameterObj
diff --git a/third_party/fwkacllib/inc/toolchain/plog.h b/third_party/fwkacllib/inc/toolchain/plog.h
new file mode 100644
index 00000000..0d42e31d
--- /dev/null
+++ b/third_party/fwkacllib/inc/toolchain/plog.h
@@ -0,0 +1,59 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _PLOG_H_
+#define _PLOG_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#ifndef LINUX
+#define LINUX 0
+#endif // LINUX
+
+#ifndef WIN
+#define WIN 1
+#endif
+
+#ifndef OS_TYPE
+#define OS_TYPE 0
+#endif // OS_TYPE
+
+#if (OS_TYPE == LINUX)
+#define DLL_EXPORT __attribute__((visibility("default")))
+#else
+#define DLL_EXPORT _declspec(dllexport)
+#endif
+
+/**
+ * @ingroup plog
+ * @brief DlogReportInitialize: init log in service process before all device setting.
+ * @return: 0: SUCCEED, others: FAILED
+ */
+DLL_EXPORT int DlogReportInitialize();
+
+/**
+ * @ingroup plog
+ * @brief DlogReportFinalize: release log resource in service process after all device reset.
+ * @return: 0: SUCCEED, others: FAILED
+ */
+DLL_EXPORT int DlogReportFinalize();
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+#endif // D_PLOG_H_
diff --git a/third_party/fwkacllib/inc/toolchain/prof_callback.h b/third_party/fwkacllib/inc/toolchain/prof_callback.h
new file mode 100644
index 00000000..3fad74bc
--- /dev/null
+++ b/third_party/fwkacllib/inc/toolchain/prof_callback.h
@@ -0,0 +1,135 @@
+/**
+ * Copyright 2020-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * @file prof_callback.h
+ * @brief declaraion of profiling callbacks
+ */
+
+#ifndef MSPROFILER_PROF_CALLBACK_H_
+#define MSPROFILER_PROF_CALLBACK_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+
+#include "stddef.h"
+#include "stdint.h"
+
+/**
+ * @name  MsprofErrorCode
+ * @brief error code
+ */
+enum MsprofErrorCode {
+    MSPROF_ERROR_NONE = 0,
+    MSPROF_ERROR_MEM_NOT_ENOUGH,
+    MSPROF_ERROR_GET_ENV,
+    MSPROF_ERROR_CONFIG_INVALID,
+    MSPROF_ERROR_ACL_JSON_OFF,
+    MSPROF_ERROR,
+};
+
+#define MSPROF_ENGINE_MAX_TAG_LEN (31)
+
+/**
+ * @name  ReporterData
+ * @brief struct of data to report
+ */
+struct ReporterData {
+    char tag[MSPROF_ENGINE_MAX_TAG_LEN + 1];  // the sub-type of the module, data with different tag will be writen
+    int deviceId;                             // the index of device
+    size_t dataLen;                           // the length of send data
+    unsigned char *data;                      // the data content
+};
+
+/**
+ * @name  MsprofReporterModuleId
+ * @brief module id of data to report
+ */
+enum MsprofReporterModuleId {
+    MSPROF_MODULE_DATA_PREPROCESS = 0,    // DATA_PREPROCESS
+    MSPROF_MODULE_HCCL,                   // HCCL
+    MSPROF_MODULE_ACL,                    // AclModule
+    MSPROF_MODULE_FRAMEWORK,              // Framework
+    MSPROF_MODULE_RUNTIME                 // runtime
+};
+
+/**
+ * @name  MsprofReporterCallbackType
+ * @brief reporter callback request type
+ */
+enum MsprofReporterCallbackType {
+    MSPROF_REPORTER_REPORT = 0,           // report data
+    MSPROF_REPORTER_INIT,                 // init reporter
+    MSPROF_REPORTER_UNINIT,               // uninit reporter
+};
+
+/**
+ * @name  MsprofReporterCallback
+ * @brief callback to start reporter/stop reporter/report date
+ * @param moduleId  [IN] enum MsprofReporterModuleId
+ * @param type      [IN] enum MsprofReporterCallbackType
+ * @param data      [IN] callback data (nullptr on INTI/UNINIT)
+ * @param len       [IN] callback data size (0 on INIT/UNINIT)
+ * @return enum MsprofErrorCode
+ */
+typedef int32_t (*MsprofReporterCallback)(uint32_t moduleId, uint32_t type, void *data, uint32_t len);
+
+
+#define MSPROF_OPTIONS_DEF_LEN_MAX (2048)
+
+/**
+ * @name  MsprofGeOptions
+ * @brief struct of MSPROF_CTRL_INIT_GE_OPTIONS
+ */
+struct MsprofGeOptions {
+    char jobId[MSPROF_OPTIONS_DEF_LEN_MAX];
+    char options[MSPROF_OPTIONS_DEF_LEN_MAX];
+};
+
+/**
+ * @name  MsprofCtrlCallbackType
+ * @brief ctrl callback request type
+ */
+enum MsprofCtrlCallbackType {
+    MSPROF_CTRL_INIT_ACL_ENV = 0,           // start profiling with acl env
+    MSPROF_CTRL_INIT_ACL_JSON,              // start profiling with acl.json
+    MSPROF_CTRL_INIT_GE_OPTIONS,            // start profiling with ge env and options
+    MSPROF_CTRL_FINALIZE                    // stop profiling
+};
+
+/**
+ * @name  MsprofCtrlCallback
+ * @brief callback to start/stop profiling
+ * @param type      [IN] enum MsprofCtrlCallbackType
+ * @param data      [IN] callback data
+ * @param len       [IN] callback data size
+ * @return enum MsprofErrorCode
+ */
+typedef int32_t (*MsprofCtrlCallback)(uint32_t type, void *data, uint32_t len);
+
+/**
+ * @name  MsprofSetDeviceCallback
+ * @brief callback to notify set/reset device
+ * @param devId     [IN] device id
+ * @param isOpenDevice  [IN] true: set device, false: reset device
+ */
+typedef void (*MsprofSetDeviceCallback)(uint32_t devId, bool isOpenDevice);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MSPROFILER_PROF_CALLBACK_H_
diff --git a/third_party/fwkacllib/inc/toolchain/prof_reporter.h b/third_party/fwkacllib/inc/toolchain/prof_reporter.h
index 949011d3..3ae5f8ef 100644
--- a/third_party/fwkacllib/inc/toolchain/prof_reporter.h
+++ b/third_party/fwkacllib/inc/toolchain/prof_reporter.h
@@ -33,20 +33,6 @@
  */
 namespace Msprof {
 namespace Engine {
-/// the max tag length
-#define MSPROF_ENGINE_MAX_TAG_LEN (31)
-/**
- * @ingroup reporter
- * @brief struct ReporterData
- * the sturct of the data send to libmsprof
- */
-struct ReporterData {
-  char tag[MSPROF_ENGINE_MAX_TAG_LEN + 1];  ///< the sub-type of the module, data with different tag will be writen
-  int deviceId;                             ///< the physical id of device
-  size_t dataLen;                           ///< the length of send data
-  unsigned char *data;                      ///< the data content
-};
-
 /**
  * @ingroup reporter
  * @brief class Reporter
diff --git a/third_party/fwkacllib/inc/toolchain/slog.h b/third_party/fwkacllib/inc/toolchain/slog.h
index 5faca0ae..7c4f7be2 100644
--- a/third_party/fwkacllib/inc/toolchain/slog.h
+++ b/third_party/fwkacllib/inc/toolchain/slog.h
@@ -394,4 +394,117 @@ void DlogWithKVInner(int moduleId, int level, KeyValue *pstKVArray, int kvNum, c
 }
 #endif // LOG_CPP
 #endif // __cplusplus
+
+#ifdef LOG_CPP
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+ * @ingroup slog
+ * @brief DlogGetlevelForC: get module loglevel and enableEvent
+ *
+ * @param [in]moduleId: moudule id(see slog.h, eg: CCE), others: invalid
+ * @param [out]enableEvent: 1: enable; 0: disable
+ * @return: module level(0: debug, 1: info, 2: warning, 3: error, 4: null output)
+ */
+DLL_EXPORT int DlogGetlevelForC(int moduleId, int *enableEvent);
+
+/**
+ * @ingroup slog
+ * @brief DlogSetlevelForC: set module loglevel and enableEvent
+ *
+ * @param [in]moduleId: moudule id(see slog.h, eg: CCE), -1: all modules, others: invalid
+ * @param [in]level: log level(0: debug, 1: info, 2: warning, 3: error, 4: null output)
+ * @param [in]enableEvent: 1: enable; 0: disable, others:invalid
+ * @return: 0: SUCCEED, others: FAILED
+ */
+DLL_EXPORT int DlogSetlevelForC(int moduleId, int level, int enableEvent);
+
+/**
+ * @ingroup slog
+ * @brief CheckLogLevelForC: check module level enable or not
+ * users no need to call it because all dlog interface(include inner interface) has already called
+ *
+ * @param [in]moduleId: module id, eg: CCE
+ * @param [in]logLevel: eg: DLOG_EVENT/DLOG_ERROR/DLOG_WARN/DLOG_INFO/DLOG_DEBUG
+ * @return: 1:enable, 0:disable
+ */
+DLL_EXPORT int CheckLogLevelForC(int moduleId, int logLevel);
+
+/**
+ * @ingroup slog
+ * @brief DlogSetAttrForC: set log attr, default pid is 0, default device id is 0, default process type is APPLICATION
+ * @param [in]logAttr: attr info, include pid(must be larger than 0), process type and device id(chip ID)
+ * @return: 0: SUCCEED, others: FAILED
+ */
+DLL_EXPORT int DlogSetAttrForC(LogAttr logAttr);
+
+/**
+ * @ingroup slog
+ * @brief DlogForC: print log, need caller to specify level
+ * call CheckLogLevelForC in advance to optimize performance, call interface with fmt input take time
+ *
+ * @param [in]moduleId: module id, eg: CCE
+ * @param [in]level(0: debug, 1: info, 2: warning, 3: error, 5: trace, 6: oplog, 16: event)
+ * @param [in]fmt: log content
+ */
+#define DlogForC(moduleId, level, fmt, ...)                                                 \
+  do {                                                                                  \
+    if(CheckLogLevelForC(moduleId, level) == 1) {                                           \
+        DlogInnerForC(moduleId, level, "[%s:%d]" fmt, __FILE__, __LINE__, ##__VA_ARGS__);   \
+     }                                                                                  \
+  } while (0)
+
+/**
+ * @ingroup slog
+ * @brief DlogSubForC: print log, need caller to specify level and submodule
+ * call CheckLogLevelForC in advance to optimize performance, call interface with fmt input take time
+ *
+ * @param [in]moduleId: module id, eg: CCE
+ * @param [in]submodule: eg: engine
+ * @param [in]level(0: debug, 1: info, 2: warning, 3: error, 5: trace, 6: oplog, 16: event)
+ * @param [in]fmt: log content
+ */
+#define DlogSubForC(moduleId, submodule, level, fmt, ...)                                                   \
+  do {                                                                                                  \
+    if(CheckLogLevelForC(moduleId, level) == 1) {                                                           \
+        DlogInnerForC(moduleId, level, "[%s:%d][%s]" fmt, __FILE__, __LINE__, submodule, ##__VA_ARGS__);    \
+    }                                                                                                   \
+  } while (0)
+
+/**
+ * @ingroup slog
+ * @brief DlogWithKVForC: print log, need caller to specify level and other paramters
+ * call CheckLogLevelForC in advance to optimize performance, call interface with fmt input take time
+ *
+ * @param [in]moduleId: module id, eg: CCE
+ * @param [in]level(0: debug, 1: info, 2: warning, 3: error, 5: trace, 6: oplog, 16: event)
+ * @param [in]pstKVArray: key-value array
+ * @param [in]kvNum: key-value element num in array
+ * @param [in]fmt: log content
+ */
+#define DlogWithKVForC(moduleId, level, pstKVArray, kvNum, fmt, ...)                                                \
+  do {                                                                                                          \
+    if(CheckLogLevelForC(moduleId, level) == 1) {                                                                   \
+        DlogWithKVInnerForC(moduleId, level, pstKVArray, kvNum, "[%s:%d]" fmt, __FILE__, __LINE__, ##__VA_ARGS__);  \
+    }                                                                                                           \
+  } while (0)
+
+/**
+ * @ingroup slog
+ * @brief DlogFlushForC: flush log buffer to file
+ */
+DLL_EXPORT void DlogFlushForC(void);
+
+/**
+ * @ingroup slog
+ * @brief Internal log interface, other modules are not allowed to call this interface
+ */
+void DlogInnerForC(int moduleId, int level, const char *fmt, ...);
+void DlogWithKVInnerForC(int moduleId, int level, KeyValue *pstKVArray, int kvNum, const char *fmt, ...);
+
+#ifdef __cplusplus
+}
+#endif
+#endif // LOG_CPP
 #endif // D_SYSLOG_H_
diff --git a/third_party/prebuild/aarch64/libalog.so b/third_party/prebuild/aarch64/libalog.so
new file mode 100755
index 00000000..e041ad7e
Binary files /dev/null and b/third_party/prebuild/aarch64/libalog.so differ
diff --git a/third_party/prebuild/aarch64/libslog.so b/third_party/prebuild/aarch64/libslog.so
deleted file mode 100755
index 700fc118..00000000
Binary files a/third_party/prebuild/aarch64/libslog.so and /dev/null differ
diff --git a/third_party/prebuild/x86_64/libalog.so b/third_party/prebuild/x86_64/libalog.so
new file mode 100755
index 00000000..051f85d9
Binary files /dev/null and b/third_party/prebuild/x86_64/libalog.so differ
diff --git a/third_party/prebuild/x86_64/libslog.so b/third_party/prebuild/x86_64/libslog.so
deleted file mode 100755
index 01b75e40..00000000
Binary files a/third_party/prebuild/x86_64/libslog.so and /dev/null differ