From: @zhangxiaokun9
Reviewed-by: @xchu42
Tag: v1.5.1
@@ -14,14 +14,14 @@
  * limitations under the License.
  */
 #include "ge_local_engine/engine/host_cpu_engine.h"
-#include "graph/common/omg_util.h"
 #include "graph/utils/op_desc_utils.h"
 #include "graph/utils/tensor_adapter.h"
+#include "graph/utils/node_utils.h"
+#include "graph/utils/type_utils.h"
 #include "register/op_kernel_registry.h"
 #include "register/host_cpu_context.h"
 #include "common/ge/ge_util.h"
 #include "common/ge/plugin_manager.h"
-#include "graph/utils/type_utils.h"
 #include "common/fp16_t.h"
 #include "common/math/math_util.h"
@@ -123,10 +123,7 @@ bool HostCpuEngine::CheckSupported(const string &op_type) {
 }
 
 Status HostCpuEngine::FindOpKernel(const ge::NodePtr &node, std::unique_ptr<HostCpuOp> &op_kernel) {
-  std::string op_type;
-  auto status = GetOriginalType(node, op_type);
-  GE_CHK_BOOL_EXEC_NOLOG(status == SUCCESS, return status);
+  const std::string op_type = NodeUtils::GetNodeType(node);
   auto kernel = OpKernelRegistry::GetInstance().CreateHostCpuOp(op_type);
   if (kernel == nullptr) {
     GELOGD("Op of type %s is not supported by host cpu engine", op_type.c_str());
@@ -1378,7 +1378,9 @@ Status ModelManager::LoadCustAicpuSo(const OpDescPtr &op_desc, const string &so_
 Status ModelManager::LaunchKernelCustAicpuSo(const string &kernel_name) {
   GELOGD("Aicpu kernel launch task in, kernel name %s.", kernel_name.c_str());
   std::lock_guard<std::mutex> lock(cust_aicpu_mutex_);
-  if (cust_aicpu_so_.size() == 0) return SUCCESS;
+  if (cust_aicpu_so_.empty()) {
+    return SUCCESS;
+  }
   // get current context
   rtContext_t rt_cur_ctx = nullptr;
   auto rt_error = rtCtxGetCurrent(&rt_cur_ctx);
@@ -1394,9 +1396,19 @@ Status ModelManager::LaunchKernelCustAicpuSo(const string &kernel_name) {
     return SUCCESS;
   }
 
+  rtStream_t stream = nullptr;
   vector<void *> allocated_mem;
+  std::function<void()> callback = [&]() {
+    for (auto mem : allocated_mem) {
+      GE_CHK_RT(rtFree(mem));
+    }
+    if (stream != nullptr) {
+      GE_CHK_RT(rtStreamDestroy(stream));
+    }
+  };
+  GE_MAKE_GUARD(release, callback);
+
   rtError_t status;
-  rtStream_t stream = nullptr;
   vector<CustAicpuSoBuf> v_cust_so;
   void *args = nullptr;
@@ -1471,13 +1483,6 @@ Status ModelManager::LaunchKernelCustAicpuSo(const string &kernel_name) {
     GELOGE(RT_FAILED, "[Call][RtStreamSynchronize] fail, ret = 0x%X", status);
     return RT_ERROR_TO_GE_STATUS(status);
   }
-  std::function<void()> callback = [&]() {
-    for (auto mem : allocated_mem) {
-      GE_CHK_RT(rtFree(mem));
-    }
-    GE_CHK_RT(rtStreamDestroy(stream));
-  };
-  GE_MAKE_GUARD(release, callback);
   GELOGI("Cpu kernel launch task success.");
   return SUCCESS;
 }
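These two hunks are one fix: the cleanup guard used to be installed only after the stream synchronize, so any early return on a failed runtime call leaked allocated_mem and the stream. Declaring the guard before the first allocation (with a null check on stream, which may not exist yet) makes cleanup run on every exit path. A self-contained sketch of the idiom, with ScopeGuard as an illustrative stand-in for the repo's GE_MAKE_GUARD macro:

```cpp
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <utility>

class ScopeGuard {
 public:
  explicit ScopeGuard(std::function<void()> on_exit) : on_exit_(std::move(on_exit)) {}
  ~ScopeGuard() { if (on_exit_) on_exit_(); }  // runs on every exit path
  ScopeGuard(const ScopeGuard &) = delete;
  ScopeGuard &operator=(const ScopeGuard &) = delete;
 private:
  std::function<void()> on_exit_;
};

int AllocateAndMaybeFail(bool fail_early) {
  void *mem = nullptr;
  // Guard declared before the first allocation, mirroring the patched code:
  ScopeGuard release([&]() {
    if (mem != nullptr) std::free(mem);
    std::puts("cleanup ran");
  });
  mem = std::malloc(256);
  if (fail_early) return -1;  // early return: guard still frees mem
  return 0;
}

int main() { return AllocateAndMaybeFail(true) == -1 ? 0 : 1; }
```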
@@ -284,9 +284,6 @@ Status DynamicShapePartitioner::InitClusters() {
     auto cluster = MakeShared<Cluster>(rank++, type, node, this);
     REQUIRE_NOT_NULL(cluster, "[New][Memory] for cluster failed.");
     node_2_cluster_[node] = cluster;
-    if (cluster->IsUnknownShape()) {
-      ordered_cluster_.push_back(cluster);
-    }
     int64_t group_index = -1;
     if (AttrUtils::GetInt(node->GetOpDesc(), ATTR_NAME_CONTROL_FLOW_GROUP, group_index)) {
@@ -306,7 +303,7 @@ Status DynamicShapePartitioner::InitClusters() {
   return SUCCESS;
 }
 
-Status DynamicShapePartitioner::TopologicalSortClusters() {
+Status DynamicShapePartitioner::TopologicalSortClusters(const OrderedFilter &ordered_filter) {
   ordered_cluster_.clear();
   // BFS topological sort clusters for known shape cluster
   std::queue<ClusterPtr> ready_clusters;
@@ -331,7 +328,7 @@ Status DynamicShapePartitioner::TopologicalSortClusters() {
     auto cluster = ready_clusters.front();
     ready_clusters.pop();
     cluster->UpdateRank(rank++);
-    if (cluster->IsKnownShape() || cluster->IsInputNode()) {
+    if (ordered_filter == nullptr || ordered_filter(cluster)) {
      ordered_cluster_.push_back(cluster);
     }
     for (const auto &out_cluster : cluster->Outputs()) {
@@ -378,7 +375,6 @@ void DynamicShapePartitioner::MergeClustersControlFlow() {
       continue;
     }
-    bool is_unknown_cluster = cluster->IsUnknownShape();
     for (++rit; rit != control_cluster.rend(); ++rit) {
       const auto &cluster_from = *rit;
       if (all_merged_clusters.count(cluster_from) > 0) {
@@ -395,11 +391,6 @@ void DynamicShapePartitioner::MergeClustersControlFlow() {
         }
       }
     }
-    if (!is_unknown_cluster && cluster->IsUnknownShape()) {
-      GELOGD("Add to ordered cluster: %s", cluster->DebugString().c_str());
-      ordered_cluster_.push_back(cluster);
-    }
   }
 }
@@ -475,9 +466,19 @@ void DynamicShapePartitioner::MergeClustersInputData() {
 }
 
 Status DynamicShapePartitioner::MergeClusters() {
+  const auto filter_known = [](const ClusterPtr &cluster) {
+    return cluster->IsKnownShape() || cluster->IsInputNode();
+  };
+  const auto filter_unknown = [](const ClusterPtr &cluster) {
+    return cluster->IsUnknownShape();
+  };
+
   MergeClustersControlFlow();
+  REQUIRE_SUCCESS(TopologicalSortClusters(filter_unknown),
+                  "[TopologicalSort][Clusters] after merge control flow clusters failed.");
   MergeClustersUnknownShape();
-  REQUIRE_SUCCESS(TopologicalSortClusters(), "[TopologicalSort][Clusters] after merge unknown shape clusters failed.");
+  REQUIRE_SUCCESS(TopologicalSortClusters(filter_known),
+                  "[TopologicalSort][Clusters] after merge unknown shape clusters failed.");
   MergeClustersKnownShape();
   MergeClustersInputData();
   return SUCCESS;
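Together with the removed push_backs in InitClusters and MergeClustersControlFlow above, this changes how ordered_cluster_ is maintained: instead of incremental bookkeeping plus a hard-coded known-shape test inside TopologicalSortClusters, each pass now rebuilds the order with an injected OrderedFilter, and a null filter keeps every cluster. A self-contained sketch of the filter-injection pattern (mock types, not the GE Cluster classes):

```cpp
#include <functional>
#include <iostream>
#include <vector>

struct Cluster { bool known_shape; };
using ClusterPtr = Cluster *;
using OrderedFilter = std::function<bool(const ClusterPtr &)>;

std::vector<ClusterPtr> SortAndCollect(const std::vector<ClusterPtr> &clusters,
                                       const OrderedFilter &filter) {
  std::vector<ClusterPtr> ordered;
  for (const auto &c : clusters) {         // stand-in for the BFS topological walk
    if (filter == nullptr || filter(c)) {  // null filter keeps the old keep-everything behavior
      ordered.push_back(c);
    }
  }
  return ordered;
}

int main() {
  Cluster a{true}, b{false};
  std::vector<ClusterPtr> all{&a, &b};
  auto known_only = SortAndCollect(all, [](const ClusterPtr &c) { return c->known_shape; });
  std::cout << known_only.size() << "\n";  // prints 1
}
```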
@@ -111,6 +111,8 @@ class DynamicShapePartitioner {
   Status Partition();
 
+  using OrderedFilter = std::function<bool(const std::shared_ptr<Cluster> &cluster)>;
+
  private:
   Status PartitionImpl();
   // Collect nodes that satisfy the unknowshape rules:
@@ -138,7 +140,7 @@ class DynamicShapePartitioner {
   // Merge clusters step3
   void MergeClustersInputData();
   // Topological sort clusters after merge unknown shape clusters.
-  Status TopologicalSortClusters();
+  Status TopologicalSortClusters(const OrderedFilter &ordered_filter);
   // Deduplicate merged clusters
   void PruneUniqueClusters();
   // Establish the input-output anchors for each partition of the cluster and record links to other clusters
@@ -265,10 +265,6 @@ Status HybridModelBuilder::GetOrCreateNodeItem(const NodePtr &node, NodeItem **n
     return SUCCESS;
   }
 
-  if (node->GetType() == MEMCPYASYNC) {  // Convert MemcpyAsync to Identity.
-    node->GetOpDesc()->SetType(IDENTITY);
-  }
-
   std::unique_ptr<NodeItem> new_node;
   GE_CHK_STATUS_RET(NodeItem::Create(node, new_node), "[Invoke][Create] failed, model_name_:[%s]", GetGraphName());
   GE_CHK_STATUS_RET_NOLOG(NodeExecutorManager::GetInstance().GetExecutor(*node, &new_node->node_executor));
@@ -15,9 +15,7 @@
  */
 
 #include "hybrid/model/node_item.h"
 #include <sstream>
-#include "framework/common/debug/log.h"
-#include "graph/common/omg_util.h"
 #include "graph/compute_graph.h"
 #include "graph/debug/ge_attr_define.h"
 #include "hybrid/executor/worker/shape_inference_engine.h"
@@ -98,8 +96,7 @@ Status ParseFusedSubgraph(NodeItem &node_item) {
     GE_CHECK_NOTNULL(node);
     auto op_desc = node->GetOpDesc();
     GE_CHECK_NOTNULL(op_desc);
-    std::string node_type;
-    GE_CHK_STATUS_RET(GetOriginalType(node, node_type));
+    const std::string node_type = NodeUtils::GetNodeType(node);
     if (node_type == DATA) {
       GE_CHK_GRAPH_STATUS_RET(ParseInputMapping(*node, *op_desc, *fused_subgraph));
     } else if (node_type == kNodeTypeRetVal) {
@@ -409,8 +406,8 @@ void NodeItem::SetDataSend(NodeItem *node_item, int anchor_index) {
 
 void NodeItem::SetCtrlSend(NodeItem *node_item, uint32_t switch_index) {
   if (switch_index < switch_groups_.size()) {
-    std::vector<const NodeItem *> &switch_group = switch_groups_[switch_index];
-    switch_group.emplace_back(node_item);
+    auto &switch_group = switch_groups_[switch_index];
+    switch_group.emplace(node_item);
   } else {
     ctrl_send_.insert(node_item);
   }
@@ -433,8 +430,8 @@ void NodeItem::SetMergeCtrl(NodeItem *node_item, uint32_t merge_index) {
   }
 
   // this is StreamMerge node, node_item is StreamActive node.
-  std::vector<const NodeItem *> &switch_group = switch_groups_[merge_index];
-  switch_group.emplace_back(node_item);
+  auto &switch_group = switch_groups_[merge_index];
+  switch_group.emplace(node_item);
 
   node_item->ctrl_send_.emplace(this);
   GELOGI("Node[%s] will control node[%s]", node_item->NodeName().c_str(), NodeName().c_str());
@@ -155,7 +155,7 @@ struct NodeItem {
   std::map<const NodeItem *, int> data_recv_;  // Recv data notify from
   std::set<const NodeItem *> ctrl_send_;       // Send ctrl notify to
   std::set<const NodeItem *> ctrl_recv_;       // Recv ctrl notify from
-  std::vector<std::vector<const NodeItem *>> switch_groups_;  // Send ctrl notify to
+  std::vector<std::set<const NodeItem *>> switch_groups_;  // Send ctrl notify to
 
   std::shared_ptr<NodeTask> kernel_task;
   std::unique_ptr<FusedSubgraph> fused_subgraph;
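The switch from std::vector to std::set (and emplace_back to emplace in the two hunks above) makes repeated registration of the same controller idempotent: SetCtrlSend/SetMergeCtrl can be reached more than once for the same (group, node) pair, and a vector would record the duplicate. A minimal illustration:

```cpp
#include <cassert>
#include <set>
#include <vector>

struct NodeItem {};

int main() {
  NodeItem actor;
  std::vector<std::set<const NodeItem *>> switch_groups(1);
  switch_groups[0].emplace(&actor);
  switch_groups[0].emplace(&actor);  // duplicate insert is a no-op
  assert(switch_groups[0].size() == 1);
  return 0;
}
```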
@@ -342,6 +342,7 @@ Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function<void()> do
       GE_CHK_RT_RET(rtEventDestroy(evt));
     }
     GELOGI("rdma callback success.");
+    return SUCCESS;
   };
 
   HcclResult hccl_ret = HcomExecEnqueueRemoteAccess(context.GetNodeItem().NodeType(), addr_infos, callback);
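The added `return SUCCESS;` is more than style. Assuming GE_CHK_RT_RET expands to an early `return` on error (consistent with its use elsewhere in this diff), the lambda's deduced return type is non-void, and a non-void lambda that can flow off its end without returning is undefined behavior. A minimal stand-alone illustration with Status/SUCCESS as stand-ins for the GE types:

```cpp
#include <functional>

using Status = int;            // stand-in for ge::Status
constexpr Status SUCCESS = 0;  // stand-in for ge::SUCCESS

Status DoWork(bool fail_early) {
  std::function<Status()> callback = [&]() -> Status {
    if (fail_early) {
      return 1;      // mirrors GE_CHK_RT_RET's early return on error
    }
    // ... success-path work ...
    return SUCCESS;  // the added line: without it, falling off the end is UB
  };
  return callback();
}

int main() { return DoWork(false); }
```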
@@ -17,13 +17,9 @@
  */
 
 #include "hybrid/node_executor/rts/rts_node_executor.h"
 #include "hybrid/node_executor/rts/rts_task_factory.h"
 #include "framework/common/debug/log.h"
-#include "common/ge/ge_util.h"
-#include "framework/common/types.h"
-#include "graph/common/omg_util.h"
 #include "graph/utils/tensor_utils.h"
 #include "hybrid/model/hybrid_model.h"
-#include "runtime/rt.h"
 
 namespace ge {
 namespace hybrid {
@@ -33,6 +29,7 @@ REGISTER_RTS_TASK_CREATOR(IDENTITY, IdentityNodeTask);
 REGISTER_RTS_TASK_CREATOR(IDENTITYN, IdentityNNodeTask);
 REGISTER_RTS_TASK_CREATOR(READVARIABLEOP, ReadVariableOpNodeTask);
 REGISTER_RTS_TASK_CREATOR(PROFILINGTRAININGTRACE, ProfilingTraceNodeTask);
+REGISTER_RTS_TASK_CREATOR(MEMCPYASYNC, IdentityNodeTask);
 
 Status IdentityNodeTask::DoCopyTensor(TaskContext &context, int index) {
   auto input_desc = context.MutableInputDesc(index);
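Taken together with the removed MemcpyAsync-to-Identity rewrite in HybridModelBuilder::GetOrCreateNodeItem and the deleted MemcpyAsyncNodeTask below, this one-line registration is the whole replacement: MEMCPYASYNC now resolves to IdentityNodeTask directly in the task factory instead of mutating the node type in the graph. A self-contained sketch of the creator-registration pattern (illustrative types; the real macro is REGISTER_RTS_TASK_CREATOR):

```cpp
#include <functional>
#include <map>
#include <memory>
#include <string>

struct RtsNodeTask { virtual ~RtsNodeTask() = default; };
struct IdentityTask : RtsNodeTask {};

class TaskFactory {
 public:
  using Creator = std::function<std::shared_ptr<RtsNodeTask>()>;
  static TaskFactory &Instance() { static TaskFactory f; return f; }
  void Register(const std::string &type, Creator c) { creators_[type] = std::move(c); }
  std::shared_ptr<RtsNodeTask> Create(const std::string &type) {
    auto it = creators_.find(type);
    return it == creators_.end() ? nullptr : it->second();
  }
 private:
  std::map<std::string, Creator> creators_;
};

int main() {
  auto make_identity = []() { return std::make_shared<IdentityTask>(); };
  TaskFactory::Instance().Register("Identity", make_identity);
  TaskFactory::Instance().Register("MemcpyAsync", make_identity);  // two types, one task
  return TaskFactory::Instance().Create("MemcpyAsync") != nullptr ? 0 : 1;
}
```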
@@ -133,8 +130,7 @@ Status ProfilingTraceNodeTask::ExecuteAsync(TaskContext &context, std::function<
 Status RtsNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node, shared_ptr<NodeTask> &task) const {
   GE_CHECK_NOTNULL(node);
   GELOGD("[%s] Load for local task.", node->GetName().c_str());
-  std::string node_type;
-  GE_CHK_STATUS_RET(GetOriginalType(node, node_type), "Get original type failed.");
+  const std::string node_type = NodeUtils::GetNodeType(node);
   RtsNodeTaskPtr rts_task = RtsTaskFactory::GetInstance().Create(node_type);
   if (rts_task == nullptr) {
     GELOGE(UNSUPPORTED, "[%s] Unsupported RTS op type: %s", node->GetName().c_str(), node_type.c_str());
@@ -43,7 +43,6 @@ namespace hybrid {
 REGISTER_RTS_TASK_CREATOR(STREAMACTIVE, StreamActiveNodeTask);
 REGISTER_RTS_TASK_CREATOR(STREAMSWITCH, StreamSwitchNodeTask);
 REGISTER_RTS_TASK_CREATOR(STREAMMERGE, StreamMergeNodeTask);
-REGISTER_RTS_TASK_CREATOR(MEMCPYASYNC, MemcpyAsyncNodeTask);
 REGISTER_RTS_TASK_CREATOR(ENTER, PassThroughNodeTask);
 REGISTER_RTS_TASK_CREATOR(REFENTER, PassThroughNodeTask);
@@ -168,34 +167,6 @@ Status StreamMergeNodeTask::ExecuteAsync(TaskContext &task_context, std::functio
   return SUCCESS;
 }
 
-Status MemcpyAsyncNodeTask::ExecuteAsync(TaskContext &task_context, std::function<void()> done_callback) {
-  GELOGD("[%s] Start to execute.", task_context.GetNodeName());
-  auto input_desc = task_context.MutableInputDesc(0);
-  GE_CHECK_NOTNULL(input_desc);
-  int64_t copy_size = 0;
-  GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetTensorSizeInBytes(*input_desc, copy_size));
-  // copy_size would not be negative since GetTensorSizeInBytes returned successfully.
-  if (copy_size > 0) {
-    const auto in_v = task_context.MutableInput(0);
-    const auto out_v = task_context.MutableOutput(0);
-    GE_CHECK_NOTNULL(in_v);
-    GE_CHECK_NOTNULL(out_v);
-    GELOGD("[%s] input size: %zu, output size: %zu, copy size: %ld", task_context.GetNodeName(),
-           in_v->GetSize(), out_v->GetSize(), copy_size);
-    GE_CHK_RT_RET(rtMemcpyAsync(out_v->MutableData(), out_v->GetSize(), in_v->GetData(), copy_size,
-                                RT_MEMCPY_DEVICE_TO_DEVICE, task_context.GetStream()));
-  } else {
-    GELOGW("[%s] invalid copy size: %ld", task_context.GetNodeName(), copy_size);
-  }
-  if (done_callback) {
-    GE_CHK_STATUS_RET(task_context.RegisterCallback(done_callback));
-  }
-  GELOGD("[%s] Done executing successfully.", task_context.GetNodeName());
-  return SUCCESS;
-}
-
 Status PassThroughNodeTask::ExecuteAsync(TaskContext &task_context, std::function<void()> done_callback) {
   GELOGD("[%s] Start to execute.", task_context.GetNodeName());
   const auto in_x = task_context.GetInput(0);  // x
@@ -60,11 +60,6 @@ class StreamMergeNodeTask : public RtsNodeTask {
   Status ExecuteAsync(TaskContext &task_context, std::function<void()> done_callback) override;
 };
 
-class MemcpyAsyncNodeTask : public RtsNodeTask {
- public:
-  Status ExecuteAsync(TaskContext &task_context, std::function<void()> done_callback) override;
-};
-
 class PassThroughNodeTask : public RtsNodeTask {
  public:
   Status ExecuteAsync(TaskContext &task_context, std::function<void()> done_callback) override;
@@ -438,4 +438,22 @@ TEST_F(UtestModelManagerModelManager, test_data_input_tensor) {
   auto ret = mm.DataInputTensor(model_id,inputs);
   EXPECT_EQ(PARAM_INVALID, ret); // HybridDavinciModel::impl_ is null.
 }
+
+TEST_F(UtestModelManagerModelManager, test_launch_kernel_cust_aicpu) {
+  ModelManager mm;
+  // cust_aicpu_so_ is empty, so the launch returns SUCCESS immediately.
+  EXPECT_EQ(mm.LaunchKernelCustAicpuSo("empty_cust_aicpu"), SUCCESS);
+
+  // Registered cust-aicpu kernels are deleted after the launch completes.
+  uintptr_t resource_id = 1;  // stub value for rtCtxGetCurrent
+  std::vector<char> kernel_bin(256);
+  auto &cust_resource_001 = mm.cust_aicpu_so_[resource_id];
+  auto tbe_kernel = std::shared_ptr<OpKernelBin>(new OpKernelBin("deleteCustOp", std::move(kernel_bin)));
+  auto &cust_opkernel_001 = cust_resource_001["deleteCustOp"] = tbe_kernel;
+
+  EXPECT_FALSE(mm.cust_aicpu_so_.empty());
+  EXPECT_EQ(mm.LaunchKernelCustAicpuSo("deleteCustOp"), SUCCESS);
+  EXPECT_TRUE(mm.cust_aicpu_so_.empty());
+}
 }  // namespace ge