From: @ni100die
Reviewed-by: @xchu42, @ji_chen
Signed-off-by: tags/v1.2.0
@@ -434,7 +434,7 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) {
                       "Assign node %s continuous input memory failed.", node->GetName().c_str())
   }
   for (auto pair : memory_offset_) {
-    GELOGD("After reassign continuous memory, memory type = %ld, memoffset = %zu.", pair.first,
+    GELOGD("After reassign continuous memory, memory type = %ld, mem_offset = %zu.", pair.first,
            pair.second.mem_offset_);
   }
   return ge::SUCCESS;
@@ -512,11 +512,11 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node,
       auto peer_output_offset = output_list.at(peer_out_data_anchor->GetIdx());
       output_list.at(peer_out_data_anchor->GetIdx()) = output_list_this.at(out2ins.begin()->first);
       peer_op_desc->SetOutputOffset(output_list);
-      GELOGI("Node %s out %d ref in %d input node %s, use output offset %ld update %ld", node->GetName().c_str(),
+      GELOGI("Node %s out %d ref in %d input node %s, use output offset %ld update %ld.", node->GetName().c_str(),
             out2ins.begin()->first, out2ins.begin()->second, peer_op_desc->GetName().c_str(),
             output_list_this.at(out2ins.begin()->first), peer_output_offset);
     } else {
-      GELOGD("Node %s out %d ref in %d input node %s with total ref numbers %zu", node->GetName().c_str(),
+      GELOGD("Node %s out %d ref in %d input node %s with total ref numbers %zu.", node->GetName().c_str(),
             out2ins.begin()->first, out2ins.begin()->second, peer_op_desc->GetName().c_str(), out2ins.size());
     }
     // first input is beginning offset
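Note: the hunk above is the ref-input path of continuous input assignment: when an output of the node is a reference of one of its inputs, the producer's output offset is overwritten so both anchors address the same memory, and the old offset is kept only for the log. A minimal standalone sketch of that bookkeeping (plain offset vectors standing in for the OpDesc output-offset lists; all names here are illustrative, not the real GE API):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Offsets of each output of an op, as in an OpDesc output-offset list.
    using OffsetList = std::vector<int64_t>;

    // When output `this_out_idx` refs an input fed by `peer_out_idx` of the
    // producer, rewrite the producer's offset so both sides alias the same
    // address; the replaced offset is reported, as in the GELOGI above.
    void PropagateRefOffset(OffsetList &peer_outputs, int peer_out_idx,
                            const OffsetList &this_outputs, int this_out_idx) {
      int64_t old_offset = peer_outputs.at(peer_out_idx);
      peer_outputs.at(peer_out_idx) = this_outputs.at(this_out_idx);
      std::printf("use output offset %ld update %ld.\n",
                  static_cast<long>(this_outputs.at(this_out_idx)),
                  static_cast<long>(old_offset));
    }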
@@ -542,7 +542,7 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node,
     }
     GELOGI("[IMAS]Continuous input : Set %s name[%s] optype[%s] output[%d] offset to [%zu] stream_id[%ld] memtype[%ld] "
-           "size[%zu] realsize[%ld] nopadding[%d].", node->GetOwnerComputeGraph()->GetName().c_str(),
+           "size[%zu] realsize[%ld] nopadding size[%d].", node->GetOwnerComputeGraph()->GetName().c_str(),
            peer_op_desc->GetName().c_str(), node->GetType().c_str(), peer_out_data_anchor->GetIdx(),
            output_list.at(peer_out_data_anchor->GetIdx()), peer_op_desc->GetStreamId(), memory_type,
            is_continuous_input_allocated ? 0UL : align_size, real_size, is_nopadding);
@@ -1549,7 +1549,7 @@ bool GraphMemoryAssigner::AssignContinuousInputMemoryWithAtomicProcessDirectly(
     auto continuous_type = iter->second;
     bool continuous_input = ((continuous_type & kTypeInput) != 0) || ((continuous_type & kTypeInputNoPadding) != 0);
     if (continuous_input) {
-      GELOGI("node %s 's precursor node %s need assign continuous input memory, store node firstly.",
+      GELOGI("Node %s 's precursor node %s need assign continuous input memory, store node firstly.",
             input_continuous_node->GetName().c_str(), in_node->GetName().c_str());
       return false;
     }
@@ -1559,7 +1559,7 @@ bool GraphMemoryAssigner::AssignContinuousInputMemoryWithAtomicProcessDirectly(
     node_2_continuous_type.emplace(out_node, continuous_type);
     bool continuous_input = ((continuous_type & kTypeInput) != 0) || ((continuous_type & kTypeInputNoPadding) != 0);
     if (continuous_input) {
-      GELOGI("node %s 's succeed node %s need assign continuous input memory, store node firstly.",
+      GELOGI("Node %s 's succeed node %s need assign continuous input memory, store node firstly.",
             input_continuous_node->GetName().c_str(), out_node->GetName().c_str());
       return false;
     }
@@ -366,8 +366,11 @@ void ModelBuilder::InitL1FusionOption() {
   string buffer_optimize = "off_optimize";
   graphStatus ret = ge::GetContext().GetOption(BUFFER_OPTIMIZE, buffer_optimize);
   if (ret == GRAPH_SUCCESS) {
-    is_l1_fusion_enable_ = (buffer_optimize == "l1_optimize");
-    GELOGD("The value of %s is %s.", BUFFER_OPTIMIZE.c_str(), buffer_optimize.c_str());
+    bool off_superkernel = false;
+    (void)AttrUtils::GetBool(compute_graph_, ATTR_NAME_OFF_SUPERKERNEL_ATTR, off_superkernel);
+    is_l1_fusion_enable_ = ((buffer_optimize == "l1_optimize") && (!off_superkernel));
+    GELOGI("Compute graph %s the value of %s is %s, superkernel flag %d.", compute_graph_->GetName().c_str(),
+           BUFFER_OPTIMIZE.c_str(), buffer_optimize.c_str(), is_l1_fusion_enable_);
   } else {
     GELOGW("The value of %s is empty.", kEnableL1Fusion.c_str());
   }
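Note: the behavioural change here is that the BUFFER_OPTIMIZE option no longer decides L1 fusion by itself; a per-graph ATTR_NAME_OFF_SUPERKERNEL_ATTR attribute can now veto it. A minimal sketch of the new predicate (free function with assumed names, not the actual ModelBuilder member):

    #include <string>

    // L1 fusion is requested via the build option, but a graph carrying the
    // "superkernel off" attribute opts out even when the option is set.
    bool DecideL1Fusion(const std::string &buffer_optimize, bool off_superkernel) {
      return (buffer_optimize == "l1_optimize") && !off_superkernel;
    }

    // e.g. DecideL1Fusion("l1_optimize", /*off_superkernel=*/true) == false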
@@ -709,7 +712,7 @@ Status ModelBuilder::BuildModelForGetTask(ge::Model &model) {
   GE_TIMESTAMP_START(SetInputOutputOffset);
   SetInputOutputOffsetPass input_output_offset;
   GE_CHK_STATUS_RET(input_output_offset.Run(compute_graph_), "Set input output offset failed.");
-  GE_TIMESTAMP_END(SetInputOutputOffset, "SetInputOutputOffsetPass::Run.");
+  GE_TIMESTAMP_END(SetInputOutputOffset, "SetInputOutputOffsetPass::Run");

   // Compile single op in graph build stage
   GE_TIMESTAMP_START(CompileSingleOp);
@@ -532,20 +532,20 @@ Status DavinciModel::DoTaskSink() {
   GE_CHK_STATUS_RET(BindModelStream(), "Bind model stream failed.");

   if (known_node_) {
-    GE_CHK_STATUS_RET(MallocKnownArgs(), "Mallloc known node args failed.");
+    GE_CHK_STATUS_RET(MallocKnownArgs(), "Mallloc known node's args failed");
   }

-  GE_CHK_STATUS_RET(InitTaskInfo(*model_task_def.get()), "InitTaskInfo failed.");
+  GE_CHK_STATUS_RET(InitTaskInfo(*model_task_def.get()), "InitTaskInfo failed");

-  GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "Launch cust aicpu so failed.");
+  GE_CHK_STATUS_RET(ModelManager::GetInstance()->LaunchCustAicpuSo(), "Launch cust aicpu so failed");

-  GE_CHK_STATUS_RET(ModelManager::GetInstance()->CheckAicpuOpList(ge_model_), "Check aicpu op type failed.");
+  GE_CHK_STATUS_RET(ModelManager::GetInstance()->CheckAicpuOpList(ge_model_), "Check aicpu op type failed");

-  GE_CHK_STATUS_RET(InitEntryTask(), "InitEntryTask failed.");
+  GE_CHK_STATUS_RET(InitEntryTask(), "InitEntryTask failed");

-  GE_CHK_STATUS_RET(InitL1DataDumperArgs(), "InitL1DataDumperArgs failed.");
+  GE_CHK_STATUS_RET(InitL1DataDumperArgs(), "InitL1DataDumperArgs failed");

-  GE_CHK_STATUS_RET(DistributeTask(), "Distribute failed.");
+  GE_CHK_STATUS_RET(DistributeTask(), "Distribute failed");

   GE_CHK_RT_RET(rtModelLoadComplete(rt_model_handle_));
@@ -3090,6 +3090,15 @@ Status GraphManager::OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGra
       sub_graph->SetSessionID(session_id);
       sub_graph->SetGraphID(graph_node->GetGraphId());
     }
+    bool off_superkernel = false;
+    if (AttrUtils::GetBool(compute_graph, ATTR_NAME_OFF_SUPERKERNEL_ATTR, off_superkernel)) {
+      GELOGI("Compute graph %s get superkernel flag %d.", compute_graph->GetName().c_str(), off_superkernel);
+      if (!AttrUtils::SetBool(merged_compute_graph, ATTR_NAME_OFF_SUPERKERNEL_ATTR, off_superkernel)) {
+        GELOGE(FAILED, "Compute graph %s set superkernel flag %d failed", merged_compute_graph->GetName().c_str(),
+               off_superkernel);
+        return FAILED;
+      }
+    }
     GE_TIMESTAMP_EVENT_END(MergeSubgraph, "OptimizeSubgraph::MergeSubGraph");
     GE_DUMP(merged_compute_graph, "mergedComputeGraph");
     compute_graph = merged_compute_graph;
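Note: this added block is what feeds the InitL1FusionOption change above: OptimizeSubgraph replaces compute_graph with merged_compute_graph, so an attribute set on the original graph would be lost unless copied across the merge. A toy sketch of the copy-if-present pattern (toy Graph/attribute types and a placeholder key, not ge::AttrUtils or the real attribute string):

    #include <map>
    #include <string>

    // Placeholder for ATTR_NAME_OFF_SUPERKERNEL_ATTR; the real key lives in GE.
    static const std::string kOffSuperkernelAttr = "off_superkernel";

    struct Graph {
      std::map<std::string, bool> bool_attrs;  // stand-in for GE graph attributes
    };

    // Copy the flag only when it was explicitly set on the source graph, so a
    // merged graph without the attribute keeps the default superkernel behaviour.
    void CopyOffSuperkernelFlag(const Graph &src, Graph &dst) {
      auto it = src.bool_attrs.find(kOffSuperkernelAttr);
      if (it != src.bool_attrs.end()) {
        dst.bool_attrs[kOffSuperkernelAttr] = it->second;
      }
    }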
@@ -111,8 +111,9 @@ void DynamicStitchKernel::ComputeMergedShape(const vector<ConstGeTensorPtr> &inp
   int32_t merged_first_dim = 0;
   int64_t indices_shape_size = 0;
   for (int i = 0; i < n_; i++) {
-    indices_shape_size = input[i]->GetTensorDesc().GetShape().GetShapeSize();
-    indices_shape_size = indices_shape_size == 0 ? 1 : indices_shape_size;
+    // shape is [] means scalar
+    indices_shape_size =
+        input[i]->GetTensorDesc().GetShape().GetDims().empty() ? 1 : input[i]->GetTensorDesc().GetShape().GetShapeSize();
     const int32_t *input_indices = reinterpret_cast<const int32_t *>(input[i]->GetData().data());
     for (int64_t j = 0; j < indices_shape_size; j++) {
       merged_first_dim = std::max(merged_first_dim, input_indices[j]);
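Note: GetShapeSize() is the product of the dims, so it reports 0 both for a scalar (shape []) and for a genuinely empty tensor (e.g. shape [0]). The old code mapped every 0 to 1, which made the inner loop read one garbage index from an empty tensor; the fix treats only the empty-dims (scalar) case as one element. A self-contained sketch of the corrected rule (plain dim vector instead of ge::GeShape):

    #include <cstdint>
    #include <vector>

    // Element count of an indices tensor: shape [] is a scalar holding one
    // index; any zero dim means there are genuinely no indices to scan.
    int64_t IndicesElementCount(const std::vector<int64_t> &dims) {
      if (dims.empty()) {
        return 1;  // scalar
      }
      int64_t count = 1;
      for (int64_t d : dims) {
        count *= d;  // a 0 dim keeps the count at 0
      }
      return count;
    }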