diff --git a/build.sh b/build.sh
index 6cbc03d2..74f13849 100644
--- a/build.sh
+++ b/build.sh
@@ -176,7 +176,7 @@ cd ${BASEPATH}
 mkdir -p output/plugin/nnengine/ge_config/
 find output/ -name graphengine_lib.tar -exec rm {} \;
 cp src/ge/engine_manager/engine_conf.json output/plugin/nnengine/ge_config/
-find output/ -maxdepth 1 -name libengine.so -exec mv {} output/plugin/nnengine/ \;
+find output/ -maxdepth 1 -name libengine.so -exec mv -f {} output/plugin/nnengine/ \;
 tar -cf graphengine_lib.tar output/*
 mv -f graphengine_lib.tar output
 echo "---------------- GraphEngine package archive generated ----------------"
diff --git a/inc/common/util/compress/compress.h b/inc/common/util/compress/compress.h
new file mode 100644
index 00000000..6908fb75
--- /dev/null
+++ b/inc/common/util/compress/compress.h
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef COMPRESS_H
+#define COMPRESS_H
+
+#include <cstddef>
+
+enum CmpStatus { RET_SUCCESS = 0, RET_ERROR = -1 };
+
+struct CompressConfig {
+  size_t inputSize;    // length of data to compress
+  size_t engineNum;    // how many decompress engines
+  size_t maxRatio;     // max compression ratio of a basic block; only 64 is supported now (8x: 64, 4x: 32)
+  size_t channel;      // channels of L2 or DDR, for load balance
+  size_t fractalSize;  // size of one compressing block
+  bool isTight;        // whether to pack the compressed data tightly
+};
+
+CmpStatus CompressWeights(char* input, const CompressConfig& compressConfig, char* indexs, char* output,
+                          size_t& compressedLength);
+
+#endif  // COMPRESS_H
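Note: a minimal sketch of driving CompressWeights from the header above. The engineNum, channel, fractalSize, and buffer sizes are illustrative assumptions, not values the interface mandates:

    #include <vector>
    #include "common/util/compress/compress.h"

    CmpStatus CompressExample(char *weights, size_t weightSize) {
      CompressConfig config{};
      config.inputSize = weightSize;         // length of data to compress
      config.engineNum = 4;                  // assumed decompress engine count
      config.maxRatio = 64;                  // only 64 is supported now
      config.channel = 16;                   // assumed L2/DDR channel count
      config.fractalSize = 512;              // assumed compressing block size
      config.isTight = true;
      std::vector<char> indexs(1024);        // index buffer, size assumed
      std::vector<char> output(weightSize);  // worst case: no size reduction
      size_t compressedLength = 0;
      return CompressWeights(weights, config, indexs.data(), output.data(), compressedLength);
    }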
diff --git a/inc/common/util/platform_info.h b/inc/common/util/platform_info.h
new file mode 100644
index 00000000..52dc0621
--- /dev/null
+++ b/inc/common/util/platform_info.h
@@ -0,0 +1,97 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PLATFORM_INFO_H
+#define PLATFORM_INFO_H
+
+#include <map>
+#include <string>
+#include <vector>
+#include "platform_info_def.h"
+
+using std::map;
+using std::string;
+using std::vector;
+
+namespace fe {
+
+class PlatformInfoManager {
+ public:
+  PlatformInfoManager(const PlatformInfoManager &) = delete;
+  PlatformInfoManager &operator=(const PlatformInfoManager &) = delete;
+
+  static PlatformInfoManager &Instance();
+  uint32_t InitializePlatformInfo();
+  uint32_t Finalize();
+
+  uint32_t GetPlatformInfo(const string SoCVersion, PlatformInfo &platformInfo, OptionalInfo &optiCompilationInfo);
+
+  void SetOptionalCompilationInfo(OptionalInfo &optiCompilationInfo);
+
+ private:
+  PlatformInfoManager();
+  ~PlatformInfoManager();
+
+  uint32_t LoadIniFile(string iniFileRealPath);
+
+  void Trim(string &str);
+
+  uint32_t LoadConfigFile(string realPath);
+
+  string RealPath(const std::string &path);
+
+  string GetSoFilePath();
+
+  void ParseVersion(map<string, string> &versionMap, string &socVersion, PlatformInfo &platformInfoTemp);
+
+  void ParseSocInfo(map<string, string> &socInfoMap, PlatformInfo &platformInfoTemp);
+
+  void ParseCubeOfAICoreSpec(map<string, string> &aiCoreSpecMap, PlatformInfo &platformInfoTemp);
+
+  void ParseBufferOfAICoreSpec(map<string, string> &aiCoreSpecMap, PlatformInfo &platformInfoTemp);
+
+  void ParseUBOfAICoreSpec(map<string, string> &aiCoreSpecMap, PlatformInfo &platformInfoTemp);
+
+  void ParseAICoreSpec(map<string, string> &aiCoreSpecMap, PlatformInfo &platformInfoTemp);
+
+  void ParseBufferOfAICoreMemoryRates(map<string, string> &aiCoreMemoryRatesMap, PlatformInfo &platformInfoTemp);
+
+  void ParseAICoreMemoryRates(map<string, string> &aiCoreMemoryRatesMap, PlatformInfo &platformInfoTemp);
+
+  void ParseUBOfAICoreMemoryRates(map<string, string> &aiCoreMemoryRatesMap, PlatformInfo &platformInfoTemp);
+
+  void ParseAICoreintrinsicDtypeMap(map<string, string> &aiCoreintrinsicDtypeMap, PlatformInfo &platformInfoTemp);
+
+  void ParseVectorCoreSpec(map<string, string> &vectorCoreSpecMap, PlatformInfo &platformInfoTemp);
+
+  void ParseVectorCoreMemoryRates(map<string, string> &vectorCoreMemoryRatesMap, PlatformInfo &platformInfoTemp);
+
+  void ParseVectorCoreintrinsicDtypeMap(map<string, string> &vectorCoreintrinsicDtypeMap,
+                                        PlatformInfo &platformInfoTemp);
+
+  uint32_t ParsePlatformInfoFromStrToStruct(map<string, map<string, string>> &contentInfoMap, string &socVersion,
+                                            PlatformInfo &platformInfoTemp);
+
+  uint32_t AssemblePlatformInfoVector(map<string, map<string, string>> &contentInfoMap);
+
+ private:
+  bool initFlag_;
+  map<string, PlatformInfo> platformInfoMap_;
+  OptionalInfo optiCompilationInfo_;
+};
+
+}  // namespace fe
+#endif
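Note: a sketch of how a caller might query the singleton declared above, using only the public methods it exposes; "Ascend310" is a placeholder SoC version, and treating 0 as the success code is an assumption:

    #include "common/util/platform_info.h"

    uint32_t QueryAiCoreCount(uint32_t &aiCoreCnt) {
      fe::PlatformInfoManager &mgr = fe::PlatformInfoManager::Instance();
      uint32_t ret = mgr.InitializePlatformInfo();
      if (ret != 0) {
        return ret;  // assumed: non-zero means the platform config could not be loaded
      }
      fe::PlatformInfo platformInfo;
      fe::OptionalInfo optiCompilationInfo;
      ret = mgr.GetPlatformInfo("Ascend310", platformInfo, optiCompilationInfo);
      if (ret == 0) {
        aiCoreCnt = platformInfo.socInfo.aiCoreCnt;  // SoCInfo comes from platform_info_def.h
      }
      return ret;
    }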
diff --git a/inc/common/util/platform_info_def.h b/inc/common/util/platform_info_def.h
new file mode 100644
index 00000000..663a2cae
--- /dev/null
+++ b/inc/common/util/platform_info_def.h
@@ -0,0 +1,122 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PLATFORM_INFO_DEF_H
+#define PLATFORM_INFO_DEF_H
+
+#include <map>
+#include <string>
+#include <vector>
+
+using std::map;
+using std::string;
+using std::vector;
+
+namespace fe {
+enum MemoryType { DDR = 0, HBM };
+
+enum L2Type { Cache = 0, Buff };
+
+typedef struct tagStrInfo {
+  string aicVersion;
+  string ccecAICVersion;
+  string ccecAIVVersion;
+  string isSupportAIcpuCompiler;
+} StrInfo;
+
+typedef struct tagSoCInfo {
+  uint32_t aiCoreCnt;
+  uint32_t vectorCoreCnt;
+  uint32_t aiCpuCnt;
+  MemoryType memoryType;
+  uint64_t memorySize;
+  L2Type l2Type;
+  uint64_t l2Size;
+  uint32_t l2PageNum;
+} SoCInfo;
+
+typedef struct tagAiCoreSpec {
+  double cubeFreq;
+  uint64_t cubeMSize;
+  uint64_t cubeNSize;
+  uint64_t cubeKSize;
+  uint64_t vecCalcSize;
+  uint64_t l0ASize;
+  uint64_t l0BSize;
+  uint64_t l0CSize;
+  uint64_t l1Size;
+  uint64_t smaskBuffer;
+  uint64_t ubSize;
+  uint64_t ubblockSize;
+  uint64_t ubbankSize;
+  uint64_t ubbankNum;
+  uint64_t ubburstInOneBlock;
+  uint64_t ubbankGroupNum;
+} AiCoreSpec;
+
+typedef struct tagAiCoreMemoryRates {
+  double ddrRate;
+  double l2Rate;
+  double l2ReadRate;
+  double l2WriteRate;
+  double l1ToL0ARate;
+  double l1ToL0BRate;
+  double l1ToUBRate;
+  double l0CToUBRate;
+  double ubToL2Rate;
+  double ubToDdrRate;
+  double ubToL1Rate;
+} AiCoreMemoryRates;
+
+typedef struct tagVectorCoreSpec {
+  uint64_t vecCalcSize;
+  uint64_t smaskBuffer;
+  uint64_t ubSize;
+  uint64_t ubblockSize;
+  uint64_t ubbankSize;
+  uint64_t ubbankNum;
+  uint64_t ubburstInOneBlock;
+  uint64_t ubbankGroupNum;
+} VectorCoreSpec;
+
+typedef struct tagVectorCoreMemoryRates {
+  double ddrRate;
+  double l2Rate;
+  double l2ReadRate;
+  double l2WriteRate;
+  double ubToL2Rate;
+  double ubToDdrRate;
+} VectorCoreMemoryRates;
+
+typedef struct tagPlatformInfo {
+  StrInfo strInfo;
+  SoCInfo socInfo;
+  AiCoreSpec aiCoreSpec;
+  AiCoreMemoryRates aiCoreMemoryRates;
+  map<string, vector<string>> aiCoreIntrinsicDtypeMap;
+  VectorCoreSpec vectorCoreSpec;
+  VectorCoreMemoryRates vectorCoreMemoryRates;
+  map<string, vector<string>> vectorCoreIntrinsicDtypeMap;
+} PlatformInfo;
+
+typedef struct tagOptionalInfo {
+  string socVersion;
+  string coreType;
+  uint32_t aiCoreNum;
+  string l1FusionFlag;
+} OptionalInfo;
+}  // namespace fe
+#endif
diff --git a/inc/external/ge/ge_api_types.h b/inc/external/ge/ge_api_types.h
index bf9a10b4..13477bbd 100644
--- a/inc/external/ge/ge_api_types.h
+++ b/inc/external/ge/ge_api_types.h
@@ -40,6 +40,8 @@ const char *const OPTION_EXEC_EXTERN_PLUGIN_PATH = "ge.soLoadPath";
 const char *const OPTION_EXEC_ENABLE_DUMP = "ge.exec.enableDump";
 const char *const OPTION_EXEC_DUMP_PATH = "ge.exec.dumpPath";
 const char *const OPTION_EXEC_DUMP_STEP = "ge.exec.dumpStep";
+const char *const OPTION_EXEC_ENABLE_INCRE_BUILD = "ge.exec.enableIncreBuild";
+const char *const OPTION_EXEC_INCRE_BUILD_CACHE_PATH = "ge.exec.increBuildCachePath";
 // Hccl flag, if ge.exec.hcclFlag =1, it means load plugin for opskernel, else:ge.exec.hcclFlag =0
 const char *const OPTION_EXEC_HCCL_FLAG = "ge.exec.hcclFlag";
 const char *const OPTION_EXEC_ATOMIC_FLAG = "ge.exec.enable_atomic";
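Note: the two new keys are ordinary entries in the string-to-string option map handed to GE. A sketch of wiring them up; treating "1" as the on switch is an assumption modeled on the neighbouring ge.exec.hcclFlag convention, and the cache path is a placeholder:

    #include <map>
    #include <string>
    #include "ge/ge_api_types.h"

    std::map<std::string, std::string> BuildIncreBuildOptions() {
      std::map<std::string, std::string> options;
      options[ge::OPTION_EXEC_ENABLE_INCRE_BUILD] = "1";                   // assumed on/off convention
      options[ge::OPTION_EXEC_INCRE_BUILD_CACHE_PATH] = "/tmp/ge_cache/";  // placeholder path
      return options;
    }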
diff --git a/inc/external/register/register.h b/inc/external/register/register.h
index 045a1570..f96044de 100644
--- a/inc/external/register/register.h
+++ b/inc/external/register/register.h
@@ -116,27 +116,5 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpReceiver {
 namespace ge {
 using OpRegistrationData = domi::OpRegistrationData;
 using OpReceiver = domi::OpReceiver;
-
-class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY HostCpuOp {
- public:
-  HostCpuOp() = default;
-  virtual ~HostCpuOp() = default;
-
-  virtual graphStatus Compute(Operator &op, const std::map<std::string, const Tensor> &inputs,
-                              std::map<std::string, Tensor> &outputs) = 0;
-};
-
-class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY HostCpuOpRegistrar {
- public:
-  HostCpuOpRegistrar(const char *op_type, HostCpuOp *(*create_fn)());
-};
-
-#define REGISTER_HOST_CPU_OP_BUILDER(name, op) REGISTER_HOST_CPU_OP_BUILDER_UNIQ_HELPER(__COUNTER__, name, op)
-
-#define REGISTER_HOST_CPU_OP_BUILDER_UNIQ_HELPER(ctr, name, op) REGISTER_HOST_CPU_OP_BUILDER_UNIQ(ctr, name, op)
-
-#define REGISTER_HOST_CPU_OP_BUILDER_UNIQ(ctr, name, op)                               \
-  static ::ge::HostCpuOpRegistrar register_host_cpu_op##ctr __attribute__((unused)) = \
-    ::ge::HostCpuOpRegistrar(name, []() -> ::ge::HostCpuOp * { return new (std::nothrow) op(); })
 }  // namespace ge
 #endif  // INC_EXTERNAL_REGISTER_REGISTER_H_
diff --git a/inc/framework/common/types.h b/inc/framework/common/types.h
index 0adc812d..1cc2245b 100644
--- a/inc/framework/common/types.h
+++ b/inc/framework/common/types.h
@@ -434,6 +434,7 @@ REGISTER_OPTYPE_DECLARE(STREAMSWITCH, "StreamSwitch");
 REGISTER_OPTYPE_DECLARE(STREAMSWITCHN, "StreamSwitchN");
 REGISTER_OPTYPE_DECLARE(STREAMACTIVE, "StreamActive");
 REGISTER_OPTYPE_DECLARE(MEMCPYASYNC, "MemcpyAsync");
+REGISTER_OPTYPE_DECLARE(MEMCPYADDRASYNC, "MemcpyAddrAsync");
 REGISTER_OPTYPE_DECLARE(STREAMMERGE, "StreamMerge");
 REGISTER_OPTYPE_DECLARE(ENDGRAPH, "EndGraph");
 REGISTER_OPTYPE_DECLARE(SEND, "Send");
@@ -441,6 +442,7 @@ REGISTER_OPTYPE_DECLARE(RECV, "Recv");
 
 REGISTER_OPTYPE_DECLARE(LABELSET, "LabelSet");
 REGISTER_OPTYPE_DECLARE(LABELGOTO, "LabelGoto");
+REGISTER_OPTYPE_DECLARE(LABELGOTOEX, "LabelGotoEx");
 REGISTER_OPTYPE_DECLARE(LABELSWITCH, "LabelSwitch");
 REGISTER_OPTYPE_DECLARE(LABELSWITCHBYINDEX, "LabelSwitchByIndex");
diff --git a/inc/graph/debug/ge_attr_define.h b/inc/graph/debug/ge_attr_define.h
index 00f5edbd..57d1c6c6 100644
--- a/inc/graph/debug/ge_attr_define.h
+++ b/inc/graph/debug/ge_attr_define.h
@@ -979,9 +979,14 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_N_BATCH_SPILT;
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NO_TASK_AND_DUMP_NEEDED;
 
+// functional ops attr
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_WHILE_COND;
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_WHILE_BODY;
+
 // used for label switch
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_LABEL_SWITCH_INDEX;
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_LABEL_SWITCH_LIST;
+
 // Varible
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REF_VAR_SRC_VAR_NAME;
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_SRC_VAR_NAME;
diff --git a/inc/graph/detail/model_serialize_imp.h b/inc/graph/detail/model_serialize_imp.h
index 1d50577c..ad4e6475 100644
--- a/inc/graph/detail/model_serialize_imp.h
+++ b/inc/graph/detail/model_serialize_imp.h
@@ -22,7 +22,7 @@
 #include
 #include
 #include
 #include "graph/anchor.h"
-#include "detail/attributes_holder.h"
+#include "graph/detail/attributes_holder.h"
 #include "graph/ge_tensor.h"
 #include "graph/graph.h"
 #include "graph/node.h"
diff --git a/inc/graph/utils/graph_utils.h b/inc/graph/utils/graph_utils.h
index 8066e8b5..fb979e3e 100644
--- a/inc/graph/utils/graph_utils.h
+++ b/inc/graph/utils/graph_utils.h
@@ -262,6 +262,8 @@ class GraphUtils {
   static graphStatus MoveOutCtrlEdges(NodePtr &src_node, NodePtr &dst_node);
 
   static ComputeGraphPtr FindRootGraph(ComputeGraphPtr graph);
+
+  static graphStatus TopologicalSortingByName(const ge::ComputeGraphPtr &compute_graph, vector<NodePtr> &node_vec);
 };
 
 class ComputeGraphBuilder {
diff --git a/src/common/graph/compute_graph.cc b/src/common/graph/compute_graph.cc
index a35747d4..2dcc7a54 100644
--- a/src/common/graph/compute_graph.cc
+++ b/src/common/graph/compute_graph.cc
@@ -54,17 +54,34 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY size_t ComputeGraph::GetAllNodesS
   return s;
 }
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ComputeGraph::Vistor<NodePtr> ComputeGraph::GetAllNodes() const {
-  vector<NodePtr> all_nodes(nodes_.size());
-  (void)std::copy(nodes_.begin(), nodes_.end(), all_nodes.begin());
-  for (const auto &sub_graph : sub_graph_) {
-    if (sub_graph == nullptr) {
-      GELOGW("sub graph is nullptr");
+  if (sub_graph_.empty()) {
+    return Vistor<NodePtr>(shared_from_this(), nodes_);
+  }
+
+  std::vector<NodePtr> all_nodes;
+  std::deque<NodePtr> candidates;
+
+  candidates.insert(candidates.begin(), nodes_.begin(), nodes_.end());
+
+  while (!candidates.empty()) {
+    NodePtr node = candidates.front();
+    all_nodes.emplace_back(node);
+    candidates.pop_front();
+
+    OpDescPtr op_desc = node->GetOpDesc();
+    if (op_desc == nullptr) {
       continue;
     }
-    for (const auto &node : sub_graph->GetAllNodes()) {
-      all_nodes.push_back(node);
+
+    const auto &subgraph_names = op_desc->GetSubgraphInstanceNames();
+    for (auto name_iter = subgraph_names.rbegin(); name_iter != subgraph_names.rend(); ++name_iter) {
+      auto subgraph = GetSubgraph(*name_iter);
+      if (subgraph != nullptr) {
+        candidates.insert(candidates.begin(), subgraph->nodes_.begin(), subgraph->nodes_.end());
+      }
     }
   }
+
   return Vistor<NodePtr>(shared_from_this(), all_nodes);
 }
 size_t ComputeGraph::GetDirectNodesSize() const { return nodes_.size(); }
@@ -602,7 +619,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus ComputeGraph::InsertE
 graphStatus ComputeGraph::DFSTopologicalSorting(std::vector<NodePtr> &node_vec,
                                                 std::map<NodePtr, uint32_t> &map_in_edge_num,
                                                 std::vector<NodePtr> &stack) {
-  GELOGI("Runing_Dfs_Sort");
+  GELOGI("Running_Dfs_Sort: %s", name_.c_str());
   // Record the number of non data nodes but no input nodes
   GE_CHK_BOOL_EXEC(SortNodes(stack, map_in_edge_num) == GRAPH_SUCCESS, return GRAPH_FAILED, "sort nodes failed");
 
@@ -647,7 +664,7 @@ graphStatus ComputeGraph::DFSTopologicalSorting(std::vector<NodePtr> &node_vec,
 graphStatus ComputeGraph::BFSTopologicalSorting(std::vector<NodePtr> &node_vec,
                                                 std::map<NodePtr, uint32_t> &map_in_edge_num,
                                                 std::deque<NodePtr> &stack) {
-  GELOGI("Runing_Bfs_Sort");
+  GELOGI("Running_Bfs_Sort: %s", name_.c_str());
   std::vector<NodePtr> stack_input;
   std::map<string, NodePtr> breadth_node_map;
   // Record the number of non data nodes but no input nodes
@@ -735,7 +752,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus ComputeGraph::Topolog
       use_BFS = true;
     }
   } else {
-    GELOGW("Get OPTION_GRAPH_RUN_MODE failed, use BFSTopologicalSorting by default.");
+    GELOGW("OPTION_GRAPH_RUN_MODE not set, use BFSTopologicalSorting by default.");
   }
 
   if (use_BFS) {
diff --git a/src/common/graph/ge_attr_define.cc b/src/common/graph/ge_attr_define.cc
index 92040051..961d3bc4 100644
--- a/src/common/graph/ge_attr_define.cc
+++ b/src/common/graph/ge_attr_define.cc
@@ -955,11 +955,8 @@ const std::string ATTR_NAME_DATA_DUMP_ORIGIN_FORMAT = "_datadump_origin_format";
 const std::string ATTR_NAME_DATA_DUMP_ORIGIN_DATA_TYPE = "_datadump_origin_data_type";
 
 // functional ops attr
-const std::string ATTR_NAME_TCOND = "Tcond";
-const std::string ATTR_NAME_TIN = "Tin";
-const std::string ATTR_NAME_TOUT = "Tout";
-const std::string ATTR_NAME_THEN_BRANCH = "then_branch";
-const std::string ATTR_NAME_ELSE_BRANCH = "else_branch";
+const std::string ATTR_NAME_WHILE_COND = "cond";
+const std::string ATTR_NAME_WHILE_BODY = "body";
 
 // used for label switch
 const std::string ATTR_NAME_LABEL_SWITCH_INDEX = "_label_switch_index";
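Note: the new GetAllNodes walks a work deque instead of recursing: a node is emitted, then the nodes of its subgraphs are spliced in front of its remaining siblings, and iterating the subgraph names in reverse keeps the first subgraph frontmost. A self-contained sketch of that pattern with a simplified stand-in type:

    #include <deque>
    #include <vector>

    struct Item {
      int id;
      std::vector<std::vector<Item>> subgraphs;  // stand-in for GetSubgraphInstanceNames()
    };

    std::vector<int> FlattenDepthFirst(const std::vector<Item> &roots) {
      std::vector<int> order;
      std::deque<Item> candidates(roots.begin(), roots.end());
      while (!candidates.empty()) {
        Item cur = candidates.front();
        candidates.pop_front();
        order.push_back(cur.id);
        // Reverse iteration plus front insertion preserves declaration order.
        for (auto it = cur.subgraphs.rbegin(); it != cur.subgraphs.rend(); ++it) {
          candidates.insert(candidates.begin(), it->begin(), it->end());
        }
      }
      return order;
    }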
ATTR_NAME_TIN = "Tin"; -const std::string ATTR_NAME_TOUT = "Tout"; -const std::string ATTR_NAME_THEN_BRANCH = "then_branch"; -const std::string ATTR_NAME_ELSE_BRANCH = "else_branch"; +const std::string ATTR_NAME_WHILE_COND = "cond"; +const std::string ATTR_NAME_WHILE_BODY = "body"; // used for label switch const std::string ATTR_NAME_LABEL_SWITCH_INDEX = "_label_switch_index"; diff --git a/src/common/graph/utils/graph_utils.cc b/src/common/graph/utils/graph_utils.cc index c5e45516..1886ee66 100644 --- a/src/common/graph/utils/graph_utils.cc +++ b/src/common/graph/utils/graph_utils.cc @@ -28,6 +28,7 @@ #include #include #include +#include #include "./ge_context.h" #include "debug/ge_util.h" @@ -1999,4 +2000,60 @@ void PartialGraphBuilder::BuildExistNodes(graphStatus &error_code, std::string & GELOGD("Build exist nodes succ."); } + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus +GraphUtils::TopologicalSortingByName(const ge::ComputeGraphPtr &compute_graph, vector &node_vec) { + std::vector stack_input; + std::map map_in_edge_num; + graphStatus ret = compute_graph->SortNodes(stack_input, map_in_edge_num); + if (ret != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "Sort nodes failed."); + return GRAPH_FAILED; + } + const size_t non_user_input_index = stack_input.size() - compute_graph->inputs_order_.size() - 1; + std::sort(stack_input.begin(), stack_input.begin() + non_user_input_index, + [](const NodePtr &a, const NodePtr &b) -> bool { return (a->GetName() > b->GetName()); }); + + std::queue stack; + NodePtr cur_node = nullptr; + std::map name_node_map; + vector nodes_name; + while (!stack_input.empty() || !stack.empty()) { + if (!stack.empty()) { + cur_node = stack.front(); + stack.pop(); + } else { + cur_node = stack_input.back(); + stack_input.pop_back(); + } + node_vec.emplace_back(cur_node); + compute_graph->CollectBreadthOutNode(cur_node, map_in_edge_num, name_node_map); + for (const auto &iter : name_node_map) { + nodes_name.emplace_back(iter.first); + } + std::sort(nodes_name.begin(), nodes_name.end()); + for (const auto &iter : nodes_name) { + stack.push(name_node_map[iter]); + } + name_node_map.clear(); + nodes_name.clear(); + } + // If they are not equal, there is a closed loop + if (node_vec.size() != compute_graph->nodes_.size()) { + std::set itered_nodes_set; + for (auto &node : node_vec) { + itered_nodes_set.insert(node.get()); + } + GE_LOGE("Failed to do topo sorting total %zu, itered %zu, exist closed loop in graph.", + compute_graph->nodes_.size(), node_vec.size()); + for (auto &node : compute_graph->nodes_) { + if (itered_nodes_set.count(node.get()) == 0) { + GE_LOGE("The node %s does not itered when topological sorting", node->GetName().c_str()); + } + } + return GRAPH_FAILED; + } + return GRAPH_SUCCESS; +} + } // namespace ge diff --git a/src/ge/CMakeLists.txt b/src/ge/CMakeLists.txt index 56e5e2b0..1a3434a5 100755 --- a/src/ge/CMakeLists.txt +++ b/src/ge/CMakeLists.txt @@ -41,6 +41,7 @@ include_directories(${GE_SOURCE_DIR}/inc/external/graph) include_directories(${GE_SOURCE_DIR}/inc/framework) include_directories(${GE_SOURCE_DIR}/inc/framework/common) include_directories(${GE_SOURCE_DIR}/inc/runtime) +include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib) include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc) include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc/cce) include_directories(${GE_SOURCE_DIR}/third_party/securec/include) @@ -55,6 +56,7 @@ file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} 
"common/formats/utils/formats_trans_utils.cc" "common/fp16_t.cc" "common/ge/plugin_manager.cc" + "common/helper/model_cache_helper.cc" "common/profiling/profiling_manager.cc" "engine_manager/dnnengine_manager.cc" "ge_local_engine/engine/host_cpu_engine.cc" @@ -92,6 +94,7 @@ file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/load/new_model_manager/task_info/kernel_task_info.cc" "graph/load/new_model_manager/task_info/label_goto_task_info.cc" "graph/load/new_model_manager/task_info/label_set_task_info.cc" + "graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc" "graph/load/new_model_manager/task_info/memcpy_async_task_info.cc" "graph/load/new_model_manager/task_info/profiler_trace_task_info.cc" "graph/load/new_model_manager/task_info/stream_active_task_info.cc" @@ -269,6 +272,7 @@ file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "common/formats/utils/formats_trans_utils.cc" "common/fp16_t.cc" "common/ge/plugin_manager.cc" + "common/helper/model_cache_helper.cc" "common/profiling/profiling_manager.cc" "engine_manager/dnnengine_manager.cc" "ge_local_engine/engine/host_cpu_engine.cc" @@ -305,6 +309,7 @@ file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/load/new_model_manager/task_info/kernel_task_info.cc" "graph/load/new_model_manager/task_info/label_goto_task_info.cc" "graph/load/new_model_manager/task_info/label_set_task_info.cc" + "graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc" "graph/load/new_model_manager/task_info/memcpy_async_task_info.cc" "graph/load/new_model_manager/task_info/profiler_trace_task_info.cc" "graph/load/new_model_manager/task_info/stream_active_task_info.cc" @@ -470,7 +475,7 @@ target_link_libraries(ge_compiler ${slog} ${mmpa} ${msprof} - ${runtime} + ${runtime_compiler} ${resouce} rt dl) diff --git a/src/ge/common/formats/format_transfers/datatype_transfer.cc b/src/ge/common/formats/format_transfers/datatype_transfer.cc index bac3a178..e5d21307 100644 --- a/src/ge/common/formats/format_transfers/datatype_transfer.cc +++ b/src/ge/common/formats/format_transfers/datatype_transfer.cc @@ -134,10 +134,6 @@ Status DataTypeTransfer::TransDataType(const CastArgs &args, TransResult &result } auto trans_mode = iter->second; - if (args.src_data_size == 0) { - GELOGE(PARAM_INVALID, "Invalid src data size %zu", args.src_data_size); - return PARAM_INVALID; - } int size = GetSizeByDataType(args.dst_data_type); if (size <= 0) { GELOGE(PARAM_INVALID, "Failed to calc size from data type %s", @@ -149,6 +145,12 @@ Status DataTypeTransfer::TransDataType(const CastArgs &args, TransResult &result return PARAM_INVALID; } size_t total_size = static_cast(args.src_data_size * size); + result.length = total_size; + if (total_size == 0) { + GELOGI("In TransDataType, total_size is zero, has no data."); + return SUCCESS; + } + std::shared_ptr dst(new (std::nothrow) uint8_t[total_size], std::default_delete()); if (dst == nullptr) { GELOGE(OUT_OF_MEMORY, "Failed to alloc the memory for dst buf %zu, data size %zu", total_size, args.src_data_size); @@ -162,7 +164,6 @@ Status DataTypeTransfer::TransDataType(const CastArgs &args, TransResult &result return INTERNAL_ERROR; } result.data = dst; - result.length = total_size; return SUCCESS; } diff --git a/src/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc b/src/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc index 3458f83c..40dc749d 100644 --- a/src/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc +++ 
diff --git a/src/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc b/src/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc
index 3458f83c..40dc749d 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc
@@ -134,6 +134,11 @@ Status FormatTransferC1hwncoc0Hwcn::TransFormat(const TransArgs &args, TransResu
   int size = GetSizeByDataType(args.src_data_type);
   int64_t total_size = GetItemNumByShape(args.dst_shape) * size;
   if (total_size <= 0) {
+    int64_t src_size = GetItemNumByShape(args.src_shape);
+    if (total_size == 0 && src_size == 0) {
+      result.length = static_cast<size_t>(total_size);
+      return SUCCESS;
+    }
     GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
            ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
     return PARAM_INVALID;
diff --git a/src/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.cc b/src/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.cc
index 45808fa0..dc8e1033 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.cc
@@ -88,6 +88,11 @@ Status TransFormatDhwckToFz3D(const TransArgs &args, TransResult &result) {
     dst_size *= dim;
   }
   dst_size *= data_size;
+  if (dst_size == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
diff --git a/src/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.cc b/src/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.cc
index 86c6935d..11e3d270 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.cc
@@ -89,6 +89,11 @@ Status TransFormatDhwncToFz3DTranspose(const TransArgs &args, TransResult &resul
     dst_size *= dim;
   }
   dst_size *= data_size;
+  if (dst_size == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
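Note: the hunks above (and the ones that follow) all add the same guard: an empty tensor is reported as success with length 0 instead of tripping the error path. The shared essence, written as a stand-alone helper under the assumption that these transfers live in the ge::formats namespace alongside TransResult:

    inline bool HandleEmptyTensor(int64_t dst_size, ge::formats::TransResult &result) {
      if (dst_size == 0) {
        result.length = 0;  // nothing to transform; caller returns SUCCESS immediately
        return true;
      }
      return false;
    }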
diff --git a/src/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc b/src/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc
index 76834437..ff7b84a4 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc
@@ -116,6 +116,11 @@ Status CheckShapeRelation(const TransArgs &args, ShapeVector &hw_shape) {
 Status TransFormatFromNdToFracNz(const TransArgs &args, TransResult &result, const ShapeVector &hw_shape) {
   int size = GetSizeByDataType(args.src_data_type);
   int64_t dst_size = GetItemNumByShape(args.dst_shape) * size;
+  if (dst_size == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size](), std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
@@ -184,6 +189,11 @@ Status TransFormatFromNdToFracNz(const TransArgs &args, TransResult &result, con
 Status TransFormatFromFracNzToNd(const TransArgs &args, TransResult &result, const ShapeVector &dst_hw_shape) {
   int size = GetSizeByDataType(args.src_data_type);
   int64_t dst_size = GetItemNumByShape(args.dst_shape) * size;
+  if (dst_size == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
diff --git a/src/ge/common/formats/format_transfers/format_transfer_fractal_z.cc b/src/ge/common/formats/format_transfers/format_transfer_fractal_z.cc
index aedc7589..f3d06496 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_fractal_z.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_fractal_z.cc
@@ -119,6 +119,11 @@ Status TransFormatFromNchwToFz(const TransArgs &args, TransResult &result) {
   int64_t total_ele_cnt = hf_cnt * vf_cnt * fractal_ele_cnt;
   int size = GetSizeByDataType(args.src_data_type);
   int64_t dst_size = total_ele_cnt * size;
+  if (dst_size == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
@@ -194,6 +199,11 @@ Status TransFormatHwcnToFz(const TransArgs &args, TransResult &result) {
     dst_size *= dim;
   }
   dst_size *= data_size;
+  if (dst_size == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
@@ -259,6 +269,11 @@ Status TransFormatNhwcToFz(const TransArgs &args, TransResult &result) {
     dst_size *= dim;
   }
   dst_size *= data_size;
+  if (dst_size == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
diff --git a/src/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc b/src/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc
index 59baccff..d5507765 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc
@@ -117,6 +117,11 @@ Status CheckShapeRelation(const TransArgs &args, ShapeVector &hw_shape) {
 Status TransFormatFromNdToFracZz(const TransArgs &args, TransResult &result, const ShapeVector &hw_shape) {
   int size = GetSizeByDataType(args.src_data_type);
   int64_t dst_size = GetItemNumByShape(args.dst_shape) * size;
+  if (dst_size == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size](), std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
@@ -153,8 +158,8 @@ Status TransFormatFromNdToFracZz(const TransArgs &args, TransResult &result, con
         auto src_offset = (src_h_head + w1_idx * w0) * size;
         auto dst_offset = (h0_head + w1_idx * h0w0) * size;
         auto protected_size = dst_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                ? dst_size - dst_offset
-                                : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                  ? dst_size - dst_offset
+                                  : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
         auto ret = memcpy_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), args.data + src_offset,
                             static_cast<size_t>(size * w0));
         if (ret != EOK) {
@@ -169,8 +174,8 @@ Status TransFormatFromNdToFracZz(const TransArgs &args, TransResult &result, con
           auto src_offset = (src_h_head + src_w_idx) * size;
           auto dst_offset = (w0_head + w0_idx) * size;
           auto protected_size = dst_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                  ? dst_size - dst_offset
-                                  : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                    ? dst_size - dst_offset
+                                    : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
           auto ret = memcpy_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), args.data + src_offset,
                               static_cast<size_t>(size));
           if (ret != EOK) {
@@ -189,6 +194,11 @@ Status TransFormatFromNdToFracZz(const TransArgs &args, TransResult &result, con
 Status TransFormatFromFracZzToNd(const TransArgs &args, TransResult &result, const ShapeVector &dst_hw_shape) {
   int size = GetSizeByDataType(args.src_data_type);
   int64_t dst_size = GetItemNumByShape(args.dst_shape) * size;
+  if (dst_size == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size](), std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
@@ -226,8 +236,8 @@ Status TransFormatFromFracZzToNd(const TransArgs &args, TransResult &result, con
         auto src_offset = (h0_head + w1_idx * h0w0) * size;
         auto dst_offset = (dst_h_head + w1_idx * w0) * size;
         auto protected_size = dst_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                ? dst_size - dst_offset
-                                : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                  ? dst_size - dst_offset
+                                  : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
         auto ret = memcpy_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), args.data + src_offset,
                             static_cast<size_t>(size * w0));
         if (ret != EOK) {
@@ -242,8 +252,8 @@ Status TransFormatFromFracZzToNd(const TransArgs &args, TransResult &result, con
           auto dst_w_idx = w1_head + w0_idx;
           auto dst_offset = (dst_h_head + dst_w_idx) * size;
           auto protected_size = dst_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                  ? dst_size - dst_offset
-                                  : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                    ? dst_size - dst_offset
+                                    : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
           auto ret = memcpy_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), args.data + src_offset,
                               static_cast<size_t>(size));
           if (ret != EOK) {
diff --git a/src/ge/common/formats/format_transfers/format_transfer_fracz_hwcn.cc b/src/ge/common/formats/format_transfers/format_transfer_fracz_hwcn.cc
index 3453c232..b0eebcfa 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_fracz_hwcn.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_fracz_hwcn.cc
@@ -133,6 +133,12 @@ Status FormatTransferFracZHwcn::TransFormat(const TransArgs &args, TransResult &
   int size = GetSizeByDataType(args.src_data_type);
   auto total_size = GetItemNumByShape(args.dst_shape) * size;
   if (total_size <= 0) {
+    int64_t src_size = GetItemNumByShape(args.src_shape);
+    if (total_size == 0 && src_size == 0) {
+      result.length = static_cast<size_t>(total_size);
+      return SUCCESS;
+    }
+
     GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
            ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
     return PARAM_INVALID;
diff --git a/src/ge/common/formats/format_transfers/format_transfer_fracz_nchw.cc b/src/ge/common/formats/format_transfers/format_transfer_fracz_nchw.cc
index 6f616051..9f8d9e39 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_fracz_nchw.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_fracz_nchw.cc
@@ -133,6 +133,12 @@ Status FormatTransferFracZNchw::TransFormat(const TransArgs &args, TransResult &
   int size = GetSizeByDataType(args.src_data_type);
   auto total_size = GetItemNumByShape(args.dst_shape) * size;
   if (total_size <= 0) {
+    int64_t src_size = GetItemNumByShape(args.src_shape);
+    if (total_size == 0 && src_size == 0) {
+      result.length = static_cast<size_t>(total_size);
+      return SUCCESS;
+    }
+
     GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
            ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
     return PARAM_INVALID;
@@ -140,6 +146,7 @@ Status FormatTransferFracZNchw::TransFormat(const TransArgs &args, TransResult &
   GELOGD("Begin to trans format from FracZ to NCHW, src shape %s, data type %s, dst shape %s, memory size %ld",
          ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(),
          ShapeToString(args.dst_shape).c_str(), total_size);
+
   if (GetDstDataAfterTrans(args, result, size, total_size) != SUCCESS) {
     GELOGE(INTERNAL_ERROR, "Failed to get data after trans, src shape %s, data type %s, dst shape %s, memory size %ld",
            ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(),
diff --git a/src/ge/common/formats/format_transfers/format_transfer_fracz_nhwc.cc b/src/ge/common/formats/format_transfers/format_transfer_fracz_nhwc.cc
index 57b840af..9a1e5f3b 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_fracz_nhwc.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_fracz_nhwc.cc
@@ -132,6 +132,12 @@ Status FormatTransferFracZNhwc::TransFormat(const TransArgs &args, TransResult &
   int size = GetSizeByDataType(args.src_data_type);
   auto total_size = GetItemNumByShape(args.dst_shape) * size;
   if (total_size <= 0) {
+    int64_t src_size = GetItemNumByShape(args.src_shape);
+    if (total_size == 0 && src_size == 0) {
+      result.length = static_cast<size_t>(total_size);
+      return SUCCESS;
+    }
+
     GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
            ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
     return PARAM_INVALID;
diff --git a/src/ge/common/formats/format_transfers/format_transfer_hwcn_c1hwncoc0.cc b/src/ge/common/formats/format_transfers/format_transfer_hwcn_c1hwncoc0.cc
index e7f6754f..7101256a 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_hwcn_c1hwncoc0.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_hwcn_c1hwncoc0.cc
@@ -35,7 +35,7 @@ Status TransShapeHwcnToC1hwncoc0(const DataType &data_type, const std::vector<int64_t> &src_shape,
                                  std::vector<int64_t> &dst_shape) {
   auto cube_size = GetCubeSizeByDataType(data_type);
   dst_shape.clear();
-  dst_shape.push_back((src_shape.at(kHwcnC) - 1) / cube_size + 1);
+  dst_shape.push_back(Ceil(src_shape.at(kHwcnC), static_cast<int64_t>(cube_size)));
   dst_shape.push_back(src_shape.at(kHwcnH));
   dst_shape.push_back(src_shape.at(kHwcnW));
   dst_shape.push_back(src_shape.at(kHwcnN));
@@ -169,6 +169,12 @@ Status FormatTransferHwcnC1hwncoc0::TransFormat(const TransArgs &args, TransResu
   int size = GetSizeByDataType(args.src_data_type);
   auto total_size = GetItemNumByShape(args.dst_shape) * size;
   if (total_size <= 0) {
+    int64_t src_size = GetItemNumByShape(args.src_shape);
+    if (total_size == 0 && src_size == 0) {
+      result.length = static_cast<size_t>(total_size);
+      return SUCCESS;
+    }
+
     GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
            ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
     return PARAM_INVALID;
diff --git a/src/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nchw.cc b/src/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nchw.cc
index eab3ba96..57ab1266 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nchw.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nchw.cc
@@ -58,7 +58,7 @@ Status CheckArgsForNc1hwc0ToNchw(const TransArgs &args) {
   }
   if (src_shape.at(kNc1hwc0H) != dst_shape.at(kNchwH) || src_shape.at(kNc1hwc0W) != dst_shape.at(kNchwW) ||
       src_shape.at(kNc1hwc0N) != dst_shape.at(kNchwN) || src_shape.at(kNc1hwc0C0) != c0 ||
-      src_shape.at(kNc1hwc0C1) != (dst_shape.at(kNchwC) - 1) / c0 + 1) {
+      src_shape.at(kNc1hwc0C1) != (Ceil(dst_shape.at(kNchwC), c0))) {
     GELOGE(PARAM_INVALID, "Failed to check relationship between src and dst shape, src shape %s, dst shape %s",
            ShapeToString(src_shape).c_str(), ShapeToString(dst_shape).c_str());
     return PARAM_INVALID;
@@ -102,8 +102,8 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const int size, const int64_t total_size) {
           auto src_offset = src_idx * size;
           auto dst_offset = dst_idx * size;
           auto protected_size = total_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                  ? total_size - dst_offset
-                                  : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                    ? total_size - dst_offset
+                                    : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
           auto ret = memcpy_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), args.data + src_offset,
                               static_cast<size_t>(size));
           if (ret != EOK) {
@@ -130,6 +130,12 @@ Status FormatTransferNc1hwc0Nchw::TransFormat(const TransArgs &args, TransResult
   int size = GetSizeByDataType(args.src_data_type);
   auto total_size = GetItemNumByShape(args.dst_shape) * size;
   if (total_size <= 0) {
+    int64_t src_size = GetItemNumByShape(args.src_shape);
+    if (total_size == 0 && src_size == 0) {
+      result.length = static_cast<size_t>(total_size);
+      return SUCCESS;
+    }
+
     GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
            ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
     return PARAM_INVALID;
diff --git a/src/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nhwc.cc b/src/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nhwc.cc
index e9e8b19f..e68e54de 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nhwc.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nhwc.cc
@@ -58,7 +58,7 @@ Status CheckArgsForNc1hwc0ToNhwc(const TransArgs &args) {
   }
   if (src_shape.at(kNc1hwc0H) != dst_shape.at(kNhwcH) || src_shape.at(kNc1hwc0W) != dst_shape.at(kNhwcW) ||
       src_shape.at(kNc1hwc0N) != dst_shape.at(kNhwcN) || src_shape.at(kNc1hwc0C0) != c0 ||
-      src_shape.at(kNc1hwc0C1) != (dst_shape.at(kNhwcC) - 1) / c0 + 1) {
+      src_shape.at(kNc1hwc0C1) != (Ceil(dst_shape.at(kNhwcC), c0))) {
     GELOGE(PARAM_INVALID, "Failed to check relationship between src and dst shape, src shape %s, dst shape %s",
            ShapeToString(src_shape).c_str(), ShapeToString(dst_shape).c_str());
     return PARAM_INVALID;
@@ -102,8 +102,8 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const int size, const int64_t total_size) {
           auto src_offset = src_idx * size;
           auto dst_offset = dst_idx * size;
           auto protected_size = total_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                  ? total_size - dst_offset
-                                  : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                    ? total_size - dst_offset
+                                    : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
           auto ret = memcpy_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), args.data + src_offset,
                               static_cast<size_t>(size));
           if (ret != EOK) {
@@ -130,6 +130,12 @@ Status FormatTransferNc1hwc0Nhwc::TransFormat(const TransArgs &args, TransResult
   int size = GetSizeByDataType(args.src_data_type);
   auto total_size = GetItemNumByShape(args.dst_shape) * size;
   if (total_size <= 0) {
+    int64_t src_size = GetItemNumByShape(args.src_shape);
+    if (total_size == 0 && src_size == 0) {
+      result.length = static_cast<size_t>(total_size);
+      return SUCCESS;
+    }
+
     GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
            ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
     return PARAM_INVALID;
diff --git a/src/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc b/src/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc
index 481a64e9..638cc9eb 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc
@@ -134,6 +134,10 @@ Status TransFormatFromNchwToFzC04(const TransArgs &args, TransResult &result) {
                    GELOGE(INTERNAL_ERROR, "int64 mul overflow.A[%lld], B[%lld]", total_ele_cnt, size);
                    return INTERNAL_ERROR);
   int64_t dst_size = total_ele_cnt * size;
+  if (dst_size == 0) {
+    result.length = 0;
+    return SUCCESS;
+  }
 
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
@@ -219,6 +223,10 @@ Status PaddingNC(const TransArgs &args, TransArgs &args_tmp, std::shared_ptr<uint8_t> &dst) {
+  if (dst_size == 0) {
+    return SUCCESS;
+  }
+
   dst = std::shared_ptr<uint8_t>(new (std::nothrow) uint8_t[dst_size](), std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
diff --git a/src/ge/common/formats/format_transfers/format_transfer_nchw_nc1hwc0.cc b/src/ge/common/formats/format_transfers/format_transfer_nchw_nc1hwc0.cc
index 13e48f8c..b4e92cbc 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_nchw_nc1hwc0.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_nchw_nc1hwc0.cc
@@ -40,7 +40,7 @@ Status TransShapeNchwToNc1hwc0(const std::vector<int64_t> &src_shape, DataType d
   }
   dst_shape.clear();
   dst_shape.push_back(src_shape.at(kNchwN));
-  dst_shape.push_back((src_shape.at(kNchwC) - 1) / c0 + 1);
+  dst_shape.push_back(Ceil(src_shape.at(kNchwC), c0));
   dst_shape.push_back(src_shape.at(kNchwH));
   dst_shape.push_back(src_shape.at(kNchwW));
   dst_shape.push_back(c0);
@@ -74,25 +74,8 @@ Status CheckArgsForNchwToNc1hwc0(const TransArgs &args) {
 
   return SUCCESS;
 }
-}  // namespace
 
-Status FormatTransferNchwNc1hwc0::TransFormat(const TransArgs &args, TransResult &result) {
-  if (CheckArgsForNchwToNc1hwc0(args) != SUCCESS) {
-    return PARAM_INVALID;
-  }
-  // Guarantee the validity of parameters in check function
-  int size = GetSizeByDataType(args.src_data_type);
-  auto total_size = GetItemNumByShape(args.dst_shape) * size;
-  if (total_size <= 0) {
-    GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
-           ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
-    return PARAM_INVALID;
-  }
-  GELOGD(
-    "Begin to trans format from NCHW to NC1HWC0, src shape %s, data type "
-    "%s, dst shape %s memory size %ld",
-    ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(),
-    ShapeToString(args.dst_shape).c_str(), total_size);
+Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const int size, const int64_t total_size) {
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[total_size], std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY,
@@ -132,8 +115,8 @@ Status FormatTransferNchwNc1hwc0::TransFormat(const TransArgs &args, TransResult
             int64_t dst_index = c0_idx + w_head_addr;
             int64_t dst_offset = dst_index * size;
             auto protected_size = total_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                    ? total_size - dst_offset
-                                    : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                      ? total_size - dst_offset
+                                      : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
             int64_t cIdx = c0_idx + c1_idx * c0;
             int64_t srcIdx = n_idx * chw + cIdx * hw + h_idx * w + w_idx;
             auto src_offset = srcIdx * size;
@@ -150,7 +133,7 @@ Status FormatTransferNchwNc1hwc0::TransFormat(const TransArgs &args, TransResult
             } else {
               auto ret =
-                memset_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), 0, static_cast<size_t>(size));
+                  memset_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), 0, static_cast<size_t>(size));
               if (ret != EOK) {
                 GELOGE(INTERNAL_ERROR,
                        "Failed to set to 0 to "
@@ -169,6 +152,39 @@ Status FormatTransferNchwNc1hwc0::TransFormat(const TransArgs &args, TransResult
   result.length = static_cast<size_t>(total_size);
   return SUCCESS;
 }
+}  // namespace
+
+Status FormatTransferNchwNc1hwc0::TransFormat(const TransArgs &args, TransResult &result) {
+  if (CheckArgsForNchwToNc1hwc0(args) != SUCCESS) {
+    return PARAM_INVALID;
+  }
+  // Guarantee the validity of parameters in check function
+  int size = GetSizeByDataType(args.src_data_type);
+  auto total_size = GetItemNumByShape(args.dst_shape) * size;
+  if (total_size <= 0) {
+    int64_t src_size = GetItemNumByShape(args.src_shape);
+    if (total_size == 0 && src_size == 0) {
+      result.length = static_cast<size_t>(total_size);
+      return SUCCESS;
+    }
+
+    GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
+           ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
+    return PARAM_INVALID;
+  }
+  GELOGD(
+    "Begin to trans format from NCHW to NC1HWC0, src shape %s, data type "
+    "%s, dst shape %s memory size %ld",
+    ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(),
+    ShapeToString(args.dst_shape).c_str(), total_size);
+  if (GetDstDataAfterTrans(args, result, size, total_size) != SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "Failed to get data after trans, src shape %s, data type %s, dst shape %s, memory size %ld",
+           ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(),
+           ShapeToString(args.dst_shape).c_str(), total_size);
+    return INTERNAL_ERROR;
+  }
+  return SUCCESS;
+}
 
 Status FormatTransferNchwNc1hwc0::TransShape(Format src_format, const std::vector<int64_t> &src_shape,
                                              DataType data_type, Format dst_format, std::vector<int64_t> &dst_shape) {
diff --git a/src/ge/common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.cc b/src/ge/common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.cc
index b461e270..a5be94ff 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.cc
@@ -38,7 +38,7 @@ Status TransShapeNhwcToNc1hwc0(const std::vector<int64_t> &src_shape, DataType d
   }
   dst_shape.clear();
   dst_shape.push_back(src_shape.at(kNhwcN));
-  dst_shape.push_back((src_shape.at(kNhwcC) - 1) / c0 + 1);
+  dst_shape.push_back(Ceil(src_shape.at(kNhwcC), c0));
   dst_shape.push_back(src_shape.at(kNhwcH));
   dst_shape.push_back(src_shape.at(kNhwcW));
   dst_shape.push_back(c0);
@@ -119,8 +119,8 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const int size, const int64_t total_size) {
           int64_t dst_idx = c0_idx + w_head_addr;
           int64_t dst_offset = dst_idx * size;
           auto protected_size = total_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                  ? total_size - dst_offset
-                                  : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                    ? total_size - dst_offset
+                                    : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
           int64_t c_idx = c0_idx + c1_idx * c0;
           int64_t src_idx = n_idx * hwc + h_idx * wc + w_idx * c + c_idx;
           auto src_offset = src_idx * size;
@@ -161,6 +161,12 @@ Status FormatTransferNhwcNc1hwc0::TransFormat(const TransArgs &args, TransResult
   int size = GetSizeByDataType(args.src_data_type);
   auto total_size = GetItemNumByShape(args.dst_shape) * size;
   if (total_size <= 0) {
+    int64_t src_size = GetItemNumByShape(args.src_shape);
+    if (total_size == 0 && src_size == 0) {
+      result.length = static_cast<size_t>(total_size);
+      return SUCCESS;
+    }
+
     GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
            ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
     return PARAM_INVALID;
diff --git a/src/ge/common/formats/format_transfers/format_transfer_transpose.cc b/src/ge/common/formats/format_transfers/format_transfer_transpose.cc
index a523a326..ec309543 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_transpose.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_transpose.cc
@@ -27,22 +27,22 @@ namespace ge {
 namespace formats {
 namespace {
 std::map<Format, std::map<Format, std::vector<int64_t>>> perm_args{
-  {FORMAT_NCHW,
-   {{FORMAT_NHWC, std::vector<int64_t>({0, 2, 3, 1})},
-    {FORMAT_HWCN, std::vector<int64_t>({2, 3, 1, 0})},
-    {FORMAT_CHWN, std::vector<int64_t>({1, 2, 3, 0})}}},
-  {FORMAT_NHWC,
-   {{FORMAT_NCHW, std::vector<int64_t>({0, 3, 1, 2})},
-    {FORMAT_CHWN, std::vector<int64_t>({3, 1, 2, 0})},
-    {FORMAT_HWCN, std::vector<int64_t>({1, 2, 3, 0})}}},
-  {FORMAT_HWCN,
-   {{FORMAT_NCHW, std::vector<int64_t>({3, 2, 0, 1})},
-    {FORMAT_NHWC, std::vector<int64_t>({3, 0, 1, 2})},
-    {FORMAT_CHWN, std::vector<int64_t>({2, 0, 1, 3})}}},
-  {FORMAT_CHWN,
-   {{FORMAT_NCHW, std::vector<int64_t>({3, 0, 1, 2})},
-    {FORMAT_NHWC, std::vector<int64_t>({3, 1, 2, 0})},
-    {FORMAT_HWCN, std::vector<int64_t>({1, 2, 0, 3})}}},
+    {FORMAT_NCHW,
+     {{FORMAT_NHWC, std::vector<int64_t>({0, 2, 3, 1})},
+      {FORMAT_HWCN, std::vector<int64_t>({2, 3, 1, 0})},
+      {FORMAT_CHWN, std::vector<int64_t>({1, 2, 3, 0})}}},
+    {FORMAT_NHWC,
+     {{FORMAT_NCHW, std::vector<int64_t>({0, 3, 1, 2})},
+      {FORMAT_CHWN, std::vector<int64_t>({3, 1, 2, 0})},
+      {FORMAT_HWCN, std::vector<int64_t>({1, 2, 3, 0})}}},
+    {FORMAT_HWCN,
+     {{FORMAT_NCHW, std::vector<int64_t>({3, 2, 0, 1})},
+      {FORMAT_NHWC, std::vector<int64_t>({3, 0, 1, 2})},
+      {FORMAT_CHWN, std::vector<int64_t>({2, 0, 1, 3})}}},
+    {FORMAT_CHWN,
+     {{FORMAT_NCHW, std::vector<int64_t>({3, 0, 1, 2})},
+      {FORMAT_NHWC, std::vector<int64_t>({3, 1, 2, 0})},
+      {FORMAT_HWCN, std::vector<int64_t>({1, 2, 0, 3})}}},
 };
 
 bool IsShapeArgValid(const std::vector<int64_t> &src_shape, const std::vector<int64_t> &perm_arg) {
@@ -51,8 +51,8 @@ Status Transpose(const uint8_t *src, const std::vector<int64_t> &src_shape, DataType src_data_type,
   int64_t dst_ele_num = GetItemNumByShape(dst_shape);
   int64_t data_size = GetSizeByDataType(src_data_type);
   int64_t dst_size = data_size * dst_ele_num;
-  std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
 
   GELOGD("Begin to transpose, src shape %s, perm arg %s, dst shape %s, data type %s", JoinToString(src_shape).c_str(),
          JoinToString(perm_arg).c_str(), JoinToString(dst_shape).c_str(),
          TypeUtils::DataTypeToSerialString(src_data_type).c_str());
+  if (dst_ele_num == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
+  std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
 
   int64_t dst_index = 0;
   std::vector<int64_t> dst_indexes(dst_shape.size());
   while (dst_index < dst_ele_num) {
     auto src_offset = GenOffset(src_heads, dst_indexes) * data_size;
     auto dst_offset_bytes = dst_index * data_size;
     auto protected_size = dst_size - dst_offset_bytes < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                            ? dst_size - dst_offset_bytes
-                            : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                              ? dst_size - dst_offset_bytes
+                              : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
     auto ret = memcpy_s(dst.get() + dst_offset_bytes, static_cast<size_t>(protected_size), src + src_offset,
                         static_cast<size_t>(data_size));
     if (ret != EOK) {
diff --git a/src/ge/common/formats/formats.cc b/src/ge/common/formats/formats.cc
index 938f0888..d01d055b 100644
--- a/src/ge/common/formats/formats.cc
+++ b/src/ge/common/formats/formats.cc
@@ -24,6 +24,7 @@
 #include
 #include
 
+#include "common/formats/utils/formats_trans_utils.h"
 #include "framework/common/debug/ge_log.h"
 #include "framework/common/ge_inner_error_codes.h"
 #include "graph/utils/type_utils.h"
@@ -38,10 +39,13 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Status TransFormat(const TransArg
            TypeUtils::FormatToSerialString(args.dst_format).c_str());
     return UNSUPPORTED;
   }
-  if (args.data == nullptr) {
+
+  auto src_shape_size = GetItemNumByShape(args.src_shape);
+  if (args.data == nullptr && src_shape_size != 0) {
     GELOGE(PARAM_INVALID, "Invalid input null data");
     return PARAM_INVALID;
   }
+
   return transfer->TransFormat(args, result);
 }
 
@@ -71,6 +75,12 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Status TransDataType(const CastAr
            TypeUtils::DataTypeToSerialString(args.dst_data_type).c_str());
     return UNSUPPORTED;
   }
+
+  if (args.data == nullptr && args.src_data_size != 0) {
+    GELOGE(PARAM_INVALID, "Invalid input null data");
+    return PARAM_INVALID;
+  }
+
   return transfer->TransDataType(args, result);
 }
diff --git a/src/ge/common/formats/utils/formats_trans_utils.cc b/src/ge/common/formats/utils/formats_trans_utils.cc
index 35a0a073..23da0f74 100644
--- a/src/ge/common/formats/utils/formats_trans_utils.cc
+++ b/src/ge/common/formats/utils/formats_trans_utils.cc
@@ -69,11 +69,11 @@ bool IsShapeValid(const std::vector<int64_t> &shape) {
   }
   int64_t num = 1;
   for (auto dim : shape) {
-    if (dim < 1) {
-      GELOGE(PARAM_INVALID, "Invalid zero dim in the shape %s", ShapeToString(shape).c_str());
+    if (dim < 0) {
+      GELOGE(PARAM_INVALID, "Invalid negative dim in the shape %s", ShapeToString(shape).c_str());
       return false;
     }
-    if (kShapeItemNumMAX / dim < num) {
+    if (dim != 0 && kShapeItemNumMAX / dim < num) {
       GELOGE(PARAM_INVALID, "Shape overflow, the total count should be less than %ld!", kShapeItemNumMAX);
       return false;
     }
diff --git a/src/ge/common/formats/utils/formats_trans_utils.h b/src/ge/common/formats/utils/formats_trans_utils.h
index 310aaf38..a8fbd09b 100644
--- a/src/ge/common/formats/utils/formats_trans_utils.h
+++ b/src/ge/common/formats/utils/formats_trans_utils.h
@@ -64,6 +64,9 @@ bool IsShapeEqual(const GeShape &src, const GeShape &dst);
 
 template <typename T>
 T Ceil(T n1, T n2) {
+  if (n1 == 0) {
+    return 0;
+  }
   return (n2 != 0) ? (n1 - 1) / n2 + 1 : 0;
 }
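Note: the zero guard matters because truncating integer division makes (0 - 1) / 16 + 1 evaluate to 1, so Ceil(0, 16) used to report one block for an empty dimension. A quick check of the new semantics, assuming the ge::formats namespace these utilities live in:

    #include <cassert>
    #include <cstdint>
    #include "common/formats/utils/formats_trans_utils.h"

    void CeilSmokeTest() {
      assert(ge::formats::Ceil<int64_t>(0, 16) == 0);   // empty dim: no blocks (was 1 before)
      assert(ge::formats::Ceil<int64_t>(16, 16) == 1);
      assert(ge::formats::Ceil<int64_t>(17, 16) == 2);
      assert(ge::formats::Ceil<int64_t>(5, 0) == 0);    // existing divide-by-zero guard
    }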
diff --git a/src/ge/common/helper/model_cache_helper.cc b/src/ge/common/helper/model_cache_helper.cc
new file mode 100644
index 00000000..58c82138
--- /dev/null
+++ b/src/ge/common/helper/model_cache_helper.cc
@@ -0,0 +1,1707 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "common/ge/ge_util.h"
+#include "common/helper/model_cache_helper.h"
+#include "common/types.h"
+#include "framework/common/debug/ge_log.h"
+#include "framework/common/ge_types.h"
+#include "framework/common/helper/model_helper.h"
+#include "framework/common/util.h"
+#include "graph/detail/attributes_holder.h"
+#include "graph/detail/model_serialize_imp.h"
+#include "graph/load/new_model_manager/davinci_model_parser.h"
+#include "graph/model.h"
+#include "graph/utils/graph_utils.h"
+#include "graph/utils/tensor_utils.h"
+#include "init/gelib.h"
+#include "proto/ge_ir.pb.h"
+
+using namespace std;
+
+namespace {
+const char *const kGraphName = "temp_name";
+const char *const kDpop = "DPOP";
+const char *const kDpopFunction = "dpop_function";
+// Keys of json
+const char *const kNodeNum = "nodeNum";
+const char *const kEdgeNum = "edgeNum";
+const char *const kGraphHash = "graphHash";
+const char *const kNodeHash = "nodeHash";
+const char *const kHash = "hash";
+const char *const kSessionId = "sessionId";
+const char *const kDeviceId = "deviceId";
+const char *const kJobId = "jobId";
+const char *const kGraphMemMaxSize = "graphMemMaxSize";
+const char *const kVarMemMaxSize = "varMemMaxSize";
+const char *const kVarMemLogicBase = "varMemLogicBase";
+const char *const kUseMaxMemSize = "useMaxMemSize";
+const char *const kMemResourceMap = "memResourceMap";
+const char *const kMemType = "memType";
+const char *const kTotalSize = "totalSize";
+const char *const kVarMemSize = "varMemSize";
+const char *const kVarResource = "varResource";
+const char *const kVarAddrMgrMap = "varAddrMgrMap";
+const char *const kName = "name";
+const char *const kAddress = "address";
+const char *const kOffset = "offset";
+const char *const kMemoryType = "memoryType";
+const char *const kTensorDesc = "tensorDesc";
+const char *const kDataType = "dataType";
+const char *const kShape = "shape";
+const char *const kLayout = "layout";
+const char *const kOriginDataType = "originDataType";
+const char *const kOriginShape = "originShape";
+const char *const kOriginLayout = "originLayout";
+const char *const kRealDimCnt = "realDimCnt";
+const char *const kCurVarTensorDescMap = "curVarTensorDescMap";
+const char *const kTransRoads = "transRoads";
+const char *const kTransRoad = "transRoad";
+const char *const kNodeType = "nodeType";
+const char *const kInputTensorDesc = "inputTensorDesc";
+const char *const kOutputTensorDesc = "outputTensorDesc";
+const char *const kChangedGraphId = "changedGraphId";
+const char *const kAllocatedGraphId = "allocatedGraphId";
+const char *const kGraphId = "graphId";
+const char *const kVarBroadcastInfo = "varBroadcastInfo";
+const char *const kBroadcastName = "broadcastName";
+const char *const kIdx = "idx";
+const char *const kInputOffset = "inputOffset";
+const char *const kInputSize = "inputSize";
+const char *const kOutputOffset = "outputOffset";
+const char *const kOutputSize = "outputSize";
+// Suffix of cache files
+const char *const kBeforeVarManagerSuffix = "_before_build_var_manager.json";
+const char *const kAfterVarManagerSuffix = "_after_build_var_manager.json";
+const char *const kManifestSuffix = ".manifest";
+const char *const kOmSuffix = ".om";
+}  // namespace
+const char *const kAfterVarManagerSuffix = "_after_build_var_manager.json";
+const char *const kManifestSuffix = ".manifest";
+const char *const kOmSuffix = ".om";
+}  // namespace
+
+namespace ge {
+map<uint32_t, uint32_t> ModelCacheHelper::graph_id_run_times_;
+ModelCacheHelper::ModelCacheHelper(uint64_t session_id, uint32_t graph_id, ComputeGraphPtr &compute_graph)
+    : session_id_(session_id),
+      graph_id_(graph_id),
+      compute_graph_(compute_graph),
+      is_cache_path_valid_for_output(false) {
+  if (graph_id_run_times_.count(graph_id) == 0) {
+    graph_id_run_times_[graph_id] = 1;
+  } else {
+    graph_id_run_times_[graph_id] = graph_id_run_times_[graph_id] + 1;
+  }
+  for (const auto &node : compute_graph_->GetDirectNode()) {
+    bool is_variable = (node->GetType() == VARIABLE) || (node->GetType() == VARIABLEV2) ||
+                       (node->GetType() == VARHANDLEOP) || (node->GetType() == CONSTANTOP);
+    if (!is_variable) {
+      continue;
+    }
+    var_names_.insert(node->GetName());
+  }
+  std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
+  if (instance_ptr != nullptr && instance_ptr->IsIncreBuild()) {
+    std::string cache_path = instance_ptr->GetIncreBuildCachePath();
+    GELOGD("Incre build path conf: %s", cache_path.c_str());
+    string fake_file_path = cache_path + to_string(graph_id_) + kManifestSuffix;
+    if (CheckOutputPathValid(fake_file_path)) {
+      is_cache_path_valid_for_output = true;
+    } else {
+      GELOGW("Invalid cache path for output.");
+    }
+    std::string real_cache_path = RealPath(cache_path.c_str());
+    if (real_cache_path.empty()) {
+      GELOGW("Invalid incre build cache path conf: %s", cache_path.c_str());
+      return;
+    }
+    cache_path_ = real_cache_path + '/';
+    GELOGD("Try to use incre build cache path: %s", cache_path_.c_str());
+  }
+}
+
+bool ModelCacheHelper::IsModelCacheHit() const {
+  CacheInfo cache_info;
+  if (GetCacheInfo(cache_info) != SUCCESS) {
+    GELOGI("Get cache info of graph id[%u] failed.", graph_id_);
+    return false;
+  }
+  // Check number of nodes and edges first.
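+  // Node and edge counts are cheap to compare, so mismatches are rejected
+  // before the more expensive graph-hash and per-node hash checks below.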
+  if (cache_info.node_num != compute_graph_->GetDirectNodesSize()) {
+    GELOGI("Graph id[%u] cache miss: the node number of the graph does not match the cache info.", graph_id_);
+    return false;
+  }
+  size_t edge_num = 0;
+  for (const auto &node : compute_graph_->GetDirectNode()) {
+    for (const auto &anchor : node->GetAllInAnchors()) {
+      edge_num += anchor->GetPeerAnchors().size();
+    }
+  }
+  if (cache_info.edge_num != edge_num) {
+    GELOGI("Graph id[%u] cache miss: the edge number of the graph does not match the cache info.", graph_id_);
+    return false;
+  }
+  size_t compute_graph_hash;
+  auto ret = GetComputeGraphHash(compute_graph_hash);
+  if (ret != SUCCESS || cache_info.graph_hash != compute_graph_hash) {
+    GELOGI("Graph id[%u] cache miss: the hash code of the graph does not match the cache info.", graph_id_);
+    return false;
+  }
+  if (!IsNodeHashSameAsCache(cache_info.nodes_hash)) {
+    GELOGI("Graph id[%u] cache miss: the hash code of node does not match the cache info.", graph_id_);
+    return false;
+  }
+
+  string var_manager_cache = to_string(graph_id_) + to_string(graph_id_run_times_[graph_id_]) + kBeforeVarManagerSuffix;
+  Json var_manager_json;
+  if (LoadJsonFromFile(var_manager_cache, var_manager_json) != SUCCESS) {
+    GELOGW("Fail to load json from cache file: %s", var_manager_cache.c_str());
+    return false;
+  }
+  if (!IsVarManagerSameAsCache(var_manager_json)) {
+    GELOGI("Graph id[%u] cache miss: the VarManager does not match the cache info.", graph_id_);
+    return false;
+  }
+  GELOGI("Graph id[%u] cache hit.", graph_id_);
+  return true;
+}
+
+Status ModelCacheHelper::RefreshComputeGraph(const ComputeGraphPtr &compute_graph) {
+  if (compute_graph->IsValid()) {
+    compute_graph_ = compute_graph;
+    var_names_.clear();
+    for (const auto &node : compute_graph_->GetDirectNode()) {
+      bool is_variable = (node->GetType() == VARIABLE) || (node->GetType() == VARIABLEV2) ||
+                         (node->GetType() == VARHANDLEOP) || (node->GetType() == CONSTANTOP);
+      if (!is_variable) {
+        continue;
+      }
+      var_names_.insert(node->GetName());
+    }
+    return SUCCESS;
+  } else {
+    GELOGW("Invalid compute graph.");
+    return FAILED;
+  }
+}
+
+Status ModelCacheHelper::ClearCache(uint32_t graph_id) const {
+  if (!is_cache_path_valid_for_output) {
+    GELOGW("Invalid cache path.");
+    return SUCCESS;
+  }
+  string manifest_file = cache_path_ + to_string(graph_id) + kManifestSuffix;
+  string manifest_file_path = RealPath(manifest_file.c_str());
+  int ret;
+  if (!manifest_file_path.empty()) {
+    ret = remove(manifest_file_path.c_str());
+    // If remove file failed, print the warning log
+    if (ret != 0) {
+      GELOGW("Clear cache [%s] failed.", manifest_file_path.c_str());
+    }
+  }
+  string before_var_manager_file = cache_path_ + to_string(graph_id) + kBeforeVarManagerSuffix;
+  string before_var_manager_file_path = RealPath(before_var_manager_file.c_str());
+  if (!before_var_manager_file_path.empty()) {
+    ret = remove(before_var_manager_file_path.c_str());
+    if (ret != 0) {
+      GELOGW("Clear cache [%s] failed.", before_var_manager_file_path.c_str());
+    }
+  }
+  string after_var_manager_file = cache_path_ + to_string(graph_id) + kAfterVarManagerSuffix;
+  string after_var_manager_file_path = RealPath(after_var_manager_file.c_str());
+  if (!after_var_manager_file_path.empty()) {
+    ret = remove(after_var_manager_file_path.c_str());
+    if (ret != 0) {
+      GELOGW("Clear cache [%s] failed.", after_var_manager_file_path.c_str());
+    }
+  }
+  string om_file = cache_path_ + to_string(graph_id) + kOmSuffix;
+  string om_file_path = RealPath(om_file.c_str());
+  if
(!om_file_path.empty()) { + ret = remove(om_file_path.c_str()); + if (ret != 0) { + GELOGW("Clear cache [%s] failed.", om_file_path.c_str()); + } + } + return SUCCESS; +} + +Status ModelCacheHelper::RecoverVarManagerFromCache() const { + string var_manager_cache = to_string(graph_id_) + to_string(graph_id_run_times_[graph_id_]) + kAfterVarManagerSuffix; + Json var_manager_json; + if (LoadJsonFromFile(var_manager_cache, var_manager_json) != SUCCESS) { + GELOGW("Fail to load json from cache file: %s", var_manager_cache.c_str()); + return FAILED; + } + + Json mem_resource_json = move(var_manager_json[kMemResourceMap]); + auto ret = RecoverMemResource(mem_resource_json); + if (ret != SUCCESS) { + GELOGW("Recover VarManager from cache failed.[MemResource]"); + return FAILED; + } + Json var_resource_json = move(var_manager_json[kVarResource]); + ret = RecoverAllocatedGraphId(var_resource_json[kAllocatedGraphId]); + if (ret != SUCCESS) { + GELOGW("Recover VarManager from cache failed.[AllocatedGraphId]"); + return FAILED; + } + ret = RecoverChangedGraphId(var_resource_json[kChangedGraphId]); + if (ret != SUCCESS) { + GELOGW("Recover VarManager from cache failed.[ChangedGraphId]"); + return FAILED; + } + ret = RecoverBroadcastInfo(var_resource_json[kVarBroadcastInfo]); + if (ret != SUCCESS) { + GELOGW("Recover VarManager from cache failed.[VarBroadcastInfo]"); + return FAILED; + } + ret = RecoverVarAddrAndTensorDesc(var_resource_json[kVarAddrMgrMap]); + if (ret != SUCCESS) { + GELOGW("Recover VarManager from cache failed.[VarAddrMgrMap & CurVarTensorDesc]"); + return FAILED; + } + ret = RecoverTransRoads(var_resource_json[kTransRoads]); + if (ret != SUCCESS) { + GELOGW("Recover VarManager from cache failed.[TransRoads]"); + return FAILED; + } + GELOGI("Recover VarManager from cache[%s] success.", cache_path_.c_str()); + return SUCCESS; +} + +Status ModelCacheHelper::RecompileNodes(GeModelPtr &ge_model) { + std::shared_ptr instance = ge::GELib::GetInstance(); + if (instance == nullptr || !instance->InitFlag()) { + GELOGW("RecompileNodes failed."); + return ge::GE_CLI_GE_NOT_INITIALIZED; + } + auto compute_graph = GraphUtils::GetComputeGraph(ge_model->GetGraph()); + vector nodes; + for (auto &node : compute_graph->GetDirectNode()) { + if (node == nullptr) { + continue; + } + auto op_desc = node->GetOpDesc(); + if (op_desc == nullptr) { + continue; + } + + string kernel_lib_name = op_desc->GetOpKernelLibName(); + if (kernel_lib_name.empty()) { + // reset op kernel lib + (void)instance->DNNEngineManagerObj().GetDNNEngineName(op_desc); + kernel_lib_name = op_desc->GetOpKernelLibName(); + if (kernel_lib_name.empty()) { + GELOGW("Get node:%s, type:%s kernel lib failed.", node->GetName().c_str(), op_desc->GetType().c_str()); + return INTERNAL_ERROR; + } + } + OpsKernelInfoStorePtr kernel_info = instance->OpsKernelManagerObj().GetOpsKernelInfoStore(kernel_lib_name); + if (kernel_info == nullptr) { + GELOGW("Get op %s ops kernel info store failed", node->GetName().c_str()); + return INTERNAL_ERROR; + } + auto ge_desc = MakeShared(op_desc); + if (ge_desc == nullptr) { + GELOGE(GE_GRAPH_MEMORY_ALLOC_FAILED, "Fail to malloc op desc."); + return FAILED; + } + // TBE compile op + vector node_vec = {node}; + auto ret = kernel_info->CompileOp(node_vec); + if (ret != ge::SUCCESS) { + GELOGW("Compile single op failed, node name is %s", node->GetName().c_str()); + return ret; + } + } + // Reset TBE Kernels + TBEKernelStore tbe_kernel_store; + for (const ge::NodePtr &n : compute_graph->GetDirectNode()) { + auto 
node_op_desc = n->GetOpDesc();
+    GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue);
+    TBEKernelPtr tbe_kernel = node_op_desc->TryGetExtAttr(ge::OP_EXTATTR_NAME_TBE_KERNEL, TBEKernelPtr());
+    GE_IF_BOOL_EXEC(tbe_kernel == nullptr, continue);
+    tbe_kernel_store.AddTBEKernel(tbe_kernel);
+    GELOGD("Add tbe kernel bin %s", tbe_kernel->GetName().c_str());
+  }
+  if (!tbe_kernel_store.Build()) {
+    GELOGW("TBE Kernels store build failed!");
+    return FAILED;
+  }
+  ge_model->SetTBEKernelStore(tbe_kernel_store);
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::GetNodesHash(map<std::string, size_t> &hash_map) const {
+  vector<NodePtr> nodes;
+  GraphUtils::TopologicalSortingByName(compute_graph_, nodes);
+  ModelSerializeImp model_serialize_imp;
+  std::hash<string> node_hash;
+  for (const auto &node : nodes) {
+    if (node == nullptr) {
+      continue;
+    }
+    proto::OpDef op_def;
+    bool is_framework_op = (node->GetType() == FRAMEWORKOP);
+    string type;
+    bool is_dpop = false;
+    string origin_dpop_name;
+    if (is_framework_op) {
+      if (AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE, type)) {
+        GELOGI("Get original type of framework op[%s], %s.", node->GetName().c_str(), type.c_str());
+        if (type == kDpop) {
+          GELOGI("DPOP op found:%s.", node->GetName().c_str());
+          origin_dpop_name = node->GetName();
+          node->GetOpDesc()->SetName(kDpopFunction);
+          is_dpop = true;
+        }
+      } else {
+        GELOGW("Get original type of framework op[%s] failed.", node->GetName().c_str());
+      }
+    }
+    bool ret = model_serialize_imp.SerializeNode(node, &op_def, is_framework_op);
+    op_def.set_id(0);
+    if (is_dpop) {
+      node->GetOpDesc()->SetName(origin_dpop_name);
+    }
+    if (!ret) {
+      GELOGW("Fail to serialize node[%s].", node->GetName().c_str());
+      return INTERNAL_ERROR;
+    }
+    string prototxt;
+    ret = google::protobuf::TextFormat::PrintToString(op_def, &prototxt);
+    if (!ret) {
+      GELOGW("Print OpDef to string failed.");
+      hash_map.clear();
+      return INTERNAL_ERROR;
+    }
+    size_t hash_code = node_hash(prototxt);
+    if (is_dpop) {
+      hash_map[kDpopFunction] = hash_code;
+    } else {
+      hash_map[node->GetName()] = hash_code;
+    }
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::GetComputeGraphHash(size_t &hash) const {
+  proto::GraphDef graph_proto;
+  ModelSerializeImp model_serialize_imp;
+  // The name of compute graph may be generated randomly, so replace it temporarily.
+  const string origin_name = compute_graph_->GetName();
+  compute_graph_->SetName(kGraphName);
+  bool serialize_ret = model_serialize_imp.SerializeGraph(compute_graph_, &graph_proto);
+  graph_proto.clear_op();
+  // Restore the original name before any early return.
+  compute_graph_->SetName(origin_name);
+  if (!serialize_ret) {
+    GELOGW("Serialize graph failed.");
+    hash = 0;
+    return INTERNAL_ERROR;
+  }
+  // Generate proto text of GraphDef
+  string prototxt;
+  bool print_ret = google::protobuf::TextFormat::PrintToString(graph_proto, &prototxt);
+  if (!print_ret) {
+    GELOGW("Print GraphDef to string failed.");
+    hash = 0;
+    return INTERNAL_ERROR;
+  }
+  // Get the hash code of proto text
+  std::hash<string> graph_hash;
+  hash = graph_hash(prototxt);
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::SaveJsonToFile(const string &file_name, const Json &json) const {
+  if (!is_cache_path_valid_for_output) {
+    GELOGW("Invalid cache path.");
+    return PARAM_INVALID;
+  }
+  // Create (or truncate) the cache file with owner-only permissions before writing.
+  string real_path = RealPath(cache_path_.c_str());
+  if (real_path.empty()) {
+    GELOGW("File path is invalid, please check cache path: %s", cache_path_.c_str());
+    return FAILED;
+  }
+  const string path = cache_path_ + file_name;
+  const int FILE_AUTHORITY = 0600;
+  int fd = open(path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, FILE_AUTHORITY);
+  if (fd < 0) {
+    GELOGW("Fail to open the file: %s.", path.c_str());
+    return INTERNAL_ERROR;
+  }
+  if (close(fd) != 0) {
+    GELOGW("Fail to close the file: %s.", path.c_str());
+    return INTERNAL_ERROR;
+  }
+
+  // Write json into cache file
+  ofstream ofs;
+  ofs.open(path);
+  if (!ofs.is_open()) {
+    GELOGW("Fail to open the file: %s.", path.c_str());
+    return INTERNAL_ERROR;
+  }
+  ofs << json << std::endl;
+  ofs.close();
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::LoadJsonFromFile(const string &file_name, Json &json) const {
+  if (!json.is_null()) {
+    GELOGW("Input param json type should be null.");
+    return PARAM_INVALID;
+  }
+  string real_path = RealPath(cache_path_.c_str());
+  if (real_path.empty()) {
+    GELOGW("File path is invalid, please check cache path: %s", cache_path_.c_str());
+    return FAILED;
+  }
+  const string path = cache_path_ + file_name;
+  if (!CheckInputPathValid(path)) {
+    GELOGW("Invalid cache path for input: %s.", path.c_str());
+    return FAILED;
+  }
+  string cache_real_path = RealPath(path.c_str());
+  if (cache_real_path.empty()) {
+    GELOGI("File[%s] is not found.", path.c_str());
+    return FAILED;
+  }
+  // Read json from cache file
+  ifstream ifs;
+  ifs.open(path);
+  if (!ifs.is_open()) {
+    GELOGW("Fail to open the file: %s.", path.c_str());
+    return INTERNAL_ERROR;
+  }
+  ifs >> json;
+  if (!json.is_object()) {
+    GELOGW("Fail to load the json file: %s.", path.c_str());
+    return INTERNAL_ERROR;
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::SaveCacheInfoToCache() const {
+  // Generate cache json
+  // example: {"edgeNum":6,"nodeNum":7,"graphHash":134714827475991356}
+  Json cache_json;
+  try {
+    cache_json[kNodeNum] = compute_graph_->GetDirectNodesSize();
+    size_t edge_num = 0;
+    for (const auto &node : compute_graph_->GetDirectNode()) {
+      for (const auto &anchor : node->GetAllInAnchors()) {
+        edge_num += anchor->GetPeerAnchors().size();
+      }
+    }
+    cache_json[kEdgeNum] = edge_num;
+    size_t hash = 0;
+    auto ret = GetComputeGraphHash(hash);
+    if (ret != SUCCESS) {
+      GELOGW("Error occurred when generating the graph hash code.");
+      return ret;
+    }
+    cache_json[kGraphHash] = hash;
+    Json nodes_hash_json;
+    ret = GetNodesHashMapJson(nodes_hash_json);
+    if (ret != SUCCESS) {
+      GELOGW("Error occurred when generating the nodes hash code.");
+      return ret;
+    }
+    cache_json[kNodeHash] = nodes_hash_json;
+  } catch (const std::exception &e) {
+    GELOGW("Fail to generate cache info json. Error message: %s", e.what());
+    return INTERNAL_ERROR;
+  }
+  string cache_manifest = to_string(graph_id_) + to_string(graph_id_run_times_[graph_id_]) + kManifestSuffix;
+
+  auto ret = SaveJsonToFile(cache_manifest, cache_json);
+  if (ret != SUCCESS) {
+    GELOGW("Fail to save cache info to json file, path: %s.", cache_path_.c_str());
+    return ret;
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::GetCacheInfo(CacheInfo &cache_info) const {
+  string cache_manifest = to_string(graph_id_) + to_string(graph_id_run_times_[graph_id_]) + kManifestSuffix;
+  Json cache_json;
+  if (LoadJsonFromFile(cache_manifest, cache_json) != SUCCESS) {
+    GELOGW("Fail to load json from cache file: %s", cache_manifest.c_str());
+    return INTERNAL_ERROR;
+  }
+  if (!cache_json.is_object()) {
+    GELOGW("Manifest should be a json object.");
+    return INTERNAL_ERROR;
+  }
+  try {
+    cache_info.node_num = cache_json[kNodeNum];
+    cache_info.edge_num = cache_json[kEdgeNum];
+    cache_info.graph_hash = cache_json[kGraphHash];
+    Json nodes_hash_json = cache_json[kNodeHash];
+    if (!(nodes_hash_json.is_null() || nodes_hash_json.is_array())) {
+      GELOGW("Nodes hash in cache should be null or an array.");
+      return FAILED;
+    }
+    for (const auto &iter : nodes_hash_json) {
+      cache_info.nodes_hash[iter[kName].get<std::string>()] = iter[kHash].get<size_t>();
+    }
+  } catch (const std::exception &e) {
+    GELOGW("Fail to get info from json file. Error message: %s", e.what());
+    return INTERNAL_ERROR;
+  }
+  return SUCCESS;
+}
+
+bool ModelCacheHelper::IsAllocatedGraphIdSameAsCache(Json &json) const {
+  if (!(json.is_null() || json.is_array())) {
+    GELOGW("Input param json type should be null or array.");
+    return false;
+  }
+  // Compare allocated graph id info between json and VarManager
+  std::unordered_map<std::string, uint32_t> allocated_graph_id;
+  auto ret = ParseAllocatedGraphIdFromJson(json, allocated_graph_id);
+  if (ret != SUCCESS) {
+    GELOGW("Fail to parse AllocatedGraphId from Json.");
+    return false;
+  }
+  for (const auto &iter : allocated_graph_id) {
+    uint32_t graph_id = 0;
+    ret = VarManager::Instance(session_id_)->GetAllocatedGraphId(iter.first, graph_id);
+    if (ret != SUCCESS) {
+      GELOGW("Fail to find allocated graph id of var[%s].", iter.first.c_str());
+      return false;
+    }
+    if (graph_id != iter.second) {
+      GELOGW("The allocated graph id of variable[%s] in cache is different from VarManager.", iter.first.c_str());
+      return false;
+    }
+  }
+  return true;
+}
+
+bool ModelCacheHelper::IsNodeHashSameAsCache(const map<std::string, size_t> &hash_map) const {
+  map<std::string, size_t> cur_hash_map;
+  GetNodesHash(cur_hash_map);
+  if (hash_map.size() != cur_hash_map.size()) {
+    GELOGI("The number of hash code is different from cache info.");
+    return false;
+  }
+  for (const auto &iter : cur_hash_map) {
+    if (hash_map.count(iter.first) == 0) {
+      GELOGI("Node[%s] is not found in cache info.", iter.first.c_str());
+      return false;
+    }
+    if (hash_map.at(iter.first) != iter.second) {
+      GELOGI("The hash code of node[%s] is different from cache info.", iter.first.c_str());
+      return false;
+    }
+  }
+  return true;
+}
+
+bool ModelCacheHelper::IsMemResourceSameAsCache(Json &json) const {
+  if (!(json.is_null() || json.is_array())) {
+    GELOGW("Input param json type should be null or array.");
+    return false;
+  }
+  // Compare var mem size info between json and VarManager
+  std::map<rtMemType_t, int64_t> var_mem_size;
+  auto ret = ParseMemResourceFromJson(json, var_mem_size);
+  if (ret != SUCCESS) {
+    GELOGW("Fail to parse MemResource from Json.");
+    return false;
+  }
+  for (const auto &iter : var_mem_size) {
+    int64_t mem_size =
VarManager::Instance(session_id_)->GetVarMemSize(iter.first); + if (mem_size != iter.second) { + GELOGW("The var mem size of memory_type[%u] in cache is different from VarManager.", iter.first); + return false; + } + } + return true; +} + +bool ModelCacheHelper::IsChangedGraphIdSameAsCache(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return false; + } + // Compare variable changed graph id info between json and VarManager + std::unordered_map changed_graph_id; + auto ret = ParseChangedGraphIdFromJson(json, changed_graph_id); + if (ret != SUCCESS) { + GELOGW("Fail to parse ChangedGraphId from Json."); + return false; + } + for (const auto &iter : changed_graph_id) { + uint32_t graph_id = 0; + ret = VarManager::Instance(session_id_)->GetChangedGraphId(iter.first, graph_id); + if (ret != SUCCESS) { + GELOGW("Fail to find changed graph id of var[%s].", iter.first.c_str()); + return false; + } + if (graph_id != iter.second) { + GELOGW("The changed graph id of variable[%s] in cache is different from VarManager.", iter.first.c_str()); + return false; + } + } + return true; +} + +bool ModelCacheHelper::IsCurVarTensorDescSameAsCache(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return false; + } + // Compare variable tensor desc info between json and VarManager + std::unordered_map cur_var_tensor_desc; + auto ret = ParseCurVarTensorDescMapFromJson(json, cur_var_tensor_desc); + if (ret != SUCCESS) { + GELOGW("Fail to parse CurVarTensorDesc from Json."); + return false; + } + for (const auto &iter : cur_var_tensor_desc) { + GeTensorDesc tensor_desc; + ret = VarManager::Instance(session_id_)->GetCurVarDesc(iter.first, tensor_desc); + if (ret != SUCCESS) { + GELOGW("Fail to find tensor desc of var[%s].", iter.first.c_str()); + return false; + } + uint32_t l_real_dim_cnt = 0; + uint32_t r_real_dim_cnt = 0; + TensorUtils::GetRealDimCnt(tensor_desc, l_real_dim_cnt); + TensorUtils::GetRealDimCnt(iter.second, r_real_dim_cnt); + if ((tensor_desc.GetDataType() != iter.second.GetDataType()) || + (tensor_desc.GetOriginDataType() != iter.second.GetOriginDataType()) || + (tensor_desc.GetFormat() != iter.second.GetFormat()) || + (tensor_desc.GetOriginFormat() != iter.second.GetOriginFormat()) || + (tensor_desc.GetShape().ToString() != iter.second.GetShape().ToString()) || + (tensor_desc.GetOriginShape().ToString() != iter.second.GetOriginShape().ToString()) || + (l_real_dim_cnt != r_real_dim_cnt)) { + GELOGW("The var tensor desc of variable[%s] in cache is different from VarManager.", iter.first.c_str()); + return false; + } + } + return true; +} + +bool ModelCacheHelper::IsVarAddrMgrMapSameAsCache(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return false; + } + // Compare variable address info between json and VarManager + std::vector> var_addr_mgr_vector; + std::unordered_set var_offset_set; + auto ret = ParseVarAddrMgrMapFromJson(json, var_addr_mgr_vector, var_offset_set); + if (ret != SUCCESS) { + GELOGW("Fail to parse VarAddrMgrMap from Json."); + return false; + } + for (const auto &iter : var_addr_mgr_vector) { + uint8_t *dev_ptr = nullptr; + rtMemType_t memory_type; + ret = VarManager::Instance(session_id_)->GetVarAddr(iter.first, iter.second.tensor_desc, &dev_ptr, memory_type); + if (ret != SUCCESS) { + GELOGW("Fail to find tensor desc of var[%s].", 
iter.first.c_str()); + return false; + } + // Compare memory type and logic address + if (iter.second.memory_type != memory_type || iter.second.address != dev_ptr) { + GELOGW("The VarAddrMgr of variable[%s] in cache is different from VarManager.", iter.first.c_str()); + return false; + } + } + return true; +} + +bool ModelCacheHelper::IsBroadcastInfoSameAsCache(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return false; + } + // Compare broadcast info between json and VarManager + std::unordered_map var_broadcast_info; + auto ret = ParseBroadcastInfoFromJson(json, var_broadcast_info); + if (ret != SUCCESS) { + GELOGW("Fail to parse BroadcastInfo from Json."); + return false; + } + for (const auto &iter : var_broadcast_info) { + VarBroadCastInfo broadcast_info; + if (VarManager::Instance(session_id_)->GetBroadCastInfo(graph_id_, iter.first, broadcast_info) != SUCCESS) { + GELOGW("Fail to find broadcast info of var[%s].", iter.first.c_str()); + return false; + } + if (iter.second.var_name != broadcast_info.var_name || iter.second.idx != broadcast_info.idx || + iter.second.input_size != broadcast_info.input_size || + iter.second.input_offset != broadcast_info.input_offset || + iter.second.output_size != broadcast_info.output_size || + iter.second.output_offset != broadcast_info.output_offset) { + GELOGW("The BroadcastInfo of variable[%s] in cache is different from VarManager.", iter.first.c_str()); + return false; + } + } + return true; +} + +bool ModelCacheHelper::IsTransRoadsSameAsCache(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return false; + } + // Compare trans road between json and VarManager + std::unordered_map> trans_roads; + auto ret = ParseTransRoadsFromJson(json, trans_roads); + if (ret != SUCCESS) { + GELOGW("Fail to parse TransRoads from Json."); + return false; + } + for (const auto &iter : trans_roads) { + VarTransRoad *trans_road; + trans_road = VarManager::Instance(session_id_)->GetTransRoad(iter.first); + if (trans_road == nullptr) { + GELOGW("Fail to find trans road of var[%s].", iter.first.c_str()); + return false; + } + if (trans_road->size() != iter.second.size()) { + GELOGW("The TransRoad of variable[%s] in cache is different from VarManager.", iter.first.c_str()); + return false; + } + // Compare every trans node in trans road. 
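+    // A cached road matches only when every entry agrees on the node type and
+    // on the input/output tensor descriptions.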
+ for (size_t idx = 0; idx < trans_road->size(); idx += 1) { + if (!(trans_road->at(idx).node_type == iter.second.at(idx).node_type && + trans_road->at(idx).input == iter.second.at(idx).input && + trans_road->at(idx).output == iter.second.at(idx).output)) { + GELOGW("The TransRoad of variable[%s] in cache is different from VarManager.", iter.first.c_str()); + return false; + } + } + } + return true; +} + +bool ModelCacheHelper::IsVarManagerParamSameAsCache(Json &json) const { + if (!json.is_object()) { + GELOGW("Input param json type should be object."); + return false; + } + try { + if (json[kSessionId].get() != session_id_) { + GELOGW("Check VarManager cache failed.[sessionId]"); + return false; + } + if (json[kDeviceId].get() != VarManager::Instance(session_id_)->DeviceId()) { + GELOGW("Check VarManager cache failed.[deviceId]"); + return false; + } + if (json[kJobId].get() != VarManager::Instance(session_id_)->JobId()) { + GELOGW("Check VarManager cache failed.[jobId]"); + return false; + } + if (json[kGraphMemMaxSize].get() != VarManager::Instance(session_id_)->GetGraphMemoryMaxSize()) { + GELOGW("Check VarManager cache failed.[graphMemMaxSize]"); + return false; + } + if (json[kVarMemMaxSize].get() != VarManager::Instance(session_id_)->GetVarMemMaxSize()) { + GELOGW("Check VarManager cache failed.[varMemMaxSize]"); + return false; + } + if (json[kVarMemLogicBase].get() != VarManager::Instance(session_id_)->GetVarMemLogicBase()) { + GELOGW("Check VarManager cache failed.[varMemLogicBase]"); + return false; + } + if (json[kUseMaxMemSize].get() != VarManager::Instance(session_id_)->GetUseMaxMemorySize()) { + GELOGW("Check VarManager cache failed.[useMaxMemSize]"); + return false; + } + } catch (const std::exception &e) { + GELOGW("Fail to check VarManager json. Error message: %s", e.what()); + return false; + } + return true; +} + +bool ModelCacheHelper::IsVarManagerSameAsCache(Json &json) const { + if (!json.is_object()) { + GELOGW("Input param json type should be object."); + return false; + } + try { + if (!IsVarManagerParamSameAsCache(json)) { + GELOGW("Check VarManager cache failed.[Param]"); + return false; + } + Json mem_resource_json = move(json[kMemResourceMap]); + auto ret = IsMemResourceSameAsCache(mem_resource_json); + if (!ret) { + GELOGW("Check VarManager cache failed.[MemResource]"); + return false; + } + Json var_resource_json = move(json[kVarResource]); + ret = IsAllocatedGraphIdSameAsCache(var_resource_json[kAllocatedGraphId]); + if (!ret) { + GELOGW("Check VarManager cache failed.[AllocatedGraphId]"); + return false; + } + ret = IsChangedGraphIdSameAsCache(var_resource_json[kChangedGraphId]); + if (!ret) { + GELOGW("Check VarManager cache failed.[ChangedGraphId]"); + return false; + } + ret = IsBroadcastInfoSameAsCache(var_resource_json[kVarBroadcastInfo]); + if (!ret) { + GELOGW("Check VarManager cache failed.[VarBroadcastInfo]"); + return false; + } + ret = IsCurVarTensorDescSameAsCache(var_resource_json[kCurVarTensorDescMap]); + if (!ret) { + GELOGW("Check VarManager cache failed.[CurVarTensorDesc]"); + return false; + } + ret = IsVarAddrMgrMapSameAsCache(var_resource_json[kVarAddrMgrMap]); + if (!ret) { + GELOGW("Check VarManager cache failed.[VarAddrMgrMap]"); + return false; + } + ret = IsTransRoadsSameAsCache(var_resource_json[kTransRoads]); + if (!ret) { + GELOGW("Check VarManager cache failed.[TransRoads]"); + return false; + } + } catch (const std::exception &e) { + GELOGW("Fail to check VarManager json. 
Error message: %s", e.what());
+    return false;
+  }
+  return true;
+}
+
+Status ModelCacheHelper::RecoverMemResource(const Json &json) const {
+  if (!(json.is_null() || json.is_array())) {
+    GELOGW("Input param json type should be null or array.");
+    return PARAM_INVALID;
+  }
+  std::map<rtMemType_t, int64_t> var_mem_size;
+  auto ret = ParseMemResourceFromJson(json, var_mem_size);
+  if (ret != SUCCESS) {
+    GELOGW("Fail to parse MemResource from Json.");
+    return ret;
+  }
+  for (const auto &iter : var_mem_size) {
+    ret = VarManager::Instance(session_id_)->UpdateVarMemSize(iter.first, iter.second);
+    if (ret != SUCCESS) {
+      GELOGW("Fail to recover var mem size.");
+      return ret;
+    }
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::RecoverAllocatedGraphId(const Json &json) const {
+  if (!(json.is_null() || json.is_array())) {
+    GELOGW("Input param json type should be null or array.");
+    return PARAM_INVALID;
+  }
+  std::unordered_map<std::string, uint32_t> allocated_graph_id;
+  auto ret = ParseAllocatedGraphIdFromJson(json, allocated_graph_id);
+  if (ret != SUCCESS) {
+    GELOGW("Fail to parse AllocatedGraphId from Json.");
+    return ret;
+  }
+  for (const auto &iter : allocated_graph_id) {
+    ret = VarManager::Instance(session_id_)->SetAllocatedGraphId(iter.first, iter.second);
+    if (ret != SUCCESS) {
+      GELOGW("Fail to recover allocated graph id.");
+      return ret;
+    }
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::RecoverChangedGraphId(const Json &json) const {
+  if (!(json.is_null() || json.is_array())) {
+    GELOGW("Input param json type should be null or array.");
+    return PARAM_INVALID;
+  }
+  std::unordered_map<std::string, uint32_t> changed_graph_id;
+  auto ret = ParseChangedGraphIdFromJson(json, changed_graph_id);
+  if (ret != SUCCESS) {
+    GELOGW("Fail to parse ChangedGraphId from Json.");
+    return ret;
+  }
+  for (const auto &iter : changed_graph_id) {
+    ret = VarManager::Instance(session_id_)->SetChangedGraphId(iter.first, iter.second);
+    if (ret != SUCCESS) {
+      GELOGW("Fail to recover changed graph id.");
+      return ret;
+    }
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::RecoverVarAddrAndTensorDesc(const Json &json) const {
+  if (!(json.is_null() || json.is_array())) {
+    GELOGW("Input param json type should be null or array.");
+    return PARAM_INVALID;
+  }
+  std::vector<std::pair<std::string, VarAddrMgr>> var_addr_mgr_vector;
+  std::unordered_set<uint64_t> var_offset_set;
+  auto ret = ParseVarAddrMgrMapFromJson(json, var_addr_mgr_vector, var_offset_set);
+  if (ret != SUCCESS) {
+    GELOGW("Fail to parse VarAddrMgrMap from Json.");
+    return ret;
+  }
+  for (const auto &iter : var_addr_mgr_vector) {
+    const VarAddrMgr &tensor_addr_mgr = iter.second;
+    const bool var_exist = VarManager::Instance(session_id_)->IsVarExist(iter.first, tensor_addr_mgr.tensor_desc);
+    // SaveVarAddr if var does not exist; the logic address will be recorded by VarManager
+    if (!var_exist) {
+      auto logic_address = reinterpret_cast<uint64_t>(tensor_addr_mgr.address);
+      auto offset = (tensor_addr_mgr.offset);
+      // Check logic address and offset
+      if (logic_address - offset != VarManager::Instance(session_id_)->GetVarMemLogicBase()) {
+        GELOGW("Check logic_address[%lu] and offset[%lu] of %s failed, var mem logic base is %lu, abandon",
+               logic_address, offset, iter.first.c_str(), VarManager::Instance(session_id_)->GetVarMemLogicBase());
+        return PARAM_INVALID;
+      }
+      // SaveVarAddr expects the offset rather than the logic address
+      ret = VarManager::Instance(session_id_)
+              ->SaveVarAddr(iter.first, tensor_addr_mgr.tensor_desc, reinterpret_cast<uint8_t *>(offset),
+                            tensor_addr_mgr.memory_type);
+      if (ret != SUCCESS) {
+        GELOGW("Fail to recover VarAddr
or TensorDesc of var[%s].", iter.first.c_str()); + return ret; + } + } + // SetVarAddr to update cur_var_tensor_desc_map_ + ret = VarManager::Instance(session_id_) + ->SetVarAddr(iter.first, tensor_addr_mgr.tensor_desc, tensor_addr_mgr.address, tensor_addr_mgr.memory_type); + if (ret != SUCCESS) { + GELOGW("Fail to recover VarAddr or TensorDesc desc of var[%s].", iter.first.c_str()); + return ret; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::RecoverBroadcastInfo(const Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + std::unordered_map var_broadcast_info; + auto ret = ParseBroadcastInfoFromJson(json, var_broadcast_info); + if (ret != SUCCESS) { + GELOGW("Fail to parse BroadcastInfo from Json."); + return ret; + } + for (const auto &iter : var_broadcast_info) { + VarBroadCastInfo broadcast_info; + ret = VarManager::Instance(session_id_)->SaveBroadCastInfo(graph_id_, iter.second); + if (ret != SUCCESS) { + GELOGW("Fail to recover broadcast info of var[%s].", iter.first.c_str()); + return ret; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::RecoverTransRoads(const Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + std::unordered_map> trans_roads; + auto ret = ParseTransRoadsFromJson(json, trans_roads); + if (ret != SUCCESS) { + GELOGW("Fail to parse TransRoads from Json."); + return ret; + } + for (const auto &iter : trans_roads) { + ret = VarManager::Instance(session_id_)->SetTransRoad(iter.first, iter.second); + if (ret != SUCCESS) { + GELOGW("Fail to find trans road of var[%s].", iter.first.c_str()); + return ret; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::TensorDescToJson(const GeTensorDesc &ge_tensor_desc, Json &json) { + if (!(json.is_null() || json.is_object())) { + GELOGW("Input param json type should be null or object."); + return PARAM_INVALID; + } + try { + json[kDataType] = static_cast(ge_tensor_desc.GetDataType()); + json[kOriginDataType] = static_cast(ge_tensor_desc.GetOriginDataType()); + json[kLayout] = static_cast(ge_tensor_desc.GetFormat()); + json[kOriginLayout] = static_cast(ge_tensor_desc.GetOriginFormat()); + json[kShape] = ge_tensor_desc.GetShape().GetDims(); + json[kOriginShape] = ge_tensor_desc.GetOriginShape().GetDims(); + uint32_t real_dim_cnt = 0; + (void)TensorUtils::GetRealDimCnt(ge_tensor_desc, real_dim_cnt); // [No need to check value] + json[kRealDimCnt] = real_dim_cnt; + } catch (const std::exception &e) { + GELOGW("Fail to trans GeTensorDesc to json. 
Error message: %s", e.what()); + return INTERNAL_ERROR; + } + return SUCCESS; +} + +Status ModelCacheHelper::JsonToTensorDesc(const Json &json, ge::GeTensorDesc &ge_tensor_desc) { + if (!json.is_object()) { + GELOGW("Input param json type should be object."); + return PARAM_INVALID; + } + try { + ge_tensor_desc.SetDataType(static_cast(json[kDataType].get())); + ge_tensor_desc.SetOriginDataType(static_cast(json[kOriginDataType].get())); + ge_tensor_desc.SetFormat(static_cast(json[kLayout].get())); + ge_tensor_desc.SetOriginFormat(static_cast(json[kOriginLayout].get())); + GeShape shape(json[kShape].get>()); + ge_tensor_desc.SetShape(shape); + GeShape origin_shape(json[kOriginShape].get>()); + ge_tensor_desc.SetOriginShape(origin_shape); + auto real_dim_cnt = json[kRealDimCnt].get(); + (void)TensorUtils::SetRealDimCnt(ge_tensor_desc, real_dim_cnt); // [No need to check value] + } catch (const std::exception &e) { + GELOGW("Fail to trans Json to GeTensorDesc. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + return SUCCESS; +} + +Status ModelCacheHelper::GetNodesHashMapJson(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + map hash_map; + GetNodesHash(hash_map); + for (const auto &iter : hash_map) { + Json node_hash_json; + try { + node_hash_json[kName] = iter.first; + node_hash_json[kHash] = iter.second; + json.emplace_back(move(node_hash_json)); + } catch (const std::exception &e) { + GELOGW("Fail to trans node cache to json. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::GetMemResourceMap(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + const auto total_size = VarManager::Instance(session_id_)->GetVarMemMaxSize(); + const auto var_mem_size = VarManager::Instance(session_id_)->GetVarMemSize(RT_MEMORY_HBM); + Json mem_resource_json; + try { + mem_resource_json[kMemType] = RT_MEMORY_HBM; + mem_resource_json[kTotalSize] = total_size; + mem_resource_json[kVarMemSize] = var_mem_size; + json.emplace_back(move(mem_resource_json)); + } catch (const std::exception &e) { + GELOGW("Fail to trans MemResourceMap to json. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + return SUCCESS; +} + +Status ModelCacheHelper::GetVarAddrMgrMapJson(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + std::unordered_map var_addr_mgr_map; + VarManager::Instance(session_id_)->GetAllVarAddrMgr(var_addr_mgr_map); + try { + for (const auto &iter : var_addr_mgr_map) { + Json var_addr_json; + string name; + GetVarNameFromVarKey(iter.first, iter.second.tensor_desc, name); + var_addr_json[kName] = name; + var_addr_json[kAddress] = reinterpret_cast(iter.second.address); + var_addr_json[kMemoryType] = iter.second.memory_type; + var_addr_json[kOffset] = iter.second.offset; + + // Copy tensor desc to json. + Json tensor_desc_json; + auto ret = TensorDescToJson(iter.second.tensor_desc, tensor_desc_json); + if (ret != SUCCESS) { + GELOGW("Fail to trans tensor desc to json."); + return INTERNAL_ERROR; + } + var_addr_json[kTensorDesc] = move(tensor_desc_json); + + json.emplace_back(move(var_addr_json)); + } + } catch (const std::exception &e) { + GELOGW("Fail to trans VarAddrMgrMap to json. 
Error message: %s", e.what()); + return INTERNAL_ERROR; + } + return SUCCESS; +} + +Status ModelCacheHelper::GetCurVarTensorDescMapJson(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + try { + for (const auto &name : var_names_) { + Json cur_tensor_desc_json; + GeTensorDesc tensor_desc; + auto ret = VarManager::Instance(session_id_)->GetCurVarDesc(name, tensor_desc); + if (ret != SUCCESS) { + GELOGI("Get variable[%s] current tensor desc failed. It will be skipped.", name.c_str()); + continue; + } + cur_tensor_desc_json[kName] = name; + + Json tensor_desc_json; + ret = TensorDescToJson(tensor_desc, tensor_desc_json); + if (ret != SUCCESS) { + GELOGW("Fail to trans tensor desc to json."); + return INTERNAL_ERROR; + } + cur_tensor_desc_json[kTensorDesc] = move(tensor_desc_json); + json.emplace_back(move(cur_tensor_desc_json)); + } + } catch (const std::exception &e) { + GELOGW("Fail to trans CurVarTensorDescMap to json. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + return SUCCESS; +} + +Status ModelCacheHelper::GetTransRoadsJson(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + try { + for (const auto &name : var_names_) { + auto trans_road = VarManager::Instance(session_id_)->GetTransRoad(name); + if (trans_road == nullptr) { + continue; + } + // Json object, variable name and trans road + Json trans_road_map_json; + trans_road_map_json[kName] = name; + + Json trans_road_json; + Status ret; + // Add nodes' info to json + for (const auto &trans_node_info : *trans_road) { + Json trans_node_info_json; + trans_node_info_json[kNodeType] = trans_node_info.node_type; + Json input_tensor_desc_json; + ret = TensorDescToJson(trans_node_info.input, input_tensor_desc_json); + if (ret != SUCCESS) { + GELOGW("Fail to trans tensor desc to json."); + return INTERNAL_ERROR; + } + trans_node_info_json[kInputTensorDesc] = move(input_tensor_desc_json); + Json output_tensor_desc_json; + ret = TensorDescToJson(trans_node_info.output, output_tensor_desc_json); + if (ret != SUCCESS) { + GELOGW("Fail to trans tensor desc to json."); + return INTERNAL_ERROR; + } + trans_node_info_json[kOutputTensorDesc] = move(output_tensor_desc_json); + trans_road_json.emplace_back(move(trans_node_info_json)); + } + trans_road_map_json[kTransRoad] = move(trans_road_json); + json.emplace_back(move(trans_road_map_json)); + } + } catch (const std::exception &e) { + GELOGW("Fail to trans VarToTransRoad to json. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + return SUCCESS; +} + +Status ModelCacheHelper::GetChangedGraphIdJson(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + for (const auto &name : var_names_) { + uint32_t changed_graph_id = 0; + Status ret = VarManager::Instance(session_id_)->GetChangedGraphId(name, changed_graph_id); + if (ret != SUCCESS) { + continue; + } + Json name_and_changed_graph_id; + try { + name_and_changed_graph_id[kName] = name; + name_and_changed_graph_id[kGraphId] = changed_graph_id; + json.emplace_back(move(name_and_changed_graph_id)); + } catch (const std::exception &e) { + GELOGW("Fail to trans ChangedGraphId to json. 
Error message: %s", e.what()); + return INTERNAL_ERROR; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::GetAllocatedGraphIdJson(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + for (const auto &name : var_names_) { + uint32_t allocated_graph_id = 0; + Status ret = VarManager::Instance(session_id_)->GetAllocatedGraphId(name, allocated_graph_id); + if (ret != SUCCESS) { + continue; + } + Json name_and_allocated_graph_id; + try { + name_and_allocated_graph_id[kName] = name; + name_and_allocated_graph_id[kGraphId] = allocated_graph_id; + json.emplace_back(move(name_and_allocated_graph_id)); + } catch (const std::exception &e) { + GELOGW("Fail to trans AllocatedGraphId to json. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::GetBroadcastInfoJson(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + for (const auto &name : var_names_) { + VarBroadCastInfo var_broadcast_info; + Status ret = VarManager::Instance(session_id_)->GetBroadCastInfo(graph_id_, name, var_broadcast_info); + if (ret != SUCCESS) { + continue; + } + Json var_broadcast_info_json; + try { + var_broadcast_info_json[kName] = name; + var_broadcast_info_json[kBroadcastName] = var_broadcast_info.broadcast_name; + var_broadcast_info_json[kIdx] = var_broadcast_info.idx; + var_broadcast_info_json[kInputOffset] = var_broadcast_info.input_offset; + var_broadcast_info_json[kInputSize] = var_broadcast_info.input_size; + var_broadcast_info_json[kOutputOffset] = var_broadcast_info.output_offset; + var_broadcast_info_json[kOutputSize] = var_broadcast_info.output_size; + json.emplace_back(move(var_broadcast_info_json)); + } catch (const std::exception &e) { + GELOGW("Fail to trans VarBroadcastInfo to json. 
Error message: %s", e.what()); + return INTERNAL_ERROR; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::GetVarResourceJson(Json &json) const { + if (!(json.is_null() || json.is_object())) { + GELOGW("Input param json type should be null or object."); + return PARAM_INVALID; + } + Json var_addr_mgr_map_json; + Status ret = GetVarAddrMgrMapJson(var_addr_mgr_map_json); + if (ret != SUCCESS) { + GELOGW("GetVarAddrMgrMapJson failed."); + return INTERNAL_ERROR; + } + + Json cur_var_tensor_desc_map_json; + ret = GetCurVarTensorDescMapJson(cur_var_tensor_desc_map_json); + if (ret != SUCCESS) { + GELOGW("GetCurVarTensorDescMapJson failed."); + return INTERNAL_ERROR; + } + + Json trans_roads_json; + ret = GetTransRoadsJson(trans_roads_json); + if (ret != SUCCESS) { + GELOGW("GetTransRoadsJson failed."); + return INTERNAL_ERROR; + } + + Json changed_graph_id_json; + ret = GetChangedGraphIdJson(changed_graph_id_json); + if (ret != SUCCESS) { + GELOGW("GetChangedGraphIdJson failed."); + return INTERNAL_ERROR; + } + + Json allocated_graph_id_json; + ret = GetAllocatedGraphIdJson(allocated_graph_id_json); + if (ret != SUCCESS) { + GELOGW("GetAllocatedGraphIdJson failed."); + return INTERNAL_ERROR; + } + + Json var_broadcast_info_json; + ret = GetBroadcastInfoJson(var_broadcast_info_json); + if (ret != SUCCESS) { + GELOGW("GetBroadcastInfoJson failed."); + return INTERNAL_ERROR; + } + + try { + json[kVarAddrMgrMap] = move(var_addr_mgr_map_json); + json[kCurVarTensorDescMap] = move(cur_var_tensor_desc_map_json); + json[kTransRoads] = move(trans_roads_json); + json[kChangedGraphId] = move(changed_graph_id_json); + json[kAllocatedGraphId] = move(allocated_graph_id_json); + json[kVarBroadcastInfo] = move(var_broadcast_info_json); + } catch (const exception &e) { + GELOGW("Fail to generate VarResource json. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + return SUCCESS; +} + +Status ModelCacheHelper::GetVarManagerJson(Json &json) const { + if (!(json.is_null() || json.is_object())) { + GELOGW("Input param json type should be null or object."); + return PARAM_INVALID; + } + + Json mem_resource_map_json; + auto ret = GetMemResourceMap(mem_resource_map_json); + if (ret != SUCCESS) { + GELOGW("GetMemResourceMap failed."); + return INTERNAL_ERROR; + } + + Json var_resource_json; + ret = GetVarResourceJson(var_resource_json); + if (ret != SUCCESS) { + GELOGW("GetVarResourceJson failed."); + return INTERNAL_ERROR; + } + + try { + json[kSessionId] = session_id_; + json[kDeviceId] = VarManager::Instance(session_id_)->DeviceId(); + json[kJobId] = VarManager::Instance(session_id_)->JobId(); + json[kGraphMemMaxSize] = VarManager::Instance(session_id_)->GetGraphMemoryMaxSize(); + json[kVarMemMaxSize] = VarManager::Instance(session_id_)->GetVarMemMaxSize(); + json[kVarMemLogicBase] = VarManager::Instance(session_id_)->GetVarMemLogicBase(); + json[kUseMaxMemSize] = VarManager::Instance(session_id_)->GetUseMaxMemorySize(); + json[kMemResourceMap] = move(mem_resource_map_json); + json[kVarResource] = move(var_resource_json); + } catch (const exception &e) { + GELOGW("Fail to generate VarManager json. 
Error message: %s", e.what());
+    return INTERNAL_ERROR;
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::SaveVarManagerToCache(bool before_build) const {
+  if (!is_cache_path_valid_for_output) {
+    GELOGW("Invalid cache path.");
+    return FAILED;
+  }
+  Json var_manager_json;
+  auto ret = GetVarManagerJson(var_manager_json);
+  if (ret != SUCCESS) {
+    GELOGW("Fail to generate VarManager json.");
+    return FAILED;
+  }
+  string var_manager_path = to_string(graph_id_) + to_string(graph_id_run_times_[graph_id_]) +
+                            (before_build ? kBeforeVarManagerSuffix : kAfterVarManagerSuffix);
+  ret = SaveJsonToFile(var_manager_path, var_manager_json);
+  if (ret != SUCCESS) {
+    GELOGW("Fail to save VarManager info to json file, path: %s.", cache_path_.c_str());
+    return ret;
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::SaveOmModelToCache(const GeModelPtr &ge_model) const {
+  if (!is_cache_path_valid_for_output) {
+    GELOGW("Invalid cache path.");
+    return FAILED;
+  }
+  string om_path = RealPath(cache_path_.c_str());
+  if (om_path.empty()) {
+    GELOGW("File path is invalid, please check om cache path: %s", cache_path_.c_str());
+    return FAILED;
+  }
+  string cache_om_path = cache_path_;
+  cache_om_path += (to_string(graph_id_) + to_string(graph_id_run_times_[graph_id_]) + kOmSuffix);
+  GELOGI("SaveOmModelToCache: start to save om model : %s", cache_om_path.c_str());
+  ModelHelper model_helper;
+  SaveParam save_param;
+  ModelBufferData model;
+  Status ret = model_helper.SaveToOmModel(ge_model, save_param, cache_om_path, model);
+  if (ret != SUCCESS) {
+    GELOGW("SaveOmModelToCache: save model failed. ret = %u", ret);
+    return ret;
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::ParseMemResourceFromJson(const Json &json, map<rtMemType_t, int64_t> &mem_resource) {
+  if (!(json.is_array() || json.is_null())) {
+    GELOGW("Input param json type should be null or array.");
+    return PARAM_INVALID;
+  }
+  mem_resource.clear();
+  for (const Json &mem_resource_json : json) {
+    try {
+      rtMemType_t mem_type = mem_resource_json[kMemType].get<rtMemType_t>();
+      uint64_t var_mem_size = mem_resource_json[kVarMemSize].get<uint64_t>();
+      mem_resource[mem_type] = var_mem_size;
+    } catch (const exception &e) {
+      GELOGW("Fail to trans Json to MemResource. Error message: %s", e.what());
+      return INTERNAL_ERROR;
+    }
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::ParseVarAddrMgrMapFromJson(
+    const Json &json, std::vector<std::pair<std::string, VarAddrMgr>> &var_addr_mgr_vector,
+    std::unordered_set<uint64_t> &var_offset_set) {
+  if (!(json.is_array() || json.is_null())) {
+    GELOGW("Input param json type should be null or array.");
+    return PARAM_INVALID;
+  }
+  var_addr_mgr_vector.clear();
+  var_offset_set.clear();
+  for (const Json &var_addr_json : json) {
+    VarAddrMgr var_addr_mgr;
+    try {
+      auto logic_address = var_addr_json[kAddress].get<uint64_t>();
+      auto address = reinterpret_cast<uint8_t *>(logic_address);
+      var_addr_mgr.address = address;
+      var_addr_mgr.offset = var_addr_json[kOffset].get<uint64_t>();
+      var_addr_mgr.memory_type = var_addr_json[kMemoryType].get<rtMemType_t>();
+      auto ret = JsonToTensorDesc(var_addr_json[kTensorDesc], var_addr_mgr.tensor_desc);
+      if (ret != SUCCESS) {
+        GELOGW("Fail to trans json to tensor desc.");
+        return ret;
+      }
+      var_addr_mgr_vector.emplace_back(var_addr_json[kName].get<std::string>(), move(var_addr_mgr));
+      var_offset_set.insert(logic_address);
+    } catch (const exception &e) {
+      GELOGW("Fail to trans Json to VarAddrMgr.
Error message: %s", e.what()); + return INTERNAL_ERROR; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::ParseCurVarTensorDescMapFromJson( + const Json &json, std::unordered_map &cur_var_tensor_desc_map) { + if (!(json.is_array() || json.is_null())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + cur_var_tensor_desc_map.clear(); + for (const Json &tensor_desc_json : json) { + GeTensorDesc tensor_desc; + try { + auto ret = JsonToTensorDesc(tensor_desc_json[kTensorDesc], tensor_desc); + if (ret != SUCCESS) { + GELOGW("Fail to trans json to tensor desc."); + return ret; + } + cur_var_tensor_desc_map[tensor_desc_json[kName].get()] = move(tensor_desc); + } catch (const exception &e) { + GELOGW("Fail to trans Json to VarAddrMgr. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::ParseTransRoadsFromJson( + const Json &json, std::unordered_map> &trans_roads) { + if (!(json.is_array() || json.is_null())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + trans_roads.clear(); + try { + for (const Json &name_trans_road_json : json) { + const Json &trans_road_json = name_trans_road_json[kTransRoad]; + if (!(trans_road_json.is_array() || trans_road_json.is_null())) { + GELOGW("%s json type should be null or object.", kTransRoad); + return PARAM_INVALID; + } + vector trans_road; + for (const Json &trans_node_json : trans_road_json) { + TransNodeInfo trans_node_info; + trans_node_info.node_type = trans_node_json[kNodeType]; + GeTensorDesc input_tensor_desc; + auto ret = JsonToTensorDesc(trans_node_json[kInputTensorDesc], input_tensor_desc); + if (ret != SUCCESS) { + GELOGW("Fail to trans json to tensor desc."); + return ret; + } + trans_node_info.input = move(input_tensor_desc); + GeTensorDesc output_tensor_desc; + ret = JsonToTensorDesc(trans_node_json[kOutputTensorDesc], output_tensor_desc); + if (ret != SUCCESS) { + GELOGW("Fail to trans json to tensor desc."); + return ret; + } + trans_node_info.output = move(output_tensor_desc); + trans_road.emplace_back(move(trans_node_info)); + } + trans_roads[name_trans_road_json[kName].get()] = move(trans_road); + } + } catch (const exception &e) { + GELOGW("Fail to trans Json to TransRoads. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + return SUCCESS; +} + +Status ModelCacheHelper::ParseChangedGraphIdFromJson(const Json &json, + std::unordered_map &changed_graph_id) { + if (!(json.is_array() || json.is_null())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + changed_graph_id.clear(); + for (const Json &name_graph_id_json : json) { + try { + changed_graph_id[name_graph_id_json[kName].get()] = name_graph_id_json[kGraphId].get(); + } catch (const exception &e) { + GELOGW("Fail to trans Json to changed graph id. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::ParseAllocatedGraphIdFromJson(const Json &json, + std::unordered_map &allocated_graph_id) { + if (!(json.is_array() || json.is_null())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + allocated_graph_id.clear(); + for (const Json &name_graph_id_json : json) { + try { + allocated_graph_id[name_graph_id_json[kName].get()] = name_graph_id_json[kGraphId].get(); + } catch (const exception &e) { + GELOGW("Fail to trans Json to allocated graph id. 
Error message: %s", e.what());
+      return INTERNAL_ERROR;
+    }
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::ParseBroadcastInfoFromJson(
+    const Json &json, std::unordered_map<std::string, VarBroadCastInfo> &var_broadcast_info) {
+  if (!(json.is_array() || json.is_null())) {
+    GELOGW("Input param json type should be null or array.");
+    return PARAM_INVALID;
+  }
+  for (const Json &broadcast_info_json : json) {
+    VarBroadCastInfo broadcast_info;
+    try {
+      broadcast_info.var_name = broadcast_info_json[kName].get<std::string>();
+      broadcast_info.broadcast_name = broadcast_info_json[kBroadcastName].get<std::string>();
+      broadcast_info.idx = broadcast_info_json[kIdx].get<int32_t>();
+      broadcast_info.input_offset = broadcast_info_json[kInputOffset].get<int64_t>();
+      broadcast_info.input_size = broadcast_info_json[kInputSize].get<uint64_t>();
+      broadcast_info.output_offset = broadcast_info_json[kOutputOffset].get<int64_t>();
+      broadcast_info.output_size = broadcast_info_json[kOutputSize].get<uint64_t>();
+    } catch (const exception &e) {
+      GELOGW("Fail to trans Json to VarBroadCastInfo. Error message: %s", e.what());
+      return INTERNAL_ERROR;
+    }
+    var_broadcast_info[broadcast_info.var_name] = broadcast_info;
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::LoadOmModelFromCache(GeModelPtr &ge_model) const {
+  string cache_om = cache_path_ + to_string(graph_id_) + to_string(graph_id_run_times_[graph_id_]) + kOmSuffix;
+  if (!CheckInputPathValid(cache_om)) {
+    GELOGW("Invalid cache path for input: %s.", cache_om.c_str());
+    return FAILED;
+  }
+  string om_path = RealPath(cache_om.c_str());
+  if (om_path.empty()) {
+    GELOGW("File path is invalid, please check om file: %s", om_path.c_str());
+    return FAILED;
+  }
+  GELOGI("Load model data from file: %s", om_path.c_str());
+  Status ret;
+  string key_path;
+  int32_t priority = 0;
+  ModelData model_data;
+  ret = DavinciModelParser::LoadFromFile(om_path.c_str(), key_path.c_str(), priority, model_data);
+  if (ret != SUCCESS) {
+    GELOGW("LoadOmModelFromCache: Load model from file failed. ret = %u", ret);
+    return ret;
+  }
+
+  ModelHelper model_helper;
+  ret = model_helper.LoadModel(model_data);
+  if (ret != SUCCESS) {
+    GELOGW("LoadOmModelFromCache: Load model from data failed.
ret = %u", ret); + return ret; + } + ge_model = model_helper.GetGeModel(); + // Load TbeKernelBin to op desc from TBEKernelStore + const TBEKernelStore &tbekernel_store = ge_model->GetTBEKernelStore(); + const ComputeGraphPtr compute_graph_in_model = GraphUtils::GetComputeGraph(ge_model->GetGraph()); + for (const auto &node : compute_graph_in_model->GetDirectNode()) { + auto op_desc = node->GetOpDesc(); + tbekernel_store.LoadTBEKernelBinToOpDesc(op_desc); + GELOGI("LoadOmModelFromCache: Load tbe kernel bin to op desc."); + } + return SUCCESS; +} + +Status ModelCacheHelper::GetVarNameFromVarKey(const string &var_key, const GeTensorDesc &tensor_desc, + string &var_name) { + std::string::size_type underline_idx = var_key.rfind('_'); + if (underline_idx == std::string::npos) { + GELOGW("Invalid var key: underline not found"); + return FAILED; + } + std::string::size_type format_idx = + var_key.rfind(std::to_string(static_cast(tensor_desc.GetFormat())), underline_idx); + if (format_idx == std::string::npos) { + GELOGW("Invalid var key: format not found"); + return FAILED; + } + var_name = var_key.substr(0, format_idx); + return SUCCESS; +} +} // namespace ge diff --git a/src/ge/common/helper/model_cache_helper.h b/src/ge/common/helper/model_cache_helper.h new file mode 100644 index 00000000..91257282 --- /dev/null +++ b/src/ge/common/helper/model_cache_helper.h @@ -0,0 +1,121 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GE_COMMON_HELPER_MODEL_CACHE_HELPER_H_ +#define GE_COMMON_HELPER_MODEL_CACHE_HELPER_H_ + +#include <nlohmann/json.hpp> +#include <set> +#include <string> + +#include "ge/ge_api_error_codes.h" +#include "graph/compute_graph.h" +#include "graph/manager/graph_var_manager.h" +#include "model/ge_model.h" + +namespace ge { +using Json = nlohmann::json; + +struct CacheInfo { + size_t node_num; + size_t edge_num; + size_t graph_hash; + map<std::string, size_t> nodes_hash; + CacheInfo() : node_num(0), edge_num(0), graph_hash(0) {} +}; + +class ModelCacheHelper { + public: + ModelCacheHelper(uint64_t session_id, uint32_t graph_id, ComputeGraphPtr &compute_graph); + + Status SaveCacheInfoToCache() const; + Status SaveVarManagerToCache(bool before_build) const; + Status SaveOmModelToCache(const GeModelPtr &ge_model) const; + bool IsModelCacheHit() const; + Status RecoverVarManagerFromCache() const; + Status LoadOmModelFromCache(GeModelPtr &ge_model) const; + Status RefreshComputeGraph(const ComputeGraphPtr &compute_graph); + Status ClearCache(uint32_t graph_id) const; + + private: + Status GetComputeGraphHash(size_t &hash) const; + Status GetNodesHash(map<std::string, size_t> &hash_map) const; + Status GetCacheInfo(CacheInfo &cache_info) const; + + Status RecoverMemResource(const Json &json) const; + Status RecoverAllocatedGraphId(const Json &json) const; + Status RecoverChangedGraphId(const Json &json) const; + Status RecoverVarAddrAndTensorDesc(const Json &json) const; + Status RecoverBroadcastInfo(const Json &json) const; + Status RecoverTransRoads(const Json &json) const; + static Status RecompileNodes(GeModelPtr &ge_model); + + bool IsNodeHashSameAsCache(const map<std::string, size_t> &hash_map) const; + bool IsMemResourceSameAsCache(Json &json) const; + bool IsChangedGraphIdSameAsCache(Json &json) const; + bool IsAllocatedGraphIdSameAsCache(Json &json) const; + bool IsCurVarTensorDescSameAsCache(Json &json) const; + bool IsVarAddrMgrMapSameAsCache(Json &json) const; + bool IsBroadcastInfoSameAsCache(Json &json) const; + bool IsTransRoadsSameAsCache(Json &json) const; + bool IsVarManagerSameAsCache(Json &json) const; + bool IsVarManagerParamSameAsCache(Json &json) const; + + Status SaveJsonToFile(const string &file_name, const Json &json) const; + Status LoadJsonFromFile(const string &file_name, Json &json) const; + + Status GetNodesHashMapJson(Json &json) const; + Status GetMemResourceMap(Json &json) const; + Status GetVarAddrMgrMapJson(Json &json) const; + Status GetCurVarTensorDescMapJson(Json &json) const; + Status GetTransRoadsJson(Json &json) const; + Status GetChangedGraphIdJson(Json &json) const; + Status GetAllocatedGraphIdJson(Json &json) const; + Status GetBroadcastInfoJson(Json &json) const; + Status GetVarResourceJson(Json &json) const; + Status GetVarManagerJson(Json &json) const; + + static Status TensorDescToJson(const GeTensorDesc &ge_tensor_desc, Json &json); + static Status JsonToTensorDesc(const Json &json, GeTensorDesc &ge_tensor_desc); + static Status ParseMemResourceFromJson(const Json &json, map<rtMemType_t, int64_t> &mem_resource); + static Status ParseVarAddrMgrMapFromJson(const Json &json, + std::vector<std::pair<std::string, VarAddrMgr>> &var_addr_mgr_vector, + std::unordered_set<uint64_t> &var_offset_set); + static Status ParseCurVarTensorDescMapFromJson( + const Json &json, std::unordered_map<std::string, GeTensorDesc> &cur_var_tensor_desc_map); + static Status ParseTransRoadsFromJson(const Json &json, + std::unordered_map<std::string, std::vector<TransNodeInfo>> &trans_roads); + static Status ParseChangedGraphIdFromJson(const Json &json, + std::unordered_map<std::string, uint32_t> &changed_graph_id); + static Status ParseAllocatedGraphIdFromJson(const Json &json, + std::unordered_map<std::string, uint32_t>
&allocated_graph_id); + static Status ParseBroadcastInfoFromJson(const Json &json, + std::unordered_map &var_broadcast_info); + static Status GetVarNameFromVarKey(const string &var_key, const GeTensorDesc &tensor_desc, string &var_name); + + uint64_t session_id_; + uint32_t graph_id_; + string cache_path_; + ComputeGraphPtr compute_graph_; + std::set var_names_; + bool is_cache_path_valid_for_output; + static map graph_id_run_times_; +}; + +using ModelCacheHelperPtr = std::shared_ptr; +} // namespace ge + +#endif // GE_COMMON_HELPER_MODEL_CACHE_HELPER_H_ diff --git a/src/ge/common/types.cc b/src/ge/common/types.cc index da0853b6..e8ae5257 100644 --- a/src/ge/common/types.cc +++ b/src/ge/common/types.cc @@ -385,6 +385,7 @@ REGISTER_OPTYPE_DEFINE(STREAMSWITCH, "StreamSwitch"); REGISTER_OPTYPE_DEFINE(STREAMSWITCHN, "StreamSwitchN"); REGISTER_OPTYPE_DEFINE(STREAMACTIVE, "StreamActive"); REGISTER_OPTYPE_DEFINE(MEMCPYASYNC, "MemcpyAsync"); +REGISTER_OPTYPE_DEFINE(MEMCPYADDRASYNC, "MemcpyAddrAsync"); REGISTER_OPTYPE_DEFINE(STREAMMERGE, "StreamMerge"); REGISTER_OPTYPE_DEFINE(ENDGRAPH, "EndGraph"); REGISTER_OPTYPE_DEFINE(SEND, "Send"); @@ -392,6 +393,7 @@ REGISTER_OPTYPE_DEFINE(RECV, "Recv"); REGISTER_OPTYPE_DEFINE(LABELSET, "LabelSet"); REGISTER_OPTYPE_DEFINE(LABELGOTO, "LabelGoto"); +REGISTER_OPTYPE_DEFINE(LABELGOTOEX, "LabelGotoEx"); REGISTER_OPTYPE_DEFINE(LABELSWITCH, "LabelSwitch"); REGISTER_OPTYPE_DEFINE(LABELSWITCHBYINDEX, "LabelSwitchByIndex"); diff --git a/src/ge/common/util.cc b/src/ge/common/util.cc index 79ead57b..f1a2fe6c 100644 --- a/src/ge/common/util.cc +++ b/src/ge/common/util.cc @@ -196,7 +196,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int CreateDirectory(const std:: GE_CHK_BOOL_EXEC(!directory_path.empty(), return -1, "directory path is empty."); auto dir_path_len = directory_path.length(); if (dir_path_len >= PATH_MAX) { - GELOGE(ge::FAILED, "Directory path is too long."); + GELOGW("Directory path is too long."); return -1; } char tmp_dir_path[PATH_MAX] = {0}; @@ -207,7 +207,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int CreateDirectory(const std:: int32_t ret = mmMkdir(tmp_dir_path, S_IRUSR | S_IWUSR | S_IXUSR); // 700 if (ret != 0) { if (errno != EEXIST) { - GELOGE(ge::FAILED, "Cannot create directory %s. Make sure that the directory exists and writable.", + GELOGW("Cannot create directory %s. Make sure that the directory exists and writable.", directory_path.c_str()); return ret; } @@ -218,8 +218,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int CreateDirectory(const std:: int32_t ret = mmMkdir(const_cast(directory_path.c_str()), S_IRUSR | S_IWUSR | S_IXUSR); // 700 if (ret != 0) { if (errno != EEXIST) { - GELOGE(ge::FAILED, "Cannot create directory %s. Make sure that the directory exists and writable.", - directory_path.c_str()); + GELOGW("Cannot create directory %s. 
Make sure that the directory exists and is writable.", directory_path.c_str()); return ret; } } @@ -339,7 +338,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY std::string RealPath(const char FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool CheckInputPathValid(const std::string &file_path) { // The specified path is empty if (file_path.empty()) { - GELOGE(ge::FAILED, "Path is empty."); + GELOGW("Path is empty."); return false; } @@ -358,23 +357,23 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool CheckInputPathValid(const std::string real_path = RealPath(file_path.c_str()); // Unable to get absolute path (does not exist or does not have permission to access) if (real_path.empty()) { - GELOGE(ge::FAILED, "Can not get real path for %s, %s", file_path.c_str(), strerror(errno)); + GELOGW("Can not get real path for %s, %s", file_path.c_str(), strerror(errno)); return false; } // The absolute path points to a file that is not readable if (access(real_path.c_str(), R_OK) != 0) { - GELOGE(ge::FAILED, "Can not read file in %s, %s", file_path.c_str(), strerror(errno)); + GELOGW("Can not read file in %s, %s", file_path.c_str(), strerror(errno)); return false; } return true; } -FMK_FUNC_HOST_VISIBILITY bool CheckOutputPathValid(const std::string &file_path) { +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool CheckOutputPathValid(const std::string &file_path) { // The specified path is empty if (file_path.empty()) { - GELOGE(ge::FAILED, "Path is empty."); + GELOGW("Path is empty."); return false; } @@ -394,8 +393,8 @@ FMK_FUNC_HOST_VISIBILITY bool CheckOutputPathValid(const std::string &file_path) // Can get absolute path (file exists) if (!real_path.empty()) { // File is not readable or writable - if (access(real_path.c_str(), R_OK | W_OK | F_OK) != 0) { - GELOGE(ge::FAILED, "Path[ %s ] exists, but can not be write, %s", file_path.c_str(), strerror(errno)); + if (access(real_path.c_str(), W_OK | F_OK) != 0) { + GELOGW("Path[ %s ] exists, but can not be written, %s", file_path.c_str(), strerror(errno)); return false; } } else { @@ -413,7 +412,7 @@ FMK_FUNC_HOST_VISIBILITY bool CheckOutputPathValid(const std::string &file_path) std::string prefix_path = std::string(file_path).substr(0, static_cast<size_t>(path_split_pos)); // Determine whether the specified path is valid by creating the path if (CreateDirectory(prefix_path) != 0) { - GELOGE(ge::FAILED, "Can not create prefix path for path[ %s ].", file_path.c_str()); + GELOGW("Can not create prefix path for path[ %s ].", file_path.c_str()); return false; } } diff --git a/src/ge/executor/CMakeLists.txt b/src/ge/executor/CMakeLists.txt index 7401b062..2f09b50d 100755 --- a/src/ge/executor/CMakeLists.txt +++ b/src/ge/executor/CMakeLists.txt @@ -47,6 +47,7 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "../graph/load/new_model_manager/task_info/kernel_task_info.cc" "../graph/load/new_model_manager/task_info/label_goto_task_info.cc" "../graph/load/new_model_manager/task_info/label_set_task_info.cc" + "../graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc" "../graph/load/new_model_manager/task_info/memcpy_async_task_info.cc" "../graph/load/new_model_manager/task_info/profiler_trace_task_info.cc" "../graph/load/new_model_manager/task_info/stream_active_task_info.cc" diff --git a/src/ge/ge_local_engine/engine/host_cpu_engine.h b/src/ge/ge_local_engine/engine/host_cpu_engine.h index 88985f87..1987138d 100644 --- a/src/ge/ge_local_engine/engine/host_cpu_engine.h +++ b/src/ge/ge_local_engine/engine/host_cpu_engine.h @@ -21,7
+21,7 @@ #include "framework/common/ge_inner_error_codes.h" #include "graph/node.h" #include "graph/operator.h" -#include "register/register.h" +#include "inc/register/register.h" namespace ge { class HostCpuEngine { diff --git a/src/ge/ge_runtime/output.cc b/src/ge/ge_runtime/output.cc index 2f4ade89..5153f688 100644 --- a/src/ge/ge_runtime/output.cc +++ b/src/ge/ge_runtime/output.cc @@ -76,7 +76,7 @@ bool Output::CopyRslt(OutputData *rslt, uint32_t data_begin, uint32_t &data_inde DataBuffer data_buf = rslt->blobs[data_begin + data_count]; bool ret = SetDataBuf(data_buf, data_begin, data_count, i, support_mem_share); if (!ret) { - GELOGE(FAILED, "Copy data to host error. index: %lu", i); + GELOGE(FAILED, "Copy data to host error. index: %lu, addr: %p", i, v_input_data_addr_[i]); return ret; } data_index = data_begin + data_count; diff --git a/src/ge/ge_runtime/runtime_model.cc b/src/ge/ge_runtime/runtime_model.cc index ffb0d8a0..330ffc14 100644 --- a/src/ge/ge_runtime/runtime_model.cc +++ b/src/ge/ge_runtime/runtime_model.cc @@ -96,6 +96,7 @@ bool RuntimeModel::InitStream(std::shared_ptr<DavinciModel> &davinci_model) { GELOGE(RT_FAILED, "Call rt api rtModelBindStream failed, ret: 0x%X", rt_ret); return false; } + GELOGI("stream index:%u, stream:%p.", i, stream); } return true; @@ -446,8 +447,11 @@ bool RuntimeModel::InitConstantInfo(std::shared_ptr<DavinciModel> &davinci_model /// The logic of GetShapeSize is wrong, the scalar tensor's GetShapeSize is zero /// and that of unknown shape is zero too. /// Unknown shape will not appear here, so we can use zero to judge whether a tensor is a scalar or not. - int64_t elem_num = - (constant->weight_tensors[0].GetShapeSize() == 0) ? 1 : constant->weight_tensors[0].GetShapeSize(); + int64_t elem_num = constant->weight_tensors[0].GetShapeSize(); + if (elem_num == 0 && constant->weight_tensors[0].size == 0) { + elem_num = 1; + } + if (constant->weight_data.size() < sizeof(uint64_t)) { GELOGE(FAILED, "weight_data size is smaller than sizeof(uint64_t)"); return false; diff --git a/src/ge/ge_runtime/task/cce_task.cc b/src/ge/ge_runtime/task/cce_task.cc index e5ea99c0..04fd5610 100644 --- a/src/ge/ge_runtime/task/cce_task.cc +++ b/src/ge/ge_runtime/task/cce_task.cc @@ -82,6 +82,7 @@ bool CceTask::Distribute() { stub_func_ = nullptr; return false; } + GELOGI("CCETask: stub_func = %s [%p].", task_info_->stub_func().c_str(), stub_func_); // Flowtable if (is_flowtable_) { diff --git a/src/ge/ge_runtime/task/event_record_task.cc b/src/ge/ge_runtime/task/event_record_task.cc index 46ac7a1b..85ddc053 100644 --- a/src/ge/ge_runtime/task/event_record_task.cc +++ b/src/ge/ge_runtime/task/event_record_task.cc @@ -43,6 +43,8 @@ EventRecordTask::EventRecordTask(const ModelContext &model_context, EventRecordTask::~EventRecordTask() {} bool EventRecordTask::Distribute() { + GELOGI("EventRecordTask Distribute start, stream: %p, event: %p, stream_id: %u, event_id: %u.", stream_, event_, + task_info_->stream_id(), task_info_->event_id()); rtError_t rt_ret = rtEventRecord(event_, stream_); if (rt_ret != RT_ERROR_NONE) { GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); diff --git a/src/ge/ge_runtime/task/event_wait_task.cc b/src/ge/ge_runtime/task/event_wait_task.cc index e4cf986f..558c2a59 100644 --- a/src/ge/ge_runtime/task/event_wait_task.cc +++ b/src/ge/ge_runtime/task/event_wait_task.cc @@ -42,6 +42,9 @@ EventWaitTask::EventWaitTask(const ModelContext &model_context, const std::share EventWaitTask::~EventWaitTask() {} bool EventWaitTask::Distribute() { + GELOGI("EventWaitTask Distribute start,
stream: %p, event: %p, stream_id: %u, event_id: %u.", stream_, event_, + task_info_->stream_id(), task_info_->event_id()); + rtError_t rt_ret = rtStreamWaitEvent(stream_, event_); if (rt_ret != RT_ERROR_NONE) { GELOGE(RT_FAILED, "Call rt api rtStreamWaitEvent failed, ret: 0x%X", rt_ret); diff --git a/src/ge/ge_runtime/task/hccl_task.cc b/src/ge/ge_runtime/task/hccl_task.cc index 0794c0e9..7a513597 100644 --- a/src/ge/ge_runtime/task/hccl_task.cc +++ b/src/ge/ge_runtime/task/hccl_task.cc @@ -101,6 +101,7 @@ bool HcclTask::Distribute() { char *private_def = reinterpret_cast<char *>(const_cast<char *>(task_info_->private_def().data())); auto private_def_len = static_cast<uint32_t>(task_info_->private_def().size()); + GELOGI("the first address of the custom info, privateDef=%p", private_def); GELOGI("hcclStreamNum =%ld", task_info_->hccl_stream_num()); for (int64_t i = 0; i < task_info_->hccl_stream_num(); ++i) { @@ -117,6 +118,7 @@ bool HcclTask::Distribute() { return false; } + GELOGI("hccl_stream addr is=%p", stream); slave_stream_list_.push_back(stream); } diff --git a/src/ge/ge_runtime/task/stream_switch_task.cc b/src/ge/ge_runtime/task/stream_switch_task.cc index afbdba18..2adcb4bd 100644 --- a/src/ge/ge_runtime/task/stream_switch_task.cc +++ b/src/ge/ge_runtime/task/stream_switch_task.cc @@ -62,6 +62,9 @@ bool StreamSwitchTask::Distribute() { rtStream_t true_stream = stream_list_[task_info_->true_stream_id()]; rtSwitchDataType_t data_type = static_cast<rtSwitchDataType_t>(task_info_->data_type()); + GELOGI("InitStreamSwitchTask, cond:%d, trueStream:%p, trueStreamID:%ld, datatype:%ld.", cond, true_stream, + task_info_->true_stream_id(), task_info_->data_type()); + GELOGI("StreamSwitchTask Distribute Start."); rtError_t rt_ret = rtStreamSwitchEx(input, cond, value, true_stream, stream_, data_type); if (rt_ret != RT_ERROR_NONE) { @@ -69,6 +72,7 @@ bool StreamSwitchTask::Distribute() { return false; } + GELOGI("Distribute StreamSwitch, cond:%d, trueStream:%p, datatype:%ld.", cond, true_stream, task_info_->data_type()); return true; } diff --git a/src/ge/ge_runtime/task/tbe_task.cc b/src/ge/ge_runtime/task/tbe_task.cc index 19056c1b..8a3c36a4 100644 --- a/src/ge/ge_runtime/task/tbe_task.cc +++ b/src/ge/ge_runtime/task/tbe_task.cc @@ -69,6 +69,7 @@ bool TbeTask::Distribute() { stub_func_ = nullptr; return false; } + GELOGI("TbeTask: stub_func = %s [%p].", task_info_->stub_func().c_str(), stub_func_); // Get args std::vector<void *> tensor_device_addrs; diff --git a/src/ge/graph/build/graph_builder.cc b/src/ge/graph/build/graph_builder.cc index 9424a4ed..957ddc2d 100644 --- a/src/ge/graph/build/graph_builder.cc +++ b/src/ge/graph/build/graph_builder.cc @@ -18,8 +18,8 @@ #include "common/ge/ge_util.h" #include "common/helper/model_helper.h" #include "common/opskernel/ops_kernel_info_types.h" -#include "graph/build/stream_graph_optimizer.h" #include "graph/build/run_context.h" +#include "graph/build/stream_graph_optimizer.h" #include "graph/manager/graph_var_manager.h" #include "graph/utils/node_utils.h" #include "graph/utils/type_utils.h" @@ -98,8 +98,10 @@ Status GraphBuilder::Build(ComputeGraphPtr &comp_graph, std::vector &subgraph_ptr_list, + ComputeGraphPtr &comp_graph, Graph2SubGraphInfoList &subgraph_map, uint64_t session_id) { GE_CHECK_NOTNULL(model_ptr); GE_CHECK_NOTNULL(comp_graph); @@ -190,7 +192,7 @@ Status GraphBuilder::GetTaskInfo(const ge::ModelBuilder &builder, const ModelPtr } StreamGraphOptimizer stream_optimizer; - ret = stream_optimizer.OptimizeStreamedSubGraph(comp_graph, subgraph_ptr_list, run_context.GetRunContext()); + ret =
stream_optimizer.OptimizeStreamedSubGraph(comp_graph, subgraph_map, run_context.GetRunContext()); if (ret != SUCCESS) { GELOGE(ret, "Optimize streamed subGraph fail."); return ret; diff --git a/src/ge/graph/build/graph_builder.h b/src/ge/graph/build/graph_builder.h index c1c4f7b6..d0bf26e6 100644 --- a/src/ge/graph/build/graph_builder.h +++ b/src/ge/graph/build/graph_builder.h @@ -53,7 +53,7 @@ class GraphBuilder { private: Status CalcOpParam(const ge::ComputeGraphPtr &graph); Status GetTaskInfo(const ge::ModelBuilder &builder, const ModelPtr &model_ptr, ComputeGraphPtr &comp_graph, - std::vector &subgraph_ptr_list, uint64_t session_id = INVALID_SESSION_ID); + Graph2SubGraphInfoList &subgraph_map, uint64_t session_id = INVALID_SESSION_ID); Status SetInputSize(const ge::NodePtr &node_ptr); Status UpdateDataInputSize(const ge::NodePtr &node_ptr); Status SecondPartition(ge::ComputeGraphPtr &comp_graph, vector &subgraph_ptr_list); diff --git a/src/ge/graph/build/logical_stream_allocator.cc b/src/ge/graph/build/logical_stream_allocator.cc index 2b11347b..16c4935e 100644 --- a/src/ge/graph/build/logical_stream_allocator.cc +++ b/src/ge/graph/build/logical_stream_allocator.cc @@ -70,7 +70,7 @@ bool LogicalStreamPass::HasNonConstInputNode(const Subgraph &subgraph) const { return false; } -Status AssignByLabelPass::Run(ComputeGraphPtr whole_graph, const vector &subgraphs, Context &context) { +Status AssignByLabelPass::Run(ComputeGraphPtr graph, const vector &subgraphs, Context &context) { bool changed = false; int64_t &next_stream = context.next_stream; map label_streams; @@ -97,7 +97,7 @@ Status AssignByLabelPass::Run(ComputeGraphPtr whole_graph, const vector &subgraphs, Context &context) { +Status IndependentStreamPass::Run(ComputeGraphPtr graph, const vector &subgraphs, Context &context) { bool changed = false; int64_t &next_stream = context.next_stream; @@ -129,8 +129,7 @@ Status IndependentStreamPass::Run(ComputeGraphPtr whole_graph, const vector &subgraphs, - Context &context) { +Status AssignByDependencyPass::Run(ComputeGraphPtr graph, const vector &subgraphs, Context &context) { bool changed = false; if (IsHeadNodeExceeded(subgraphs)) { int64_t &next_stream = context.next_stream; @@ -298,7 +297,7 @@ int64_t AssignByDependencyPass::AssignNewStream(SubgraphPtr subgraph) { subgraph->stream_id = stream_id; engine_next_streams_[engine_name] = stream_id + 1; - assigned_subgraphs_.emplace(subgraph); + assigned_subgraphs_.emplace_back(subgraph); if ((stream_id + 1) > engine_stream_num_[engine_name]) { engine_stream_num_[engine_name] = stream_id + 1; @@ -311,6 +310,15 @@ int64_t AssignByDependencyPass::AssignNewStream(SubgraphPtr subgraph) { } void AssignByDependencyPass::UpdateAssignedSubgraphs(Context &context) { + // If the parent stream is valid, the first assigned stream will reuse the parent stream id + // and other streams use new id. To ensure that the id of the new stream is continuous, + // we first subtract one from next_stream. + int64_t to_be_updated_stream = kInvalidStream; + if (context.parent_stream != kInvalidStream) { + context.next_stream--; + to_be_updated_stream = context.next_stream; + } + // Update the starting stream id for each engine. int64_t &next_stream = context.next_stream; map engine_start_streams; @@ -320,10 +328,16 @@ void AssignByDependencyPass::UpdateAssignedSubgraphs(Context &context) { next_stream += stream_count; } - // Update the subgraphs assigned by the engine. + // Update the subgraph streams assigned by engine. 
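+  // Illustrative walk-through (an added example, not part of the original patch): if the first
+  // engine assigned engine-local ids {0} and a second engine {0, 1}, the pre-decremented
+  // next_stream means the first engine's stream 0 rebases exactly onto the reserved id, which
+  // the loop below rewrites to context.parent_stream; every other stream keeps its fresh,
+  // contiguous global id.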
for (auto &subgraph : assigned_subgraphs_) { subgraph->stream_id += engine_start_streams[subgraph->engine_conf.id]; - GELOGI("Stream of subgraph %s has been updated to %ld.", subgraph->name.c_str(), subgraph->stream_id); + if (subgraph->stream_id == to_be_updated_stream) { + subgraph->stream_id = context.parent_stream; + GELOGI("Subgraph %s of engine %s reuses parent stream %ld.", subgraph->name.c_str(), + subgraph->engine_conf.id.c_str(), context.parent_stream); + } else { + GELOGI("Stream of subgraph %s has been updated to %ld.", subgraph->name.c_str(), subgraph->stream_id); + } } } @@ -337,7 +351,7 @@ void AssignByDependencyPass::UpdateReusedSubgraphs() { } } -Status NodeStreamUpdatePass::Run(ComputeGraphPtr whole_graph, const vector &subgraphs, Context &context) { +Status NodeStreamUpdatePass::Run(ComputeGraphPtr graph, const vector &subgraphs, Context &context) { // Check if all subgraphs have been assigned a stream. for (const SubgraphPtr &subgraph : subgraphs) { const string &engine_name = subgraph->engine_conf.id; @@ -353,7 +367,7 @@ Status NodeStreamUpdatePass::Run(ComputeGraphPtr whole_graph, const vectorGetDirectNode()) { + for (NodePtr &node : graph->GetDirectNode()) { GE_CHECK_NOTNULL(node->GetOpDesc()); node->GetOpDesc()->SetStreamId(kInvalidStream); } @@ -375,76 +389,11 @@ Status NodeStreamUpdatePass::Run(ComputeGraphPtr whole_graph, const vector &subgraphs, Context &context) { - if (!context.hcom_parallel) { - return NOT_CHANGED; - } - - GELOGI("AllReduceParallelPass is enabled."); - GraphUtils::DumpGEGraph(whole_graph, "BeforeAllReduceParallel"); - - // All successors of HcomAllReduce. - set all_reduce_succs; - - for (const NodePtr &node : whole_graph->GetDirectNode()) { - if (node->GetType() != HCOMALLREDUCE || node->GetInDataNodes().size() <= 1) { - continue; - } - - string reduce_stream_label; - GE_CHECK_NOTNULL(node->GetOpDesc()); - // ATTR_NAME_STREAM_LABEL is optional. - (void)AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, reduce_stream_label); - - set cur_nodes = {node}; - while (!cur_nodes.empty()) { - set all_out_data_nodes; - for (auto &curr_node : cur_nodes) { - for (const NodePtr &out_node : curr_node->GetOutDataNodes()) { - string out_stream_label; - GE_CHECK_NOTNULL(out_node->GetOpDesc()); - // ATTR_NAME_STREAM_LABEL is optional. - (void)AttrUtils::GetStr(out_node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, out_stream_label); - if (out_stream_label == reduce_stream_label) { - all_reduce_succs.emplace(out_node); - all_out_data_nodes.emplace(out_node); - } - } - } - cur_nodes = all_out_data_nodes; - } - } - - map old_stream_to_new; - for (const NodePtr &node : all_reduce_succs) { - GE_CHECK_NOTNULL(node->GetOpDesc()); - auto old_stream = node->GetOpDesc()->GetStreamId(); - if (old_stream != kInvalidStream) { - int64_t new_stream = kInvalidStream; - auto iter = old_stream_to_new.find(old_stream); - if (iter != old_stream_to_new.end()) { - new_stream = iter->second; - } else { - new_stream = context.next_stream; - context.next_stream++; - old_stream_to_new.emplace(old_stream, new_stream); - } - - GELOGI("Stream of node %s has been updated from %ld to %ld.", node->GetName().c_str(), old_stream, new_stream); - node->GetOpDesc()->SetStreamId(new_stream); - } - } - - return !all_reduce_succs.empty() ? 
SUCCESS : NOT_CHANGED; -} - int64_t NodeStreamUpdatePass::GetSingleInoutStream(const NodePtr &node) const { set stream_ids; @@ -472,11 +421,11 @@ int64_t NodeStreamUpdatePass::GetSingleInoutStream(const NodePtr &node) const { return kInvalidStream; } -Status NodeStreamUpdatePass::UpdateForSkippedEngine(const ComputeGraphPtr &whole_graph, +Status NodeStreamUpdatePass::UpdateForSkippedEngine(const ComputeGraphPtr &graph, const vector &subgraphs) { set nodes_to_be_updated; - // Check if sub graph is engine skipped and without stream label or not + // Check if subgraph is engine skipped and without stream label or not for (const SubgraphPtr &subgraph : subgraphs) { if (IsEngineSkip(*subgraph) && !HasStreamLabel(*subgraph)) { auto graph = subgraph->subgraph_info.GetSubGraph(); @@ -492,7 +441,7 @@ Status NodeStreamUpdatePass::UpdateForSkippedEngine(const ComputeGraphPtr &whole } // Try reassign the stream id - for (ge::NodePtr &node : whole_graph->GetDirectNode()) { + for (ge::NodePtr &node : graph->GetDirectNode()) { auto op_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); int64_t stream_id = op_desc->GetStreamId(); @@ -509,6 +458,7 @@ Status NodeStreamUpdatePass::UpdateForSkippedEngine(const ComputeGraphPtr &whole } } } + return SUCCESS; } @@ -525,40 +475,65 @@ bool NodeStreamUpdatePass::AreAllPredStreamsInvalid(const NodePtr &node) const { return true; } -void NodeStreamUpdatePass::RefreshContinuousStreams(ComputeGraphPtr whole_graph, Context &context) const { - int64_t stream_num = context.next_stream; - vector stream_has_node(stream_num); +Status AllReduceParallelPass::Run(ComputeGraphPtr graph, const vector &subgraphs, Context &context) { + if (!context.hcom_parallel) { + return NOT_CHANGED; + } - for (const NodePtr &node : whole_graph->GetDirectNode()) { - if (node != nullptr) { - auto op_desc = node->GetOpDesc(); - if (op_desc != nullptr) { - int64_t stream_id = op_desc->GetStreamId(); - if (stream_id != kInvalidStream && stream_id < stream_num) { - stream_has_node[stream_id] = true; - } - } + GELOGI("AllReduceParallelPass is enabled."); + GraphUtils::DumpGEGraph(graph, "BeforeAllReduceParallel"); + + // All successors of HcomAllReduce. 
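+  // Collected by the breadth-first walk below over out-data edges: starting from each
+  // qualifying HcomAllReduce node, only successors whose (optional) stream label matches the
+  // reduce node's own label are kept, so branches with a different label stay on their streams.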
+ set all_reduce_succs; + + for (const NodePtr &node : graph->GetDirectNode()) { + if (node->GetType() != HCOMALLREDUCE || node->GetInDataNodes().size() <= 1) { + continue; } - } - context.next_stream = 0; - vector old_to_new_streams(stream_num, kInvalidStream); - for (size_t old_stream = 0; old_stream < stream_has_node.size(); ++old_stream) { - if (stream_has_node[old_stream]) { - old_to_new_streams[old_stream] = context.next_stream; - ++context.next_stream; + string reduce_stream_label; + GE_CHECK_NOTNULL(node->GetOpDesc()); + (void)AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, reduce_stream_label); + + set cur_nodes = {node}; + while (!cur_nodes.empty()) { + set all_out_data_nodes; + for (auto &curr_node : cur_nodes) { + for (const NodePtr &out_node : curr_node->GetOutDataNodes()) { + string out_stream_label; + GE_CHECK_NOTNULL(out_node->GetOpDesc()); + (void)AttrUtils::GetStr(out_node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, out_stream_label); + if (out_stream_label == reduce_stream_label) { + all_reduce_succs.emplace(out_node); + all_out_data_nodes.emplace(out_node); + } + } + } + cur_nodes = all_out_data_nodes; } } - for (const NodePtr &node : whole_graph->GetDirectNode()) { - auto op_desc = node->GetOpDesc(); - if (op_desc != nullptr) { - int64_t stream_id = op_desc->GetStreamId(); - if (stream_id != kInvalidStream && stream_id < stream_num) { - op_desc->SetStreamId(old_to_new_streams[stream_id]); + map old_stream_to_new; + for (const NodePtr &node : all_reduce_succs) { + GE_CHECK_NOTNULL(node->GetOpDesc()); + auto old_stream = node->GetOpDesc()->GetStreamId(); + if (old_stream != kInvalidStream) { + int64_t new_stream = kInvalidStream; + auto iter = old_stream_to_new.find(old_stream); + if (iter != old_stream_to_new.end()) { + new_stream = iter->second; + } else { + new_stream = context.next_stream; + context.next_stream++; + old_stream_to_new.emplace(old_stream, new_stream); } + + GELOGI("Stream of node %s has been updated from %ld to %ld.", node->GetName().c_str(), old_stream, new_stream); + node->GetOpDesc()->SetStreamId(new_stream); } } + + return !all_reduce_succs.empty() ? 
SUCCESS : NOT_CHANGED; } LogicalStreamAllocator::LogicalStreamAllocator(const map &scheduler_confs, @@ -567,9 +542,10 @@ LogicalStreamAllocator::LogicalStreamAllocator(const map context_.hcom_parallel = hcom_parallel; } -Status LogicalStreamAllocator::Assign(const ComputeGraphPtr &whole_graph, const vector &subgraph_infos, +Status LogicalStreamAllocator::Assign(const ComputeGraphPtr &whole_graph, const Graph2SubGraphInfoList &subgraph_map, int64_t &stream_num) { GE_CHECK_NOTNULL(whole_graph); + map engine_confs; GE_TIMESTAMP_START(InitEngineConfs); for (const auto &item : scheduler_confs_) { @@ -583,16 +559,64 @@ Status LogicalStreamAllocator::Assign(const ComputeGraphPtr &whole_graph, const } GE_TIMESTAMP_END(InitEngineConfs, "GraphBuilder::AssignStreamInitEngineConfs"); + Status status = DoAssign(whole_graph, subgraph_map, engine_confs); + if (status != SUCCESS) { + GELOGE(status, "Assign streams failed."); + return status; + } + + vector subgraphs = whole_graph->GetAllSubgraphs(); + for (const ComputeGraphPtr &subgraph : subgraphs) { + Status status = DoAssign(subgraph, subgraph_map, engine_confs); + if (status != SUCCESS) { + GELOGE(status, "Assign streams failed."); + return status; + } + } + + RefreshContinuousStreams(whole_graph); + + stream_num = context_.next_stream; + GELOGI("Assigned logical stream num: %ld.", stream_num); + + return SUCCESS; +} + +Status LogicalStreamAllocator::DoAssign(const ComputeGraphPtr &graph, const Graph2SubGraphInfoList &subgraph_map, + const map &engine_confs) { + GE_CHECK_NOTNULL(graph); + + NodePtr parent_node = graph->GetParentNode(); + if (parent_node == nullptr || parent_node->GetOpDesc() == nullptr) { + context_.parent_stream = kInvalidStream; + } else { + context_.parent_stream = parent_node->GetOpDesc()->GetStreamId(); + } + + auto iter = subgraph_map.find(graph); + if (iter == subgraph_map.end()) { + GELOGE(FAILED, "Graph %s not found.", graph->GetName().c_str()); + return FAILED; + } + + const vector &subgraph_info_list = iter->second; vector subgraphs; GE_TIMESTAMP_START(ConvertSubgraphs); - Status status = ConvertSubgraphs(subgraph_infos, engine_confs, subgraphs); + Status status = ConvertSubgraphs(subgraph_info_list, engine_confs, subgraphs); GE_TIMESTAMP_END(ConvertSubgraphs, "GraphBuilder::AssignStreamConvertSubgraphs"); if (status != SUCCESS) { GELOGE(status, "Create subgraphs failed."); return status; } - return RunPasses(whole_graph, subgraphs, stream_num); + GELOGI("Subgraphs of graph %s:", graph->GetName().c_str()); + for (const auto &subgraph : subgraphs) { + if (subgraph != nullptr) { + GELOGI("subgraph: %s", subgraph->name.c_str()); + } + } + + return RunPasses(graph, subgraphs); } Status LogicalStreamAllocator::ConvertSubgraphs(const vector &subgraph_infos, @@ -631,8 +655,7 @@ Status LogicalStreamAllocator::ConvertSubgraphs(const vector &s return SUCCESS; } -Status LogicalStreamAllocator::RunPasses(const ComputeGraphPtr &whole_graph, const vector &subgraphs, - int64_t &stream_num) { +Status LogicalStreamAllocator::RunPasses(const ComputeGraphPtr &graph, const vector &subgraphs) { vector passes; passes.emplace_back(MakeShared()); passes.emplace_back(MakeShared()); @@ -643,7 +666,7 @@ Status LogicalStreamAllocator::RunPasses(const ComputeGraphPtr &whole_graph, con for (auto &pass : passes) { GE_CHECK_NOTNULL(pass); - Status status = pass->Run(whole_graph, subgraphs, context_); + Status status = pass->Run(graph, subgraphs, context_); if (status == SUCCESS) { GELOGI("Stream pass %s return SUCCESS.", pass->GetName().c_str()); } else 
if (status == NOT_CHANGED) { @@ -654,9 +677,42 @@ Status LogicalStreamAllocator::RunPasses(const ComputeGraphPtr &whole_graph, con } } - stream_num = context_.next_stream; - GELOGI("Assigned logical stream num: %ld.", stream_num); - return SUCCESS; } + +void LogicalStreamAllocator::RefreshContinuousStreams(const ComputeGraphPtr &graph) { + int64_t stream_num = context_.next_stream; + vector stream_has_node(stream_num); + + for (const NodePtr &node : graph->GetAllNodes()) { + if (node != nullptr) { + auto op_desc = node->GetOpDesc(); + if (op_desc != nullptr) { + int64_t stream_id = op_desc->GetStreamId(); + if (stream_id != kInvalidStream && stream_id < stream_num) { + stream_has_node[stream_id] = true; + } + } + } + } + + context_.next_stream = 0; + vector old_to_new_streams(stream_num, kInvalidStream); + for (size_t old_stream = 0; old_stream < stream_has_node.size(); ++old_stream) { + if (stream_has_node[old_stream]) { + old_to_new_streams[old_stream] = context_.next_stream; + ++context_.next_stream; + } + } + + for (const NodePtr &node : graph->GetAllNodes()) { + auto op_desc = node->GetOpDesc(); + if (op_desc != nullptr) { + int64_t stream_id = op_desc->GetStreamId(); + if (stream_id != kInvalidStream && stream_id < stream_num) { + op_desc->SetStreamId(old_to_new_streams[stream_id]); + } + } + } +} } // namespace ge diff --git a/src/ge/graph/build/logical_stream_allocator.h b/src/ge/graph/build/logical_stream_allocator.h index 2265a0f3..404d22f9 100644 --- a/src/ge/graph/build/logical_stream_allocator.h +++ b/src/ge/graph/build/logical_stream_allocator.h @@ -60,7 +60,7 @@ class LogicalStreamPass { }; struct Context { - // Next stream id. + int64_t parent_stream = kInvalidStream; int64_t next_stream = 0; bool hcom_parallel = false; }; @@ -71,7 +71,7 @@ class LogicalStreamPass { virtual ~LogicalStreamPass() = default; const std::string &GetName() const; - virtual Status Run(ComputeGraphPtr whole_graph, const std::vector &subgraphs, Context &context) = 0; + virtual Status Run(ComputeGraphPtr graph, const std::vector &subgraphs, Context &context) = 0; protected: bool IsEngineSkip(const Subgraph &subgraph) const; @@ -93,21 +93,21 @@ using LogicalStreamPassPtr = std::shared_ptr; class AssignByLabelPass : public LogicalStreamPass { public: STREAM_PASS_DEFAULT_FUNC(AssignByLabelPass); - Status Run(ComputeGraphPtr whole_graph, const std::vector &subgraphs, Context &context) override; + Status Run(ComputeGraphPtr graph, const std::vector &subgraphs, Context &context) override; }; // Engines such as hccl require independent Stream. class IndependentStreamPass : public LogicalStreamPass { public: STREAM_PASS_DEFAULT_FUNC(IndependentStreamPass); - Status Run(ComputeGraphPtr whole_graph, const std::vector &subgraphs, Context &context) override; + Status Run(ComputeGraphPtr graph, const std::vector &subgraphs, Context &context) override; }; // Reuse streams or assign new streams based on dependencies. 
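// As implemented in the .cc changes above: a subgraph first tries to inherit a stream from an
// upstream subgraph it depends on (recorded in reused_subgraphs_, see UpdateReusedSubgraphs);
// only when nothing is reusable does AssignNewStream hand out the owning engine's next
// engine-local id, which UpdateAssignedSubgraphs later rebases onto globally unique stream ids.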
class AssignByDependencyPass : public LogicalStreamPass { public: STREAM_PASS_DEFAULT_FUNC(AssignByDependencyPass); - Status Run(ComputeGraphPtr whole_graph, const std::vector &subgraphs, Context &context) override; + Status Run(ComputeGraphPtr graph, const std::vector &subgraphs, Context &context) override; private: void InitEndSubgraphMap(const std::vector &subgraphs, std::map &end_subgraph_map); @@ -132,7 +132,7 @@ class AssignByDependencyPass : public LogicalStreamPass { std::map engine_stream_num_; // Subgraphs of assign stream by engine - std::set assigned_subgraphs_; + std::vector assigned_subgraphs_; // std::vector> reused_subgraphs_; @@ -142,7 +142,7 @@ class AssignByDependencyPass : public LogicalStreamPass { class NodeStreamUpdatePass : public LogicalStreamPass { public: STREAM_PASS_DEFAULT_FUNC(NodeStreamUpdatePass); - Status Run(ComputeGraphPtr whole_graph, const std::vector &subgraphs, Context &context) override; + Status Run(ComputeGraphPtr graph, const std::vector &subgraphs, Context &context) override; private: /// Optimize for case like: @@ -150,19 +150,18 @@ class NodeStreamUpdatePass : public LogicalStreamPass { /// To case: /// NodeA(stream1) -> Const(stream1) -> NodeB(stream1) /// Which could reduce event number (Const could be other type which belong to skipped engine subgraph) - Status UpdateForSkippedEngine(const ComputeGraphPtr &whole_graph, const std::vector &subgraphs); + Status UpdateForSkippedEngine(const ComputeGraphPtr &graph, const std::vector &subgraphs); int64_t GetSingleInoutStream(const NodePtr &node) const; // Judge if all predecessors' streams of node are INVALID_STREAM bool AreAllPredStreamsInvalid(const NodePtr &node) const; - void RefreshContinuousStreams(ComputeGraphPtr whole_graph, Context &context) const; }; // AllReduce and backward operators execute in parallel. class AllReduceParallelPass : public LogicalStreamPass { public: STREAM_PASS_DEFAULT_FUNC(AllReduceParallelPass); - Status Run(ComputeGraphPtr whole_graph, const std::vector &subgraphs, Context &context) override; + Status Run(ComputeGraphPtr graph, const std::vector &subgraphs, Context &context) override; }; // Assign logical streams which is not limited by the number of tasks. 
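The switch from a flat SubGraphInfo vector to Graph2SubGraphInfoList is what makes the per-graph walk in the .cc changes possible: each ComputeGraph, root graph or subgraph, maps to the SubGraphInfo list produced when it was partitioned, and DoAssign looks up its own list before running the passes. A sketch of the assumed shape of that type follows; its definition lives outside this diff, so the exact spelling is an assumption rather than part of the patch:

// Assumed definition; maps every root graph and subgraph to the
// SubGraphInfo list produced for it by graph partitioning.
using Graph2SubGraphInfoList = std::unordered_map<ComputeGraphPtr, std::vector<SubGraphInfoPtr>>;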
@@ -178,13 +177,16 @@ class LogicalStreamAllocator { LogicalStreamAllocator &operator=(const LogicalStreamAllocator &) = delete; ~LogicalStreamAllocator() = default; - Status Assign(const ComputeGraphPtr &whole_graph, const std::vector &subgraphs, int64_t &stream_num); + Status Assign(const ComputeGraphPtr &whole_graph, const Graph2SubGraphInfoList &subgraph_map, int64_t &stream_num); private: + Status DoAssign(const ComputeGraphPtr &graph, const Graph2SubGraphInfoList &subgraph_map, + const map &engine_confs); Status ConvertSubgraphs(const std::vector &subgraph_infos, const std::map &engine_confs, std::vector &subgraphs); - Status RunPasses(const ComputeGraphPtr &whole_graph, const std::vector &subgraphs, int64_t &stream_num); + Status RunPasses(const ComputeGraphPtr &graph, const std::vector &subgraphs); + void RefreshContinuousStreams(const ComputeGraphPtr &graph); const std::map &scheduler_confs_; const std::map &max_parallel_num_; diff --git a/src/ge/graph/build/memory/block_mem_assigner.cc b/src/ge/graph/build/memory/block_mem_assigner.cc index 77860e4d..4f55a569 100644 --- a/src/ge/graph/build/memory/block_mem_assigner.cc +++ b/src/ge/graph/build/memory/block_mem_assigner.cc @@ -805,6 +805,9 @@ void SetOffsetSize(const NodeTypeIndex &node_type_index, int64_t offset, size_t } } op_desc->SetOutputOffset(output_list); + GELOGI("[IMAS]Set %s name[%s] output[%d] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu].", + graph_name.c_str(), op_desc->GetName().c_str(), node_type_index.index, offset, op_desc->GetStreamId(), size, + real_size); } else if (node_type_index.mem_type == kWorkspace) { vector workspace_list; workspace_list = op_desc->GetWorkspace(); @@ -821,6 +824,9 @@ void SetOffsetSize(const NodeTypeIndex &node_type_index, int64_t offset, size_t workspace_list.at(node_type_index.index) = offset; } op_desc->SetWorkspace(workspace_list); + GELOGI("[IMAS]Set %s name[%s] workspace[%u] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu].", + graph_name.c_str(), op_desc->GetName().c_str(), node_type_index.index, offset, op_desc->GetStreamId(), size, + real_size); } } diff --git a/src/ge/graph/build/memory/graph_mem_assigner.cc b/src/ge/graph/build/memory/graph_mem_assigner.cc index 33e8fcad..bcae79ea 100644 --- a/src/ge/graph/build/memory/graph_mem_assigner.cc +++ b/src/ge/graph/build/memory/graph_mem_assigner.cc @@ -310,6 +310,11 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node) if (is_tensor_actual_size == 0) { AlignMemOffset(MEM_ALIGN_SIZE); } + GELOGI( + "[IMAS]Continuous input : Set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%zu] " + "real_size[%ld].", + node->GetOwnerComputeGraph()->GetName().c_str(), peer_op_desc->GetName().c_str(), peer_out_data_anchor->GetIdx(), + pre_mem_offset, peer_op_desc->GetStreamId(), (memory_offset_[0].mem_offset_ - pre_mem_offset), tensor_desc_size); } return SUCCESS; @@ -340,6 +345,11 @@ Status GraphMemoryAssigner::AssignContinuousOutputMemory(const ge::NodePtr &node memory_offset_[0].mem_offset_ += tensor_desc_size; AlignMemOffset(MEM_ALIGN_SIZE); + GELOGI( + "[IMAS]Continuous output : Set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%zu] " + "real_size[%ld].", + node->GetOwnerComputeGraph()->GetName().c_str(), out_op_desc->GetName().c_str(), out_data_anchor->GetIdx(), + pre_mem_offset, out_op_desc->GetStreamId(), (memory_offset_[0].mem_offset_ - pre_mem_offset), tensor_desc_size); } out_op_desc->SetOutputOffset(output_list); @@ -413,8 +423,10 @@ Status 
GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousInputMemory() { pre_mem_offset, peer_op_desc->GetStreamId(), out_size, output_mem_size); } memory_offset_[0].mem_offset_ += extra_memory_size; - GELOGI("After reassign virtual input node[name:%s, type:%s] memory, memory offset = %zu.", - op_desc->GetName().c_str(), op_desc->GetType().c_str(), memory_offset_[0].mem_offset_); + size_t after_mem_offset = memory_offset_[0].mem_offset_; + AlignMemOffset(MEM_ALIGN_SIZE); + GELOGI("After reassign virtual input node[name:%s, type:%s] memory, memory offset = %zu, align memory = %zu.", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), after_mem_offset, memory_offset_[0].mem_offset_); } } return SUCCESS; @@ -499,8 +511,10 @@ Status GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousOutputMemory() { } op_desc->SetOutputOffset(output_list); memory_offset_[0].mem_offset_ += extra_memory_size; - GELOGI("After reassign virtual output node[name:%s, type:%s] memory, memory offset = %zu.", - op_desc->GetName().c_str(), op_desc->GetType().c_str(), memory_offset_[0].mem_offset_); + size_t after_mem_offset = memory_offset_[0].mem_offset_; + AlignMemOffset(MEM_ALIGN_SIZE); + GELOGI("After reassign virtual output node[name:%s, type:%s] memory, memory offset = %zu, align memory = %zu.", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), after_mem_offset, memory_offset_[0].mem_offset_); } } return SUCCESS; @@ -567,6 +581,11 @@ Status GraphMemoryAssigner::ReAssignMergeMemory() { output_list[index] = data_output_offset; src_node->GetOpDesc()->SetOutputOffset(output_list); + GELOGI( + "[IMAS]ReAssignMergeMemory : Set %s name[%s] output[%d] offset to [%ld] stream_id[%ld] size[%ld] " + "real_size[%ld].", + n->GetOwnerComputeGraph()->GetName().c_str(), src_node->GetOpDesc()->GetName().c_str(), index, + data_output_offset, src_node->GetOpDesc()->GetStreamId(), max_output_size, max_output_size); input_list.emplace_back(data_output_offset); } @@ -897,6 +916,9 @@ Status GraphMemoryAssigner::AssignAtomicOutputMemory(const ge::NodePtr &node) { } output_list[output_index] = memory_offset_[0].mem_offset_; + GELOGI("[IMAS]Atomic output : Set %s name[%s] output[%ld] offset to [%zu] stream_id[%ld] size[%ld] real_size[%ld].", + compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), output_index, memory_offset_[0].mem_offset_, + op_desc->GetStreamId(), size, size); memory_offset_[0].mem_offset_ += size; AlignMemOffset(MEM_ALIGN_SIZE); @@ -933,6 +955,11 @@ Status GraphMemoryAssigner::AssignOrdinaryAtomicWorkspaceMemory(const ge::OpDesc } workspace_vector[workspace_index] = memory_offset_[0].mem_offset_; + GELOGI( + "[IMAS]Atomic ordinary workspace : Set %s name[%s] workspace[%lu] offset to [%zu] stream_id[%ld] " + "size[%ld] real_size[%ld].", + compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), workspace_index, memory_offset_[0].mem_offset_, + op_desc->GetStreamId(), workspace_size, workspace_size); memory_offset_[0].mem_offset_ += workspace_size; } @@ -958,6 +985,11 @@ Status GraphMemoryAssigner::AssignFusionAtomicWorkspaceMemory(const ge::OpDescPt auto workspace_size = info_iter.second; size_t workspace_offset = memory_offset_[0].mem_offset_; + GELOGI( + "[IMAS]Atomic fusion workspace : Set %s name[%s] workspace[%lu] offset to [%zu] stream_id[%ld] size[%ld] " + "real_size[%ld].", + compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), workspace_index, memory_offset_[0].mem_offset_, + op_desc->GetStreamId(), workspace_size, workspace_size); memory_offset_[0].mem_offset_ += 
workspace_size; index_offset.insert(std::make_pair(workspace_index, workspace_offset)); @@ -1005,7 +1037,8 @@ ge::Status GraphMemoryAssigner::SetInputOffset() { GELOGE(FAILED, "memory_offset_ is empty."); return FAILED; } - GEEVENT("[IMAS]AfterAssignMemory : %s", compute_graph_->GetName().c_str()); + GEEVENT("[IMAS]AfterAssignMemory : %s memoffset[%zu]", compute_graph_->GetName().c_str(), + memory_offset_[0].mem_offset_); for (const ge::NodePtr &node : compute_graph_->GetDirectNode()) { if (UpdateOpInputOffset(node) != ge::SUCCESS) { GELOGE(ge::FAILED, "Update op input offset failed"); @@ -1166,6 +1199,12 @@ ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &n, int64_t ato GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(node_op_desc, ATTR_NAME_AUTOMIC_ADD_MEM_SIZE, mem_size_vector), GELOGE(FAILED, "SetListInt failed."); return FAILED); + + GELOGI( + "[IMAS]SetAtomicCleanAttr : Set %s name[%s] output[%d] offset to [%ld] streamid[%ld] size[%ld] " + "realsize[%ld].", + node->GetOwnerComputeGraph()->GetName().c_str(), node_op_desc->GetName().c_str(), 0, atomic_mem_start, + node->GetOpDesc()->GetStreamId(), atomic_mem_size, atomic_mem_size); } } return SUCCESS; diff --git a/src/ge/graph/build/model_builder.cc b/src/ge/graph/build/model_builder.cc index af641dcc..ac61eeeb 100644 --- a/src/ge/graph/build/model_builder.cc +++ b/src/ge/graph/build/model_builder.cc @@ -28,6 +28,7 @@ #include "graph/common/omg_util.h" #include "graph/debug/ge_attr_define.h" #include "graph/ge_attr_value.h" +#include "graph/ge_context.h" #include "graph/ge_error_codes.h" #include "graph/manager/graph_mem_allocator.h" #include "graph/manager/graph_var_manager.h" @@ -39,7 +40,6 @@ #include "graph/utils/op_desc_utils.h" #include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" -#include "graph/ge_context.h" #include "init/gelib.h" #include "memory/memory_assigner.h" #include "omg/version.h" @@ -78,15 +78,16 @@ bool IsGeLocalOp(const ge::ConstOpDescPtr &op_desc) { ge::GeTensorDesc output_desc = op_desc->GetOutputDesc(0); return !(output_desc.GetDataType() == ge::DT_STRING); } - const set<string> ge_local_set = { - ge::STREAMMERGE, ge::MEMCPYASYNC, ge::STREAMACTIVE, ge::STREAMSWITCH, ge::VARIABLE, ge::NOOP, ge::CONSTANT, - ge::ENTER, ge::REFENTER, ge::LOOPCOND, ge::NEXTITERATION, ge::REFNEXTITERATION, ge::EXIT, ge::REFEXIT}; + const set<string> ge_local_set = {ge::STREAMMERGE, ge::MEMCPYASYNC, ge::STREAMACTIVE, ge::STREAMSWITCH, + ge::VARIABLE, ge::NOOP, ge::CONSTANT, ge::ENTER, + ge::REFENTER, ge::LOOPCOND, ge::NEXTITERATION, ge::REFNEXTITERATION, + ge::EXIT, ge::REFEXIT, ge::MEMCPYADDRASYNC}; return (ge_local_set.find(type) != ge_local_set.end()); } } // namespace namespace ge { -ModelBuilder::ModelBuilder(ge::ComputeGraphPtr compute_graph, const vector<SubGraphInfoPtr> &subgraphs, +ModelBuilder::ModelBuilder(ge::ComputeGraphPtr compute_graph, const Graph2SubGraphInfoList &subgraphs, const map<std::string, int> &stream_max_parallel_num, bool hcom_parallel, int mode) : mem_offset_(0), weight_offset_(kWeightsStartOffset), @@ -225,6 +226,25 @@ Status ModelBuilder::SetInputOutputDesc() { if (!is_loop_graph_ && node_op_desc->GetType() == LOOPCOND) { is_loop_graph_ = true; } + // If the user sets the input node format to ND, the Data and NetOutput nodes are expected to + // be in ND format in the final graph.
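+  // Note that the block below rewrites both SetFormat and SetOriginFormat on every input and
+  // output desc, keeping the current and origin descriptions consistent for these boundary nodes.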
+ if ((domi::GetContext().format == domi::DOMI_TENSOR_ND) && + ((node_op_desc->GetType() == DATA_TYPE) || (node_op_desc->GetType() == NETOUTPUT))) { + GELOGI("The node [%s] format should be set ND.", node_op_desc->GetName().c_str()); + auto inputDescsPtr = node_op_desc->GetAllInputsDescPtr(); + auto outputDescsPtr = node_op_desc->GetAllOutputsDescPtr(); + ge::Format format = ge::FORMAT_ND; + for (auto &inputDescPtr : inputDescsPtr) { + GE_CHECK_NOTNULL(inputDescPtr); + inputDescPtr->SetFormat(format); + inputDescPtr->SetOriginFormat(format); + } + for (auto &outputDescPtr : outputDescsPtr) { + GE_CHECK_NOTNULL(outputDescPtr); + outputDescPtr->SetFormat(format); + outputDescPtr->SetOriginFormat(format); + } + } if (node_op_desc->GetType() == DATA_TYPE || node_op_desc->GetType() == AIPP_DATA_TYPE) { GELOGD("Data node: %s.", n->GetName().c_str()); diff --git a/src/ge/graph/build/model_builder.h b/src/ge/graph/build/model_builder.h index 4bf03bdc..072126e3 100644 --- a/src/ge/graph/build/model_builder.h +++ b/src/ge/graph/build/model_builder.h @@ -37,7 +37,7 @@ namespace ge { class ModelBuilder { public: - ModelBuilder(ge::ComputeGraphPtr whole_graph, const std::vector &subgraphs, + ModelBuilder(ge::ComputeGraphPtr whole_graph, const Graph2SubGraphInfoList &subgraphs, const std::map &stream_max_parallel_num, bool hcom_parallel, int mode = static_cast(domi::BuildMode::GEN_TASK_WITHOUT_FUSION)); @@ -85,7 +85,7 @@ class ModelBuilder { ge::ComputeGraphPtr compute_graph_; - const std::vector &subgraphs_; + const Graph2SubGraphInfoList &subgraphs_; int64_t stream_num_; diff --git a/src/ge/graph/build/run_context.cc b/src/ge/graph/build/run_context.cc index e3230f5e..f2a41271 100644 --- a/src/ge/graph/build/run_context.cc +++ b/src/ge/graph/build/run_context.cc @@ -164,6 +164,9 @@ Status RunContextUtil::CreateRunContext(Model &model, const ComputeGraphPtr &gra return ret; } + GELOGI("CreateRunContext: data_mem_base_ = %p, weight_mem_base_ = %p, memory_size = %lu, weight_size = %lu", + data_mem_base_, weight_mem_base_, data_mem_size_, weight_mem_size_); + run_context_ = {rt_model_, nullptr, session_id, data_mem_size_, data_mem_base_, weight_mem_size_, weight_mem_base_, buffer, stream_list_, event_list_, label_list_}; return SUCCESS; diff --git a/src/ge/graph/build/stream_allocator.cc b/src/ge/graph/build/stream_allocator.cc index ffcc2315..88c5e055 100644 --- a/src/ge/graph/build/stream_allocator.cc +++ b/src/ge/graph/build/stream_allocator.cc @@ -40,7 +40,7 @@ const uint32_t kMaxSwitchStreamNum = 1; namespace ge { Status StreamAllocator::AssignLogicalStreams(const std::map &max_parallel_num, bool hcom_parallel) { - GELOGI("AssignLogicalStreams start."); + GELOGI("Assign logical streams start."); GE_CHECK_NOTNULL(whole_graph_); GraphUtils::DumpGEGraph(whole_graph_, "BeforeAssignedLogicalStreams"); GraphUtils::DumpGEGraphToOnnx(*whole_graph_, "BeforeAssignedLogicalStreams"); @@ -52,7 +52,6 @@ Status StreamAllocator::AssignLogicalStreams(const std::map &m } const map &scheduler_confs = gelib->DNNEngineManagerObj().GetSchedulers(); - LogicalStreamAllocator logical_allocator(scheduler_confs, max_parallel_num, hcom_parallel); Status status = logical_allocator.Assign(whole_graph_, subgraphs_, stream_num_); if (status != SUCCESS) { @@ -62,7 +61,7 @@ Status StreamAllocator::AssignLogicalStreams(const std::map &m GraphUtils::DumpGEGraph(whole_graph_, "AfterAssignedLogicalStreams"); GraphUtils::DumpGEGraphToOnnx(*whole_graph_, "AfterAssignedLogicalStreams"); - GELOGI("AssignLogicalStreams success."); + 
GELOGI("Assign logical streams success."); return SUCCESS; } @@ -136,7 +135,7 @@ Status StreamAllocator::RefreshRealStream(int64_t &stream_num, int64_t &event_nu GELOGI("None of nodes need to assign stream, stream num is 0, it will cause error, so change it to 1"); stream_num_ = 1; } - GELOGI("stream_num_: %ld, event_num_: %u.", stream_num_, event_num_); + GELOGI("stream num: %ld, event num: %u.", stream_num_, event_num_); GELOGI("RefreshRealStream successfully."); stream_num = stream_num_; @@ -148,7 +147,7 @@ Status StreamAllocator::RefreshRealStream(int64_t &stream_num, int64_t &event_nu // Split the stream according to the maximum number of nodes in the stream. Status StreamAllocator::SplitStreams() { if (stream_num_ == 0) { - GELOGI("stream_num_ is 0"); + GELOGI("The number of streams is 0 and no need to split."); return SUCCESS; } diff --git a/src/ge/graph/build/stream_allocator.h b/src/ge/graph/build/stream_allocator.h index e3901205..a18e00d7 100644 --- a/src/ge/graph/build/stream_allocator.h +++ b/src/ge/graph/build/stream_allocator.h @@ -30,7 +30,7 @@ namespace ge { class StreamAllocator { public: - StreamAllocator(ComputeGraphPtr whole_graph, const std::vector &subgraphs) + StreamAllocator(ComputeGraphPtr whole_graph, const Graph2SubGraphInfoList &subgraphs) : whole_graph_(std::move(whole_graph)), subgraphs_(subgraphs) {} StreamAllocator(const StreamAllocator &) = delete; StreamAllocator &operator=(const StreamAllocator &) = delete; @@ -75,7 +75,7 @@ class StreamAllocator { bool IsRecvNodeActivatedBySendNode(const NodePtr &send_node_ptr, const NodePtr &recv_node_ptr) const; ComputeGraphPtr whole_graph_; - const std::vector &subgraphs_; + const Graph2SubGraphInfoList &subgraphs_; int64_t stream_num_{0}; uint32_t event_num_{0}; diff --git a/src/ge/graph/build/stream_graph_optimizer.cc b/src/ge/graph/build/stream_graph_optimizer.cc index 6e0211de..42d1afc1 100644 --- a/src/ge/graph/build/stream_graph_optimizer.cc +++ b/src/ge/graph/build/stream_graph_optimizer.cc @@ -29,19 +29,21 @@ static const int64_t kInvalidStream = -1; namespace ge { StreamGraphOptimizer::~StreamGraphOptimizer() {} -void StreamGraphOptimizer::RefreshNodeId(const ComputeGraphPtr &comp_graph, vector &subgraph_infos) { +void StreamGraphOptimizer::RefreshNodeId(const ComputeGraphPtr &comp_graph, Graph2SubGraphInfoList &subgraph_map) { size_t node_size = comp_graph->GetDirectNodesSize(); GELOGI("Refresh placeholder and end nodeId start from node num: %zu", node_size); - for (const auto &sub_graph_info : subgraph_infos) { - ComputeGraphPtr sub_graph = sub_graph_info->GetSubGraph(); - if (sub_graph == nullptr) { - continue; - } - for (ge::NodePtr &node : sub_graph->GetDirectNode()) { - GE_CHECK_NOTNULL_EXEC(node->GetOpDesc(), return ); - if ((node->GetType() == END) || (node->GetType() == PLACEHOLDER)) { - node->GetOpDesc()->SetId(static_cast(node_size)); - node_size++; + for (const auto &subgraph_pair : subgraph_map) { + for (const auto &subgraph_info : subgraph_pair.second) { + ComputeGraphPtr subgraph = subgraph_info->GetSubGraph(); + if (subgraph == nullptr) { + continue; + } + for (ge::NodePtr &node : subgraph->GetDirectNode()) { + GE_CHECK_NOTNULL_EXEC(node->GetOpDesc(), return ); + if ((node->GetType() == END) || (node->GetType() == PLACEHOLDER)) { + node->GetOpDesc()->SetId(static_cast(node_size)); + node_size++; + } } } } @@ -71,67 +73,71 @@ bool StreamGraphOptimizer::IsSameStreamId(const ComputeGraphPtr &comp_graph) { } Status StreamGraphOptimizer::OptimizeStreamedSubGraph(const ComputeGraphPtr &comp_graph, 
- vector &subgraph_infos, + Graph2SubGraphInfoList &subgraph_map, struct RunContext &run_context) { - Status ret = SUCCESS; - GELOGI("Begin to Get optimize streamed subgraph."); + GELOGI("Optimize streamed subgraph start."); - RefreshNodeId(comp_graph, subgraph_infos); + RefreshNodeId(comp_graph, subgraph_map); std::shared_ptr instance = ge::GELib::GetInstance(); GE_CHECK_NOTNULL(instance); - for (auto &sub_graph_info : subgraph_infos) { - ComputeGraphPtr sub_graph = sub_graph_info->GetSubGraph(); - if (sub_graph == nullptr) { - continue; - } + for (const auto &subgraph_pair : subgraph_map) { + for (const auto &subgraph_info : subgraph_pair.second) { + ComputeGraphPtr subgraph = subgraph_info->GetSubGraph(); + GE_CHECK_NOTNULL(subgraph); - std::string engine_name = sub_graph_info->GetEngineName(); + GELOGI("Optimize subgraph %s", subgraph->GetName().c_str()); - vector graph_optimizers; - if (instance->DNNEngineManagerObj().IsEngineRegistered(engine_name)) { - instance->OpsKernelManagerObj().GetGraphOptimizerByEngine(engine_name, graph_optimizers); - GELOGI("Subgraph: %s start optimize streamed graph. engineName: %s, subgraph num: %zu, graph Optimizer num: %zu.", - sub_graph->GetName().c_str(), engine_name.c_str(), subgraph_infos.size(), graph_optimizers.size()); + std::string engine_name = subgraph_info->GetEngineName(); - auto nodes = sub_graph->GetDirectNode(); - if (nodes.empty()) { - continue; - } - if (!IsSameStreamId(sub_graph)) { - GELOGI("There are more than one stream in subgraph %s", sub_graph->GetName().c_str()); - continue; - } - OpDescPtr op_desc = nodes.at(0)->GetOpDesc(); - GE_CHECK_NOTNULL(op_desc); - int64_t stream_id = op_desc->GetStreamId(); - if (static_cast(stream_id) >= run_context.graphStreamList.size()) { - GELOGE(FAILED, "stream_id is bigger than run_context.graphStreamList.size()"); - return FAILED; - } - run_context.stream = run_context.graphStreamList[stream_id]; - GELOGD("Subgraph has same stream id, subgraph: %s, engine_name: %s, stream_id: %ld, rtstream: %lu.", - sub_graph->GetName().c_str(), engine_name.c_str(), stream_id, - static_cast(reinterpret_cast(run_context.stream))); - for (auto iter = graph_optimizers.begin(); iter != graph_optimizers.end(); ++iter) { - GE_CHECK_NOTNULL(*iter); - ret = (*iter)->OptimizeStreamGraph(*sub_graph, run_context); - if (ret != SUCCESS) { - GELOGE(ret, - "[optimizeStreamedSubGraph]: optimize streamed subgraph failed, subgraph: %s, engine_name: %s, graph " - "Optimizer num: %zu, ret: %u", - sub_graph->GetName().c_str(), engine_name.c_str(), graph_optimizers.size(), ret); - return ret; + vector graph_optimizers; + if (instance->DNNEngineManagerObj().IsEngineRegistered(engine_name)) { + instance->OpsKernelManagerObj().GetGraphOptimizerByEngine(engine_name, graph_optimizers); + GELOGI("Subgraph: %s start optimize streamed graph. 
engineName: %s, graph Optimizer num: %zu.", + subgraph->GetName().c_str(), engine_name.c_str(), graph_optimizers.size()); + + auto nodes = subgraph->GetDirectNode(); + if (nodes.empty()) { + continue; + } + if (!IsSameStreamId(subgraph)) { + GELOGI("There is more than one stream in subgraph %s", subgraph->GetName().c_str()); + continue; + } + OpDescPtr op_desc = nodes.at(0)->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + int64_t stream_id = op_desc->GetStreamId(); + if (static_cast(stream_id) >= run_context.graphStreamList.size()) { + GELOGE(FAILED, "stream_id %ld is bigger than run_context.graphStreamList.size() %zu", stream_id, + run_context.graphStreamList.size()); + return FAILED; + } + run_context.stream = run_context.graphStreamList[stream_id]; + GELOGD("Subgraph has same stream id, subgraph: %s, engine_name: %s, stream_id: %ld, rtstream: %lu.", + subgraph->GetName().c_str(), engine_name.c_str(), stream_id, + static_cast(reinterpret_cast(run_context.stream))); + for (auto iter = graph_optimizers.begin(); iter != graph_optimizers.end(); ++iter) { + GE_CHECK_NOTNULL(*iter); + Status ret = (*iter)->OptimizeStreamGraph(*subgraph, run_context); + if (ret != SUCCESS) { + GELOGE( + ret, + "[optimizeStreamedSubGraph]: optimize streamed subgraph failed, subgraph: %s, engine_name: %s, graph " + "Optimizer num: %zu, ret: %u", + subgraph->GetName().c_str(), engine_name.c_str(), graph_optimizers.size(), ret); + return ret; + } + GELOGI( + "[optimizeStreamedSubGraph]: optimize streamed subgraph success, subgraph: %s, engine_name: %s, graph " + "Optimizer num: %zu!", + subgraph->GetName().c_str(), engine_name.c_str(), graph_optimizers.size()); } - GELOGI( - "[optimizeStreamedSubGraph]: optimize streamed subgraph success, subgraph: %s, engine_name: %s, graph " - "Optimizer num: %zu!", - sub_graph->GetName().c_str(), engine_name.c_str(), graph_optimizers.size()); } } } - return ret; + GELOGI("Optimize streamed subgraph success."); + return SUCCESS; } } // namespace ge diff --git a/src/ge/graph/build/stream_graph_optimizer.h b/src/ge/graph/build/stream_graph_optimizer.h index a65f95f2..3133d32d 100644 --- a/src/ge/graph/build/stream_graph_optimizer.h +++ b/src/ge/graph/build/stream_graph_optimizer.h @@ -35,11 +35,11 @@ class StreamGraphOptimizer { virtual ~StreamGraphOptimizer(); - Status OptimizeStreamedSubGraph(const ComputeGraphPtr &comp_graph, std::vector &subgraph_ptr_list, + Status OptimizeStreamedSubGraph(const ComputeGraphPtr &comp_graph, Graph2SubGraphInfoList &subgraph_map, struct RunContext &run_context); private: - void RefreshNodeId(const ComputeGraphPtr &comp_graph, std::vector &subgraph_ptr_list); + void RefreshNodeId(const ComputeGraphPtr &comp_graph, Graph2SubGraphInfoList &subgraph_map); bool IsSameStreamId(const ComputeGraphPtr &comp_graph); }; diff --git a/src/ge/graph/build/task_generator.cc b/src/ge/graph/build/task_generator.cc index e8f6dd26..cc34e352 100644 --- a/src/ge/graph/build/task_generator.cc +++ b/src/ge/graph/build/task_generator.cc @@ -221,10 +221,8 @@ Status TaskGenerator::SaveL1fusionNodes(map> &l1_f if (call_check) { auto input_group_id = *input_group_ids.begin(); if (group_id != input_group_id) { - GELOGE(INTERNAL_ERROR, - "L1Fusion: node[name:%s(%s) with group id:%ld and diff from it's input nodes's group id:%ld ", + GELOGW("L1Fusion: node[name:%s(%s)] with group id:%ld differs from its input nodes' group id:%ld ", name.c_str(), type.c_str(), group_id, input_group_id); - return INTERNAL_ERROR; } } } diff --git a/src/ge/graph/label/label_maker.cc
b/src/ge/graph/label/label_maker.cc index 9ab6824c..bf8949f0 100644 --- a/src/ge/graph/label/label_maker.cc +++ b/src/ge/graph/label/label_maker.cc @@ -172,7 +172,7 @@ NodePtr LabelMaker::AddLabelSetLeave(const ComputeGraphPtr &graph, const std::st GELOGI("LabelSet: Create node %s.", op_desc->GetName().c_str()); (void)AttrUtils::SetInt(op_desc, ATTR_NAME_LABEL_SWITCH_INDEX, index); - NodePtr label_set = graph->AddNodeFront(op_desc); + NodePtr label_set = graph->AddNode(op_desc); GE_CHECK_NOTNULL_EXEC(label_set, return nullptr); // Link control edge to graph tail. @@ -202,7 +202,7 @@ NodePtr LabelMaker::AddLabelGotoEnter(const ComputeGraphPtr &graph, const std::s return nullptr; } - OpDescPtr op_desc = MakeShared(name, LABELGOTO); + OpDescPtr op_desc = MakeShared(name, LABELGOTOEX); GE_CHECK_NOTNULL_EXEC(op_desc, return nullptr); SetStreamIdEnter(graph, op_desc); @@ -238,7 +238,7 @@ NodePtr LabelMaker::AddLabelGotoLeave(const ComputeGraphPtr &graph, const std::s const NodePtr &node = *it; GE_CHECK_NOTNULL_EXEC(node, return nullptr); - OpDescPtr op_desc = MakeShared(name, LABELGOTO); + OpDescPtr op_desc = MakeShared(name, LABELGOTOEX); GE_CHECK_NOTNULL_EXEC(op_desc, return nullptr); SetStreamIdLeave(graph, op_desc); @@ -366,6 +366,7 @@ NodePtr LabelMaker::AddLabelSwitchIndex(const ComputeGraphPtr &graph, const std: OpDescPtr op_desc = MakeShared(name, DATA); GE_CHECK_NOTNULL_EXEC(op_desc, return nullptr); + op_desc->SetStreamId(kInvalidStreamId); GELOGI("Data: Create node %s.", op_desc->GetName().c_str()); if (op_desc->AddOutputDesc(desc) != GRAPH_SUCCESS) { diff --git a/src/ge/graph/load/new_model_manager/cpu_queue_schedule.cc b/src/ge/graph/load/new_model_manager/cpu_queue_schedule.cc index c3de44c9..9b3c7a0f 100644 --- a/src/ge/graph/load/new_model_manager/cpu_queue_schedule.cc +++ b/src/ge/graph/load/new_model_manager/cpu_queue_schedule.cc @@ -20,11 +20,11 @@ namespace { const uint32_t kCoreDim = 1; // for rtCpuKernelLaunch const char *const kCpuTaskModelEnqueue = "modelEnqueue"; -const char *const kCpuTaskPrepareInput = "modelPrepareInput"; const char *const kCpuTaskWaitEndGraph = "modelWaitEndGraph"; -const char *const kCpuTaskPrepareOutput = "modelPrepareOutput"; +const char *const kCpuTaskPrepareOutput = "bufferPrepareOutput"; const char *const kCpuTaskModelDequeue = "modelDequeue"; const char *const kCpuTaskModelRepeat = "modelRepeat"; +const char *const kCpuTaskZeroCopy = "zeroCpy"; } // namespace namespace ge { @@ -93,19 +93,19 @@ Status CpuTaskModelDequeue::Distribute() { /// /// @ingroup ge -/// @brief definiteness queue schedule, bind output queue to task. -/// @param [in] addr: NetOutput Op input tensor address. -/// @param [in] size: NetOutput Op input tensor size. -/// @param [in] in_mbuf: input mbuf addr for input data. +/// @brief definiteness queue schedule, zero copy. +/// @param [in] mbuf_list: input/output mbuf addr list for input/output data. 
+/// @param [in] outside_addrs: model input/output memory addr /// @return: 0 for success / others for failed /// -Status CpuTaskPrepareInput::Init(uintptr_t addr, uint32_t size, uintptr_t in_mbuf) { +Status CpuTaskZeroCopy::Init(std::vector &mbuf_list, + std::map> &outside_addrs) { if ((args_ != nullptr) || (args_size_ > 0)) { GELOGE(FAILED, "Task already initialized, size: %u", args_size_); return FAILED; } - args_size_ = sizeof(PrepareInputInfo); + args_size_ = sizeof(AddrMapInfo); rtError_t status = rtMalloc(&args_, args_size_, RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); @@ -113,36 +113,99 @@ Status CpuTaskPrepareInput::Init(uintptr_t addr, uint32_t size, uintptr_t in_mbu } GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "args data.", args_size_) - PrepareInputInfo prepare; - prepare.in_mbuf = in_mbuf; - prepare.mbuf_offset = 0; - prepare.data_size = size; - prepare.data_addr = addr; - status = rtMemcpy(args_, args_size_, &prepare, args_size_, RT_MEMCPY_HOST_TO_DEVICE); + AddrMapInfo addr_map_info; + for (const auto &addrs : outside_addrs) { + addr_map_info.addr_num += addrs.second.size(); + } + GELOGI("addr_map_info.addr_num is %u", addr_map_info.addr_num); + + // init src_addrs/dst_addrs + size_t index = 0; + vector src_addrs; + vector dst_addrs; + for (const auto &addrs : outside_addrs) { + for (size_t i = 0; i < addrs.second.size(); ++i) { + src_addrs.push_back(mbuf_list.at(index)); + dst_addrs.push_back(reinterpret_cast(addrs.second.at(i))); + } + index++; + } + + // malloc mem for src_addrs/dst_addrs, and copy data of src_addrs/dst_addrs + status = rtMalloc(&src_addr_, src_addrs.size() * sizeof(uint64_t), RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); + return RT_FAILED; + } + status = rtMemcpy(src_addr_, src_addrs.size() * sizeof(uint64_t), src_addrs.data(), + src_addrs.size() * sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE); if (status != RT_ERROR_NONE) { GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); return RT_FAILED; } + status = rtMalloc(&dst_addr_, dst_addrs.size() * sizeof(uint64_t), RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); + return RT_FAILED; + } + status = rtMemcpy(dst_addr_, dst_addrs.size() * sizeof(uint64_t), dst_addrs.data(), + dst_addrs.size() * sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); + return RT_FAILED; + } + + // src_addr_list is initialized to src_addr_, which points to src_addrs + if (!src_addrs.empty() && !dst_addrs.empty()) { + addr_map_info.src_addr_list = reinterpret_cast(src_addr_); + addr_map_info.dst_addr_list = reinterpret_cast(dst_addr_); + GELOGI("src_addr_list is %lu, dst_addr_list is %lu", addr_map_info.src_addr_list, addr_map_info.dst_addr_list); + } + + status = rtMemcpy(args_, args_size_, &addr_map_info, sizeof(AddrMapInfo), RT_MEMCPY_HOST_TO_DEVICE); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); + return RT_FAILED; + } return SUCCESS; } -Status CpuTaskPrepareInput::Distribute() { +Status CpuTaskZeroCopy::Distribute() { if ((args_ == nullptr) || (args_size_ == 0) || (stream_ == nullptr)) { GELOGE(FAILED, "Task not initialized, distribute failed, size: %u", args_size_); return FAILED; } - rtError_t status = rtCpuKernelLaunch(nullptr, kCpuTaskPrepareInput, kCoreDim, args_,
args_size_, nullptr, stream_); + rtError_t status = rtCpuKernelLaunch(nullptr, kCpuTaskZeroCopy, kCoreDim, args_, args_size_, nullptr, stream_); if (status != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Call rt CpuKernelLaunch PrepareInput failed, status: 0x%X", status); + GELOGE(RT_FAILED, "Call rt CpuKernelLaunch ZeroCopy failed, status: 0x%X", status); return RT_FAILED; } - GELOGI("Cpu kernel launch prepare input task success."); + GELOGI("Cpu kernel launch zero copy task success."); return SUCCESS; } +CpuTaskZeroCopy::~CpuTaskZeroCopy() { + if (src_addr_ == nullptr && dst_addr_ == nullptr) { + return; + } + if (src_addr_ != nullptr) { + rtError_t status = rtFree(src_addr_); + if (status != RT_ERROR_NONE) { + GELOGW("Call rt free failed, status: 0x%x", status); + } + } + if (dst_addr_ != nullptr) { + rtError_t status = rtFree(dst_addr_); + if (status != RT_ERROR_NONE) { + GELOGW("Call rt free failed, status: 0x%x", status); + } + } + src_addr_ = nullptr; + dst_addr_ = nullptr; +} /// /// @ingroup ge /// @brief definiteness queue schedule, bind output queue to task. diff --git a/src/ge/graph/load/new_model_manager/cpu_queue_schedule.h b/src/ge/graph/load/new_model_manager/cpu_queue_schedule.h index 8a9af63f..c4ae4df5 100644 --- a/src/ge/graph/load/new_model_manager/cpu_queue_schedule.h +++ b/src/ge/graph/load/new_model_manager/cpu_queue_schedule.h @@ -47,6 +47,13 @@ struct PrepareOutputInfo { uintptr_t out_mbuf; // output mbuf addr }; +// For AICPU task "modelZeroCopy" +struct AddrMapInfo { + uint32_t addr_num = 0; + uint64_t src_addr_list; + uint64_t dst_addr_list; +}; + /// /// @ingroup ge /// @brief CpuTask base, inherit from TaskInfo used for manage. @@ -78,17 +85,21 @@ class CpuTaskModelDequeue : public CpuTaskInfo { /// /// @ingroup ge -/// @brief definiteness queue schedule, bind output queue to task. +/// @brief definiteness queue schedule, zero copy. 
/// -class CpuTaskPrepareInput : public CpuTaskInfo { +class CpuTaskZeroCopy : public CpuTaskInfo { public: - explicit CpuTaskPrepareInput(rtStream_t stream) : CpuTaskInfo(stream) {} - ~CpuTaskPrepareInput() override {} + explicit CpuTaskZeroCopy(rtStream_t stream) : CpuTaskInfo(stream) {} + ~CpuTaskZeroCopy() override; Status Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) override { return SUCCESS; } - Status Init(uintptr_t addr, uint32_t size, uintptr_t in_mbuf); + Status Init(std::vector &mbuf_list, std::map> &outside_addrs); Status Distribute() override; + + private: + void *src_addr_ = nullptr; + void *dst_addr_ = nullptr; }; /// diff --git a/src/ge/graph/load/new_model_manager/davinci_model.cc b/src/ge/graph/load/new_model_manager/davinci_model.cc index 7b743f3c..19c0ab16 100644 --- a/src/ge/graph/load/new_model_manager/davinci_model.cc +++ b/src/ge/graph/load/new_model_manager/davinci_model.cc @@ -78,6 +78,7 @@ namespace { const uint32_t kDataIndex = 0; const uint32_t kTrueBranchStreamNum = 1; const uint32_t kThreadNum = 16; +const uint32_t kAddrLen = sizeof(void *); const int kDecimal = 10; const int kBytes = 8; const uint32_t kDataMemAlignSizeCompare = 64; @@ -100,17 +101,20 @@ class RtContextSwitchGuard { ret = rtCtxSetCurrent(current_); if (ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Failed to switch context to normal, device %u", device_id); + GELOGE(RT_FAILED, "Failed to switch context to normal, context %p, device %u", current_, device_id); return; } + GELOGD("Create and switch rt context %p type %d for device %u, backup last %p.", current_, mode, device_id, last_); } ~RtContextSwitchGuard() { if (current_ != nullptr) { auto ret = rtCtxDestroy(current_); + GELOGD("Destroy current context %p result %d", current_, ret); } if (last_ != nullptr) { auto ret = rtCtxSetCurrent(last_); + GELOGD("Restore last context %p result %d.", last_, ret); } } @@ -149,7 +153,10 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt uint8_t *var_addr = VarManager::Instance(session_id)->GetVarMemoryAddr(var_logic, RT_MEMORY_HBM); if (var_addr == nullptr) { - GELOGE(INTERNAL_ERROR, "Failed to copy var %s from device, cant not get var addr", var->GetName().c_str()); + GELOGE(INTERNAL_ERROR, + "Failed to copy var %s from device, can not get " + "var addr from logic addr %p", + var->GetName().c_str(), var_logic); return INTERNAL_ERROR; } @@ -177,6 +184,8 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt GELOGD("Copy var %s from device to host, size %ld", var->GetName().c_str(), var_size_bytes); var_data.swap(var_host); + GELOGI("var_logic:%p, var_addr:%p", var_logic, var_addr); + return SUCCESS; } @@ -230,7 +239,7 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats } } else if (trans_info.node_type == CAST) { auto input_shape = trans_info.input.GetShape(); - auto src_data_size = input_shape.GetShapeSize(); + auto src_data_size = input_shape.GetShapeSize() == 0 ?
1 : input_shape.GetShapeSize(); auto src_data_type = trans_info.input.GetDataType(); auto dst_data_type = trans_info.output.GetDataType(); GELOGD("Trans data type from %s to %s, input shape %s, data size %ld", @@ -284,6 +293,8 @@ Status ReAssignVarAddr(uint64_t session_id, const std::string &var_name, const G } *var_device = var_addr; + GELOGI("var_logic:%p, var_addr:%p", var_logic, var_addr); + return SUCCESS; } @@ -399,10 +410,10 @@ DavinciModel::DavinciModel(int32_t priority, const std::shared_ptrGetVarMemoryBase(RT_MEMORY_HBM); + GELOGI("[IMAS]InitModelMem graph_%u MallocMemory type[V] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, + var_mem_base_, TotalVarMemSize()); } runtime_param_.mem_base = mem_base_; @@ -631,19 +648,16 @@ Status DavinciModel::DoTaskSink() { if (model_task_def_) { GELOGI("do task_sink."); - // will adjust stream indication, load fist. + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(InitTaskInfo(*model_task_def_.get()) != SUCCESS, return FAILED, + "InitTaskInfo failed."); GE_CHK_STATUS_RET(LoadWithQueue(), "LoadWithQueue failed."); - + // will adjust stream indication, load first. for (size_t i = 0; i < stream_list_.size(); i++) { GE_IF_BOOL_EXEC(active_stream_indication_.count(i) > 0, GELOGI("rtModelBindStream[%zu]", i); GE_CHK_RT_RET(rtModelBindStream(rt_model_handle_, stream_list_[i], RT_INVALID_FLAG)); continue;); // bind rt_model_handel to all streams that relates to op - GE_CHK_RT_RET(rtModelBindStream(rt_model_handle_, stream_list_[i], 0)); + GE_CHK_RT_RET(rtModelBindStream(rt_model_handle_, stream_list_[i], RT_HEAD_STREAM)); } - - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(InitTaskInfo(*model_task_def_.get()) != SUCCESS, return FAILED, - "InitTaskInfo failed."); - GE_CHK_STATUS_RET(DistributeTask(), "Distribute failed."); GE_CHK_RT_RET(rtModelLoadComplete(rt_model_handle_)); @@ -715,6 +729,7 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size GE_DISMISS_GUARD(stream); stream_list_.push_back(stream); + GELOGD("Stream index:%u, stream:%p.", i, stream); } for (uint32_t i = 0; i < EventNum(); i++) { @@ -723,12 +738,7 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size event_list_.push_back(rt_event); } - for (uint32_t i = 0; i < LabelNum(); i++) { - rtLabel_t rt_label; - GE_CHK_RT_RET(rtLabelCreate(&rt_label)); - GE_CHK_BOOL_RET_STATUS(rt_label != nullptr, FAILED, "rt_label is nullptr."); - label_list_.push_back(rt_label); - } + label_list_.resize(LabelNum(), nullptr); // create model_handle to load model GE_CHK_RT_RET(rtModelCreate(&rt_model_handle_, 0)); @@ -803,11 +813,17 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size /// Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { uint32_t data_op_index = 0; - std::map> input_data_info; - GE_TIMESTAMP_CALLNUM_START(LoadTBEKernelBinToOpDesc); GE_TIMESTAMP_CALLNUM_START(InitTbeHandle); + typedef Status (DavinciModel::*OpDescCall)(const OpDescPtr &); + static std::map op_desc_handle = { + {VARIABLE, &DavinciModel::InitVariable}, {CONSTANTOP, &DavinciModel::InitConstant}, + {NETOUTPUT, &DavinciModel::InitNetOutput}, {ENDGRAPH, &DavinciModel::InitEndGraph}, + {STREAMACTIVE, &DavinciModel::InitStreamActive}, {STREAMSWITCH, &DavinciModel::InitStreamSwitch}, + {STREAMSWITCHN, &DavinciModel::InitStreamSwitchN}, {LABELSET, &DavinciModel::InitLabelSet}, + }; + auto nodes = compute_graph->GetAllNodes(); const TBEKernelStore &tbekernel_store = ge_model_->GetTBEKernelStore(); for (size_t i = 0; i < nodes.size(); i++) { @@ -825,7
+841,7 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { GE_TIMESTAMP_ADD(LoadTBEKernelBinToOpDesc); if (IsDataOp(op_desc->GetType())) { - if (InitDataOp(node, data_op_index, input_data_info) != SUCCESS) { + if (InitDataOp(node, data_op_index) != SUCCESS) { GELOGE(PARAM_INVALID, "Data init failed, Name: %s", op_desc->GetName().c_str()); return PARAM_INVALID; } @@ -839,32 +855,15 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { continue; } - if (op_desc->GetType() == VARIABLE) { - variable_op_list_.push_back(op_desc); - continue; - } - - if (op_desc->GetType() == NETOUTPUT) { - if (InitNetOutput(op_desc) != SUCCESS) { + auto it = op_desc_handle.find(op_desc->GetType()); + if (it != op_desc_handle.end()) { + if ((this->*it->second)(op_desc) != SUCCESS) { GELOGE(PARAM_INVALID, "NetOutput init failed, Name: %s", op_desc->GetName().c_str()); return PARAM_INVALID; } continue; } - // Initialize constant op, only applies to training, ignoring inference constant op - if (op_desc->GetType() == CONSTANTOP) { - if (InitConstant(op_desc) != SUCCESS) { - GELOGE(PARAM_INVALID, "Constant init failed. %s", op_desc->GetName().c_str()); - return PARAM_INVALID; - } - continue; - } - - if (op_desc->GetType() == ENDGRAPH) { - end_graph_op_ = op_desc; - } - GE_TIMESTAMP_RESTART(InitTbeHandle); uint32_t run_mode = static_cast(domi::ImplyType::INVALID); if (AttrUtils::GetInt(op_desc, ATTR_NAME_IMPLY_TYPE, run_mode) && @@ -883,17 +882,11 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { } } GE_TIMESTAMP_ADD(InitTbeHandle); - - if (MarkActiveStream(op_desc) != SUCCESS) { - GELOGE(PARAM_INVALID, "MarkActiveStream failed, node:%s, opIndex:%zu", op_desc->GetName().c_str(), i); - return PARAM_INVALID; - } } - Status ret = CombineDataInfo(input_data_info); GE_TIMESTAMP_CALLNUM_END(LoadTBEKernelBinToOpDesc, "GraphLoader::LoadTBEKernelBinToOpDesc."); GE_TIMESTAMP_CALLNUM_END(InitTbeHandle, "GraphLoader::InitTbeHandle."); - return ret; + return SUCCESS; } /// @ingroup ge @@ -902,8 +895,7 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { /// @param [in/out] data_op_index: NetOutput addr size info. /// @param [in/out] input_data_info: Data index and addr info {index, {size, addr}}. /// @return Status -Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index, - std::map> &input_data_info) { +Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index) { // op_desc Checked by Init: Data, valid. auto op_desc = node->GetOpDesc(); uint32_t parent_index = 0; // Ignore subgraph Data Node. @@ -925,20 +917,20 @@ Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index, // Make information for copy input data. 
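The InitNodes hunk above replaces a chain of per-type if blocks with `op_desc_handle`, a lookup table from op type to a member-function handler invoked via `(this->*it->second)(op_desc)`. A minimal, self-contained sketch of that dispatch pattern follows; the class, handler names, and type strings are illustrative stand-ins, not the GE definitions:

```cpp
#include <iostream>
#include <map>
#include <string>

class NodeIniter {
 public:
  int Init(const std::string &type) {
    using Handler = int (NodeIniter::*)();
    // Type string -> member-function handler, mirroring op_desc_handle.
    static const std::map<std::string, Handler> kHandlers = {
        {"Variable", &NodeIniter::InitVariable},
        {"NetOutput", &NodeIniter::InitNetOutput},
    };
    auto it = kHandlers.find(type);
    if (it == kHandlers.end()) {
      return 0;  // types without a dedicated handler use the common path
    }
    return (this->*it->second)();  // call through the member pointer
  }

 private:
  int InitVariable() { std::cout << "init variable\n"; return 0; }
  int InitNetOutput() { std::cout << "init netoutput\n"; return 0; }
};

int main() {
  NodeIniter initer;
  return initer.Init("Variable");
}
```

The table keeps the main loop flat: supporting a new op type means registering one more entry instead of adding another branch.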
const vector output_size_list = ModelUtils::GetOutputSize(op_desc); - const vector output_addr_list = ModelUtils::GetOutputDataAddrs(runtime_param_, op_desc); - if (output_size_list.empty() || output_addr_list.empty() || (output_size_list.size() != output_addr_list.size())) { + const vector virtual_addr_list = ModelUtils::GetOutputDataAddrs(runtime_param_, op_desc, false); + if (output_size_list.empty() || virtual_addr_list.empty() || (output_size_list.size() != virtual_addr_list.size())) { GELOGE(PARAM_INVALID, "Data[%s] init failed: Output size is %zu, Output addr is %zu", op_desc->GetName().c_str(), - output_size_list.size(), output_addr_list.size()); + output_size_list.size(), virtual_addr_list.size()); return PARAM_INVALID; } auto data_index = data_op_index; if (AttrUtils::GetInt(op_desc, ATTR_NAME_INDEX, data_index)) { - GELOGI("ge_train:get new index %u, old %u", data_index, data_op_index); + GELOGI("ge_train: get new index %u, old %u", data_index, data_op_index); } - input_data_info[data_index] = {output_size_list[kDataIndex], output_addr_list[kDataIndex]}; - SetInputOutsideAddr(output_addr_list); + input_data_info_[data_index] = {output_size_list[kDataIndex], virtual_addr_list[kDataIndex]}; + SetInputOutsideAddr(virtual_addr_list); data_op_index++; if (InitInputZeroCopyNodes(node) != SUCCESS) { GELOGE(PARAM_INVALID, "Input zero copy nodes init failed!"); @@ -1001,43 +993,78 @@ Status DavinciModel::InitNetOutput(const OpDescPtr &op_desc) { // Make information for copy output data. const vector input_size_list = ModelUtils::GetInputSize(op_desc); - const vector input_addr_list = ModelUtils::GetInputDataAddrs(runtime_param_, op_desc); - if (input_size_list.empty() && input_addr_list.empty()) { + const vector virtual_addr_list = ModelUtils::GetInputDataAddrs(runtime_param_, op_desc, false); + if (input_size_list.empty() && virtual_addr_list.empty()) { GELOGI("NetOutput[%s] is empty.", op_desc->GetName().c_str()); return SUCCESS; } - if (input_size_list.empty() || input_size_list.size() != input_addr_list.size() || + if (input_size_list.empty() || input_size_list.size() != virtual_addr_list.size() || input_size_list.size() != output_size_list.size()) { GELOGE(PARAM_INVALID, "NetOutput[%s] init failed: Input size is %zu, Input addr is %zu, Output size is %zu", - op_desc->GetName().c_str(), input_size_list.size(), input_addr_list.size(), output_size_list.size()); + op_desc->GetName().c_str(), input_size_list.size(), virtual_addr_list.size(), output_size_list.size()); return PARAM_INVALID; } - output_size_list_.insert(output_size_list_.end(), input_size_list.begin(), input_size_list.end()); - output_addr_list_.insert(output_addr_list_.end(), input_addr_list.begin(), input_addr_list.end()); - SetOutputOutsideAddr(input_addr_list); + size_t num = output_data_info_.size(); + for (size_t idx = 0; idx < input_size_list.size(); ++idx) { + output_data_info_[num + idx] = {input_size_list[idx], virtual_addr_list[idx]}; + } + + SetOutputOutsideAddr(virtual_addr_list); return SUCCESS; } /// @ingroup ge -/// @brief Make Input and Output addr for feature use. -/// @param [in] input_data_info: Data index and addr info {index, {size, addr}}. +/// @brief LabelSet Op Initialize. +/// @param [in] op_desc: LabelSet Op descriptor. 
/// @return Status -Status DavinciModel::CombineDataInfo(const std::map> &input_data_info) { - input_size_list_.resize(data_op_list_.size()); - input_addr_list_.resize(data_op_list_.size()); - for (size_t index = 0; index < data_op_list_.size(); ++index) { - auto it = input_data_info.find(index); - if (it == input_data_info.end()) { - GELOGE(PARAM_INVALID, "Data init failed: index %zu, Data Op size is %zu, Input addr is %zu", index, - data_op_list_.size(), input_data_info.size()); - return INTERNAL_ERROR; - } - input_size_list_[index] = it->second.first; - input_addr_list_[index] = it->second.second; +Status DavinciModel::InitLabelSet(const OpDescPtr &op_desc) { + uint32_t label_index = 0; + if (!AttrUtils::GetInt(op_desc, ATTR_NAME_LABEL_SWITCH_INDEX, label_index)) { + GELOGE(INTERNAL_ERROR, "InitLabelSet: %s attr [%s] not exist.", op_desc->GetName().c_str(), + ATTR_NAME_LABEL_SWITCH_INDEX.c_str()); + return INTERNAL_ERROR; } + if (label_index >= LabelNum()) { + GELOGE(INTERNAL_ERROR, "InitLabelSet: label index: %u >= label size: %zu.", label_index, LabelNum()); + return INTERNAL_ERROR; + } + if (label_id_indication_.count(label_index) > 0) { + GELOGE(INTERNAL_ERROR, "InitLabelSet: %s label index: %u already used.", op_desc->GetName().c_str(), label_index); + return INTERNAL_ERROR; + } + + rtStream_t stream = nullptr; + uint32_t stream_id = static_cast(op_desc->GetStreamId()); + if (stream_list_.size() == 1) { + stream = stream_list_[0]; + } else if (stream_list_.size() > stream_id) { + stream = stream_list_[stream_id]; + } else { + GELOGE(INTERNAL_ERROR, "InitLabelSet: stream index: %u >= stream size: %zu.", stream_id, stream_list_.size()); + return INTERNAL_ERROR; + } + + rtLabel_t rt_label = nullptr; + rtError_t rt_error = rtLabelCreate(&rt_label); + if (rt_error != RT_ERROR_NONE || rt_label == nullptr) { + GELOGE(INTERNAL_ERROR, "InitLabelSet: %s create label failed, error=0x%x.", op_desc->GetName().c_str(), rt_error); + return INTERNAL_ERROR; + } + + GELOGI("InitLabelSet: label[%u]=%p stream[%u]=%p.", label_index, rt_label, stream_id, stream); + label_id_indication_.insert(label_index); + label_list_[label_index] = rt_label; + return SUCCESS; +} - GELOGI("Data init success, input size %zu, output size %zu", input_size_list_.size(), output_size_list_.size()); +Status DavinciModel::InitVariable(const OpDescPtr &op_desc) { + variable_op_list_.push_back(op_desc); + return SUCCESS; +} + +Status DavinciModel::InitEndGraph(const OpDescPtr &op_desc) { + end_graph_op_ = op_desc; return SUCCESS; } @@ -1070,31 +1097,34 @@ Status DavinciModel::LoadWithQueue() { return SUCCESS; } - if (input_queue_ids_.size() != data_op_list_.size()) { + if (input_queue_ids_.size() != input_data_info_.size()) { GELOGE(PARAM_INVALID, "Input queue ids not match model: input_queue=%zu input_data=%zu", input_queue_ids_.size(), - data_op_list_.size()); + input_data_info_.size()); return PARAM_INVALID; } - if (output_queue_ids_.size() != output_size_list_.size()) { + if (output_queue_ids_.size() != output_data_info_.size()) { GELOGE(PARAM_INVALID, "Output queue ids not match model: output_queue=%zu output_data=%zu", - output_queue_ids_.size(), output_size_list_.size()); + output_queue_ids_.size(), output_data_info_.size()); return PARAM_INVALID; } // create stream instance which rt_model_handel is running on, this is S0. 
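InitLabelSet above pairs with the `label_list_.resize(LabelNum(), nullptr)` change in Init: labels are no longer bulk-created up front; each LabelSet op claims and fills exactly one pre-sized slot, with `label_id_indication_` guarding against out-of-range and duplicate indexes. A rough sketch of that slot-claiming pattern, assuming the runtime label handle can be modeled as an opaque pointer (`LabelTable` and `Claim` are hypothetical names, not GE APIs):

```cpp
#include <cstdint>
#include <cstdio>
#include <set>
#include <vector>

using rtLabel_t = void *;  // stand-in for the runtime's label handle type

struct LabelTable {
  std::vector<rtLabel_t> labels;  // pre-sized; nullptr means "not created yet"
  std::set<uint32_t> used;        // mirrors label_id_indication_

  explicit LabelTable(size_t num) : labels(num, nullptr) {}

  bool Claim(uint32_t index, rtLabel_t label) {
    if (index >= labels.size()) {
      return false;  // index out of range, like the LabelNum() check
    }
    if (!used.insert(index).second) {
      return false;  // index already claimed by another LabelSet op
    }
    labels[index] = label;
    return true;
  }
};

int main() {
  LabelTable table(4);
  int dummy = 0;
  std::printf("first claim: %d\n", table.Claim(1, &dummy));   // prints 1
  std::printf("second claim: %d\n", table.Claim(1, &dummy));  // prints 0
}
```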
GE_CHK_RT_RET(rtStreamCreateWithFlags(&rt_model_stream_, priority_, RT_STREAM_AICPU)); is_inner_model_stream_ = true; - GE_CHK_RT_RET(rtModelBindStream(rt_model_handle_, rt_model_stream_, 0)); + GE_CHK_RT_RET(rtModelBindStream(rt_model_handle_, rt_model_stream_, RT_HEAD_STREAM)); // Binding input_queue and Data Op. GE_CHK_STATUS_RET(BindInputQueue(), "Launch bind input queue failed."); - - GE_CHK_STATUS_RET(BindActiveStream(), "Launch active entry stream failed."); - GE_CHK_STATUS_RET(CpuWaitEndGraph(), "Launch wait end graph failed."); + GE_CHK_STATUS_RET(CpuTaskModelZeroCopy(input_mbuf_list_, input_outside_addrs_), "Launch zero copy failed."); // Binding output_queue and NetOutput Op. GE_CHK_STATUS_RET(BindOutputQueue(), "Launch bind output queue failed."); + GE_CHK_STATUS_RET(CpuTaskModelZeroCopy(output_mbuf_list_, output_outside_addrs_), "Launch zero copy failed."); + + GE_CHK_STATUS_RET(BindActiveStream(), "Launch active entry stream failed."); + GE_CHK_STATUS_RET(CpuWaitEndGraph(), "Launch wait end graph failed."); + GE_CHK_STATUS_RET(BindEnqueue(), "Launch enqueue failed.") GE_CHK_STATUS_RET(CpuModelRepeat(), "Launch model repeat failed."); return SUCCESS; @@ -1106,9 +1136,15 @@ Status DavinciModel::LoadWithQueue() { Status DavinciModel::BindInputQueue() { // Caller checked: input_queue_ids_.size() == input_size_list_.size() != input_addr_list_.size() for (size_t i = 0; i < input_queue_ids_.size(); ++i) { + auto it = input_data_info_.find(i); + if (it == input_data_info_.end()) { + GELOGE(FAILED, "Input not match: tensor num=%zu, Queue id index=%zu", input_data_info_.size(), i); + return FAILED; + } + uint32_t queue_id = input_queue_ids_[i]; - uint32_t data_size = input_size_list_[i]; - uintptr_t data_addr = reinterpret_cast(input_addr_list_[i]); + uint32_t data_size = static_cast(it->second.first); + uintptr_t data_addr = reinterpret_cast(it->second.second); GELOGI("BindInputToQueue: graph_%u index[%zu] queue id[%u] output addr[0x%lx] output size[%u]", runtime_param_.graph_id, i, queue_id, data_addr, data_size); @@ -1116,7 +1152,7 @@ Status DavinciModel::BindInputQueue() { return INTERNAL_ERROR; } - if (CpuModelDequeue(queue_id, data_addr, data_size) != SUCCESS) { + if (CpuModelDequeue(queue_id) != SUCCESS) { return INTERNAL_ERROR; } } @@ -1125,57 +1161,12 @@ Status DavinciModel::BindInputQueue() { } /// @ingroup ge -/// @brief queue schedule, bind output queue to NetOutput input address. -/// @return: 0 for success / others for failed -Status DavinciModel::BindOutputQueue() { - // Caller checked: input_queue_ids_.size() == input_size_list_.size() != input_addr_list_.size() - for (size_t i = 0; i < output_queue_ids_.size(); ++i) { - uint32_t queue_id = output_queue_ids_[i]; - uint32_t data_size = output_size_list_[i]; - uintptr_t data_addr = reinterpret_cast(output_addr_list_[i]); - GELOGI("BindOutputToQueue: graph_%u index[%zu] queue id[%u] input addr[0x%lx] input size[%u]", - runtime_param_.graph_id, i, queue_id, data_addr, data_size); - - if (rtModelBindQueue(rt_model_handle_, queue_id, RT_MODEL_OUTPUT_QUEUE) != RT_ERROR_NONE) { - return INTERNAL_ERROR; - } - - if (CpuModelEnqueue(queue_id, data_addr, data_size) != SUCCESS) { - return INTERNAL_ERROR; - } - } - - return SUCCESS; -} - -/// @ingroup ge -/// @brief queue schedule, active stream will schedule by S0. -/// @return: 0 for success / others for failed -Status DavinciModel::BindActiveStream() { - // Stream not in active_stream_indication_ is active stream. 
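LoadWithQueue now schedules a `CpuTaskZeroCopy` right after each bind step, and CpuTaskZeroCopy::Init (earlier in this patch) flattens `outside_addrs`, a map from each model tensor address to every task-argument slot that references it, together with the per-tensor mbuf list into two parallel address arrays the AICPU kernel walks pairwise. A host-side sketch of just that flattening step, with the rtMalloc/rtMemcpy staging omitted and illustrative tensor/slot variables:

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

int main() {
  // One mbuf per model tensor, consumed in the map's iteration order.
  std::vector<uint64_t> mbuf_list = {0x1000, 0x2000};

  // Model tensor addr -> every task-arg slot that references that tensor.
  int t0 = 0, t1 = 0;
  int slot_a = 0, slot_b = 0, slot_c = 0;
  std::map<const void *, std::vector<void *>> outside_addrs = {
      {&t0, {&slot_a, &slot_b}},  // first tensor referenced from two slots
      {&t1, {&slot_c}},
  };

  // Flatten into parallel src/dst arrays: one pair per reference.
  std::vector<uint64_t> src_addrs;
  std::vector<uint64_t> dst_addrs;
  size_t index = 0;
  for (const auto &addrs : outside_addrs) {
    for (void *slot : addrs.second) {
      src_addrs.push_back(mbuf_list.at(index));  // mbuf feeding this tensor
      dst_addrs.push_back(reinterpret_cast<uint64_t>(slot));
    }
    ++index;
  }
  std::cout << "address pairs: " << src_addrs.size() << '\n';  // prints 3
}
```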
- std::vector active_stream_list; - for (size_t i = 0; i < stream_list_.size(); ++i) { - if (active_stream_indication_.count(i) == 0) { - active_stream_list.push_back(stream_list_[i]); - active_stream_indication_.insert(i); // deactive all model stream. - } - } - - // Active stream add to active entry, will active by S0. - if (CpuActiveStream(active_stream_list) != SUCCESS) { - return INTERNAL_ERROR; - } - - return SUCCESS; -} - -/// @ingroup ge /// @brief definiteness queue schedule, bind input queue to task. /// @param [in] queue_id: input queue id from user. /// @param [in] addr: Data Op output tensor address. /// @param [in] size: Data Op output tensor size. /// @return: 0 for success / others for failed -Status DavinciModel::CpuModelDequeue(uint32_t queue_id, uintptr_t addr, uint32_t size) { +Status DavinciModel::CpuModelDequeue(uint32_t queue_id) { GELOGI("Set CpuKernel model dequeue task enter."); std::shared_ptr dequeue_task = MakeShared(rt_model_stream_); if (dequeue_task == nullptr) { @@ -1189,20 +1180,55 @@ Status DavinciModel::CpuModelDequeue(uint32_t queue_id, uintptr_t addr, uint32_t return FAILED; } - std::shared_ptr prepare_input = MakeShared(rt_model_stream_); - if (dequeue_task == nullptr) { - GELOGE(FAILED, "Make CpuTaskPrepareInput task failed."); + cpu_task_list_.push_back(dequeue_task); + input_mbuf_list_.push_back(in_mbuf); + GELOGI("Set CpuKernel model dequeue task success."); + return SUCCESS; +} + +Status DavinciModel::CpuTaskModelZeroCopy(std::vector &mbuf_list, + std::map> &outside_addrs) { + GELOGI("Set CpuKernel model zero_copy task enter."); + std::shared_ptr zero_copy = MakeShared(rt_model_stream_); + if (zero_copy == nullptr) { + GELOGE(FAILED, "Make CpuTaskZeroCopy task failed."); return FAILED; } - if (prepare_input->Init(addr, size, in_mbuf) != SUCCESS) { + if (zero_copy->Init(mbuf_list, outside_addrs) != SUCCESS) { return FAILED; } + cpu_task_list_.push_back(zero_copy); + GELOGI("Set CpuKernel model zero_copy task success."); + return SUCCESS; +} + +/// @ingroup ge +/// @brief queue schedule, bind output queue to NetOutput input address. +/// @return: 0 for success / others for failed +Status DavinciModel::BindOutputQueue() { + // Caller checked: input_queue_ids_.size() == input_size_list_.size() != input_addr_list_.size() + for (size_t i = 0; i < output_queue_ids_.size(); ++i) { + auto it = output_data_info_.find(i); + if (it == output_data_info_.end()) { + GELOGE(FAILED, "Output not match: tensor num=%zu, Queue id index=%zu", output_data_info_.size(), i); + return FAILED; + } + + uint32_t queue_id = output_queue_ids_[i]; + uint32_t data_size = static_cast(it->second.first); + uintptr_t data_addr = reinterpret_cast(it->second.second); + GELOGI("BindOutputToQueue: graph_%u index[%zu] queue id[%u] input addr[0x%lx] input size[%u]", + runtime_param_.graph_id, i, queue_id, data_addr, data_size); + + if (rtModelBindQueue(rt_model_handle_, queue_id, RT_MODEL_OUTPUT_QUEUE) != RT_ERROR_NONE) { + return INTERNAL_ERROR; + } + if (CpuModelPrepareOutput(data_addr, data_size) != SUCCESS) { + return INTERNAL_ERROR; + } + } - cpu_task_list_.push_back(dequeue_task); - cpu_task_list_.push_back(prepare_input); - input_mbuf_list_.push_back(in_mbuf); - GELOGI("Set CpuKernel model dequeue task success."); return SUCCESS; } @@ -1212,7 +1238,7 @@ Status DavinciModel::CpuModelDequeue(uint32_t queue_id, uintptr_t addr, uint32_t /// @param [in] addr: NetOutput Op input tensor address. /// @param [in] size: NetOutput Op input tensor size. 
/// @return: 0 for success / others for failed -Status DavinciModel::CpuModelEnqueue(uint32_t queue_id, uintptr_t addr, uint32_t size) { +Status DavinciModel::CpuModelPrepareOutput(uintptr_t addr, uint32_t size) { GELOGI("Set CpuKernel model enqueue task enter."); if (input_mbuf_list_.empty()) { GELOGE(FAILED, "Need input mbuf for fill output mbuf head info."); return FAILED; } @@ -1230,20 +1256,30 @@ Status DavinciModel::CpuModelEnqueue(uint32_t queue_id, uintptr_t addr, uint32_t return FAILED; } - std::shared_ptr model_enqueue = MakeShared(rt_model_stream_); - if (model_enqueue == nullptr) { - GELOGE(FAILED, "Make CpuTaskModelEnqueue task failed."); - return FAILED; + cpu_task_list_.push_back(prepare_output); + output_mbuf_list_.push_back(out_mbuf); + GELOGI("Set CpuKernel model enqueue task success."); + return SUCCESS; +} + +/// @ingroup ge +/// @brief queue schedule, active stream will schedule by S0. +/// @return: 0 for success / others for failed +Status DavinciModel::BindActiveStream() { + // Stream not in active_stream_indication_ is active stream. + std::vector active_stream_list; + for (size_t i = 0; i < stream_list_.size(); ++i) { + if (active_stream_indication_.count(i) == 0) { + active_stream_list.push_back(stream_list_[i]); + active_stream_indication_.insert(i); // deactivate all model streams. + } } - if (model_enqueue->Init(queue_id, out_mbuf) != SUCCESS) { - return FAILED; + // Active stream add to active entry, will be activated by S0. + if (CpuActiveStream(active_stream_list) != SUCCESS) { + return INTERNAL_ERROR; } - cpu_task_list_.push_back(prepare_output); - cpu_task_list_.push_back(model_enqueue); - output_mbuf_list_.push_back(out_mbuf); - GELOGI("Set CpuKernel model enqueue task success."); return SUCCESS; } @@ -1293,6 +1329,38 @@ Status DavinciModel::CpuWaitEndGraph() { return SUCCESS; } +Status DavinciModel::BindEnqueue() { + for (size_t i = 0; i < output_queue_ids_.size(); ++i) { + auto it = output_data_info_.find(i); + if (it == output_data_info_.end()) { + GELOGE(FAILED, "Output not match: tensor num=%zu, Queue id index=%zu", output_data_info_.size(), i); + return FAILED; + } + + uint32_t queue_id = output_queue_ids_[i]; + if (CpuModelEnqueue(queue_id, output_mbuf_list_[i]) != SUCCESS) { + return INTERNAL_ERROR; + } + } + return SUCCESS; +} + +Status DavinciModel::CpuModelEnqueue(uint32_t queue_id, uintptr_t out_mbuf) { + GELOGI("Set CpuKernel model enqueue task enter."); + std::shared_ptr model_enqueue = MakeShared(rt_model_stream_); + if (model_enqueue == nullptr) { + GELOGE(FAILED, "Make CpuTaskModelEnqueue task failed."); + return FAILED; + } + + if (model_enqueue->Init(queue_id, out_mbuf) != SUCCESS) { + return FAILED; + } + cpu_task_list_.push_back(model_enqueue); + GELOGI("Set CpuKernel model enqueue task success."); + return SUCCESS; +} + /// @ingroup ge /// @brief definiteness queue schedule, repeat run model. /// @return: 0 for success / others for failed @@ -1589,17 +1657,35 @@ ge::Format DavinciModel::GetFormat() { } Status DavinciModel::CopyInputData(const InputData &current_data, bool device_data) { - Status ret = SUCCESS; - uint32_t data_op_index = 0; + rtMemcpyKind_t kind = device_data ?
RT_MEMCPY_DEVICE_TO_DEVICE : RT_MEMCPY_HOST_TO_DEVICE; + const std::vector &blobs = current_data.blobs; + for (const auto &data : input_data_info_) { + if (data.first >= blobs.size()) { + GELOGE(FAILED, "Blobs not match: blobs=%zu, tensor=%zu, index=%u, size=%ld", blobs.size(), + input_data_info_.size(), data.first, data.second.first); + return FAILED; + } - for (auto op_desc : data_op_list_) { - ret = CopyInputDataToModel(current_data.blobs, data_op_index, device_data); + const DataBuffer &data_buf = blobs[data.first]; + // if data attr support zero copy, then update addrs info to flowtable + bool flag = data_buf.isDataSupportMemShare && support_mem_shared_flag_; + if (flag) { + GELOGI("No need to copy input data, user's input data buffer can be shared."); + continue; + } + + void *mem_addr = data.second.second; + uint32_t mem_size = static_cast(data.second.first); + GE_CHK_BOOL_RET_STATUS(mem_size >= data_buf.length, PARAM_INVALID, + "input data size(%u) does not match model required size(%u), ret failed.", data_buf.length, + mem_size); - GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "Copy input data to model ret failed, index:%u, model id:%u", - current_data.index, current_data.model_id); - data_op_index++; + GELOGI("[IMAS]CopyPlainData memcpy graph_%u type[F] output[%u] memaddr[%p] mem_size[%u] datasize[%u]", + runtime_param_.graph_id, data.first, mem_addr, mem_size, data_buf.length); + GE_CHK_RT_RET(rtMemcpy(mem_addr, mem_size, data_buf.data, data_buf.length, kind)); } - return ret; + + return SUCCESS; } Status DavinciModel::SyncVarData() { @@ -1917,134 +2003,6 @@ void DavinciModel::SetProfileTime(ModelProcStage stage, int64_t endTime) { } return; } -/// -/// @ingroup domi_ome -/// @brief copy input data to Model's firat OP. Address already malloced when Load -/// @copy need datatype transfer: FLOAT to FP16, 4D to 5D; -/// @param [in] data data pointer to be copy -/// @return Status result -/// @author -/// -Status DavinciModel::CopyInputDataToModel(const std::vector &data, uint32_t data_op_index, - bool device_data) { - GE_CHK_BOOL_RET_STATUS(!data_op_list_.empty(), PARAM_INVALID, "data_op_list_ is empty!"); - - GE_CHK_BOOL_RET_STATUS(data_op_list_.size() == data.size(), PARAM_INVALID, - "The input data list size (%zu) does not match the model input list size (%zu)", data.size(), - data_op_list_.size()); - - GE_CHK_BOOL_RET_STATUS(data_op_index < data_op_list_.size(), PARAM_INVALID, - "input data op index(%zu) is invalid, exceeds input op size(%zu)", data_op_index, - data_op_list_.size()); - - /// input datatype conversion, converting FLOAT to FP16, 4D to 5D at the same time. - /// Choose respective mode in API parameters. 
- auto op_def = data_op_list_[data_op_index]; - GE_CHK_BOOL_EXEC(op_def != nullptr, return PARAM_INVALID, "op_def is null!"); - - auto data_index = data_op_index; - if (AttrUtils::GetInt(op_def, "index", data_index)) { - GELOGI("ge_train:get new index %u , old %u", data_index, data_op_index); - } - - GE_CHK_BOOL_EXEC(data_index < data.size(), return PARAM_INVALID, "index:%u >= size:%zu", data_index, data.size()); - GE_CHK_BOOL_RET_STATUS(op_def->GetInputsSize() == 1 && op_def->GetOutputsSize() == 1, PARAM_INVALID, - "Data Op has invalid input_desc_size(%zu) or output_desc_size(%zu)", op_def->GetInputsSize(), - op_def->GetOutputsSize()); - - // float to float16 - bool need_trans_flag = ModelUtils::IsInputTensorNeedTrans(data_op_list_[data_op_index], 0); - - int64_t output_size = 0; - GE_CHK_STATUS(TensorUtils::GetSize(*op_def->GetOutputDescPtr(0), output_size), "get output size failed."); - GE_CHK_BOOL_RET_STATUS(output_size >= data[data_index].length, PARAM_INVALID, - "input data size(%u) does not match model required size(%zu), ret failed.", - data[data_index].length, output_size); - - vector outputs = op_def->GetOutputOffset(); - if (device_data) { - return CopyPlainData(data, data_index, data_op_index, outputs, RT_MEMCPY_DEVICE_TO_DEVICE); - } else if (need_trans_flag) { - return CopyTransData(data, data_index, data_op_index, outputs); - } else { - return CopyPlainData(data, data_index, data_op_index, outputs, RT_MEMCPY_HOST_TO_DEVICE); - } -} - -Status DavinciModel::CopyTransData(const std::vector &data, uint32_t data_index, uint32_t data_op_index, - const std::vector &outputs) { - GE_CHECK_VECTOR_NOT_EMPTY(outputs); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(outputs[0] == -1, return PARAM_INVALID, "output offset is -1"); - GE_CHK_BOOL_EXEC(data_index < data.size(), return PARAM_INVALID, "index:%u >= size:%zu", data_index, data.size()); - - auto input_tensor_desc = data_op_input_tensor_desc_map_[data_op_list_[data_op_index]->GetName()]; - auto output_tensor_desc = data_op_output_tensor_desc_map_[data_op_list_[data_op_index]->GetName()]; - - uint8_t *src_data = reinterpret_cast(data[data_index].data); - - formats::TransResult tmp_result{}; - auto input_shape = input_tensor_desc->GetShape(); - auto src_data_size = input_shape.GetShapeSize(); - auto src_data_type = input_tensor_desc->GetDataType(); - auto dst_data_type = output_tensor_desc->GetDataType(); - GELOGD("Trans data type from %s to %s, input shape %s, data size %zu", - TypeUtils::DataTypeToSerialString(src_data_type).c_str(), - TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), formats::ShapeToString(input_shape).c_str(), - src_data_size); - auto ret = - formats::TransDataType({src_data, static_cast(src_data_size), src_data_type, dst_data_type}, tmp_result); - if (ret != SUCCESS) { - GELOGE(INTERNAL_ERROR, "Failed to trans data type from %s to %s, input shape %s, data size %zu, error code %u", - TypeUtils::DataTypeToSerialString(src_data_type).c_str(), - TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), formats::ShapeToString(input_shape).c_str(), - src_data_size, ret); - return ret; - } - - void *mem_addr = mem_base_ + outputs[0]; - auto rt_ret = rtMemcpy(mem_addr, static_cast(runtime_param_.mem_size - outputs[0]), - reinterpret_cast(tmp_result.data.get()), static_cast(tmp_result.length), - RT_MEMCPY_HOST_TO_DEVICE); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Failed to copy memory to device, size %zu", tmp_result.length); - return RT_FAILED; - } - GELOGI("[IMAS]CopyTransData memcpy graph_%u type[F] name[%s] 
output[%d] memaddr[%p] datasize[%zu]", - runtime_param_.graph_id, data_op_list_[data_op_index]->GetName().c_str(), 0, mem_addr, tmp_result.length); - return SUCCESS; -} - -Status DavinciModel::CopyPlainData(const std::vector &data, uint32_t data_index, uint32_t data_op_index, - const std::vector &outputs, rtMemcpyKind_t kind) { - GE_CHK_BOOL_EXEC(data_index < data.size(), return PARAM_INVALID, "index:%u >= size:%zu", data_index, data.size()); - bool flag = data[data_index].isDataSupportMemShare && support_mem_shared_flag_; - // if data attr support zero cpy,then update addrs info to flowtable - if (flag) { - GELOGI("No need to copy input data, user's input data buffer can be shared."); - return SUCCESS; - } - - GE_CHECK_VECTOR_NOT_EMPTY(outputs); - // P2P memory space parameters - void *host_data_addr = data[data_index].data; - uint32_t copy_size = data[data_index].length; - GELOGD("data output tensor is aipp tensor,copy data only."); - - void *data_out_addr = nullptr; - if (VarManager::Instance(session_id_)->IsVarAddr(outputs[0])) { - data_out_addr = var_mem_base_ + outputs[0] - runtime_param_.logic_var_base; - } else { - data_out_addr = mem_base_ + outputs[0]; - GELOGI("output[0]=%ld, copy_size=%u, total_size=%zu", outputs[0], copy_size, TotalMemSize()); - - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(((uint64_t)outputs[0] + (uint64_t)copy_size) > TotalMemSize(), return INTERNAL_ERROR, - "input offset add size is large than total memory."); - } - - GE_CHK_RT_RET(rtMemcpy(data_out_addr, copy_size, host_data_addr, copy_size, kind)); - - return SUCCESS; -} /// /// @ingroup domi_ome @@ -2061,9 +2019,9 @@ Status DavinciModel::CopyOutputData(uint32_t data_id, OutputData &output_data) { } else { output_data.index = data_id; output_data.model_id = model_id_; - GE_CHK_BOOL_RET_STATUS(output_data.blobs.size() == output_size_list_.size(), INTERNAL_ERROR, + GE_CHK_BOOL_RET_STATUS(output_data.blobs.size() == output_data_info_.size(), INTERNAL_ERROR, "output buffer size[%zu] not equal output_size_list[%zu] size!", output_data.blobs.size(), - output_size_list_.size()); + output_data_info_.size()); // index of data in output_data uint32_t output_data_index = 0; @@ -2100,6 +2058,9 @@ Status DavinciModel::CopyOutputDataToUser(OpDescPtr &op_desc, std::vectorGetName().c_str(), i, data_buf.data, data_buf.length, v_output_size[i]); GE_CHK_RT_RET(rtMemcpy(data_buf.data, size, v_output_data_addr[i], size, RT_MEMCPY_DEVICE_TO_DEVICE)); } @@ -2417,7 +2378,6 @@ void *DavinciModel::Run(DavinciModel *model) { CsaInteract::GetInstance().WriteInternalErrorCode(); GELOGI("Model run end, model id:%u", model->model_id_); - GEEVENT("Model Run thread end, model_id:%u.", model->model_id_); return nullptr; } @@ -2778,20 +2738,20 @@ bool DavinciModel::CheckInputAndModelSize(const int64_t &input_size, const int64 /// /// @ingroup ge /// @brief Copy Inputs and Outputs addr to model for direct use. -/// @param [in] const domi::InputData &input_data: model input data. -/// @param [in] domi::OutputData &output_data: model output data. +/// @param [in] const InputData &input_data: model input data. +/// @param [in] OutputData &output_data: model output data. 
/// @param [in] bool is_dynamic_input: whether is dynamic input, true: is dynamic input; false: not is dynamic input /// @return SUCCESS handle successfully / PARAM_INVALID for failed /// Status DavinciModel::CopyModelData(const InputData &input_data, OutputData &output_data, bool is_dynamic_input) { - if (ZeroCopyBlobs(input_addr_list_, input_size_list_, input_data.blobs, is_dynamic_input, kInputZeroCopy, - input_data.batch_label) != SUCCESS) { + if (ZeroCopyBlobs(input_data_info_, input_data.blobs, is_dynamic_input, kInputZeroCopy, input_data.batch_label) != + SUCCESS) { GELOGE(PARAM_INVALID, "Copy input data to model failed."); return PARAM_INVALID; } - if (ZeroCopyBlobs(output_addr_list_, output_size_list_, output_data.blobs, is_dynamic_input, kOutputZeroCopy, - input_data.batch_label) != SUCCESS) { + if (ZeroCopyBlobs(output_data_info_, output_data.blobs, is_dynamic_input, kOutputZeroCopy, input_data.batch_label) != + SUCCESS) { GELOGE(PARAM_INVALID, "Copy output data to model failed."); return PARAM_INVALID; } @@ -2804,31 +2764,37 @@ Status DavinciModel::CopyModelData(const InputData &input_data, OutputData &outp /// /// @ingroup ge /// @brief Copy Data addr to model for direct use. -/// @param [in] const vector &addrs: model input memory addr list. -/// @param [in] const vector &sizes: model input memory size list. +/// @param [in] const std::map> &data_info: model memory addr/size list. /// @param [in] const std::vector &blobs: user input data list. /// @param [in] bool is_dynamic_input: whether is dynamic input, true: is dynamic input; false: not is dynamic input /// @param [in] ZeroCopyMode zero_copy_mode: input zero copy or output zero copy /// @param [in] string batch_label: batch label for multi-batch scenes /// @return SUCCESS handle successfully / others handle failed /// -Status DavinciModel::ZeroCopyBlobs(const std::vector &addr_list, const std::vector &size_list, +Status DavinciModel::ZeroCopyBlobs(const std::map> &data_info, const std::vector &blobs, bool is_dynamic_input, ZeroCopyMode zero_copy_mode, std::string batch_label) { - if ((blobs.size() != addr_list.size()) || (blobs.size() != size_list.size())) { - GELOGE(FAILED, "Blobs not match: blobs=%zu addr=%zu size=%zu", blobs.size(), addr_list.size(), size_list.size()); + if (blobs.size() != data_info.size()) { + GELOGE(FAILED, "Blobs not match: blobs=%zu, data_info=%zu", blobs.size(), data_info.size()); return FAILED; } - for (size_t idx = 0; idx < size_list.size(); ++idx) { - const DataBuffer &data_buf = blobs[idx]; + for (const auto &data : data_info) { + if (data.first >= blobs.size()) { + GELOGE(FAILED, "Blobs not match: blobs=%zu, tensor=%zu, index=%u", blobs.size(), data_info.size(), data.first); + return FAILED; + } + int64_t mem_size = data.second.first; + void *mem_addr = data.second.second; + + const DataBuffer &data_buf = blobs[data.first]; if (data_buf.data == nullptr) { - GELOGE(FAILED, "data_buf.data is nullptr, index=%zu", idx); + GELOGE(FAILED, "data_buf.data is nullptr, index=%u", data.first); return FAILED; } - GELOGI("Copy Blobs %zu: Input data length is %u, Op data size is %u.", idx, data_buf.length, size_list[idx]); - if (!CheckInputAndModelSize(data_buf.length, size_list[idx], is_dynamic_input)) { + GELOGI("Copy Blobs %u: Input data length is %u, Op data size is %ld.", data.first, data_buf.length, mem_size); + if (!CheckInputAndModelSize(data_buf.length, mem_size, is_dynamic_input)) { GELOGE(FAILED, "Check input size and model size failed"); return FAILED; } @@ -2838,14 +2804,14 @@ Status
DavinciModel::ZeroCopyBlobs(const std::vector &addr_list, const s } if (zero_copy_mode == kInputZeroCopy) { - if (ZeroCopyInputBlobs(addr_list[idx], size_list[idx], data_buf, zero_copy_mode, batch_label) != SUCCESS) { + if (ZeroCopyInputBlobs(mem_addr, mem_size, data_buf, zero_copy_mode, batch_label) != SUCCESS) { GELOGE(FAILED, "Zero copy input blobs failed"); return FAILED; } } if (zero_copy_mode == kOutputZeroCopy && !is_dynamic_input) { - if (ZeroCopyImpl(addr_list[idx], data_buf, zero_copy_mode, batch_label) != SUCCESS) { + if (ZeroCopyImpl(mem_addr, data_buf, zero_copy_mode, batch_label) != SUCCESS) { GELOGE(FAILED, "Output zero copy data node copy failed"); return FAILED; } @@ -2940,11 +2906,21 @@ Status DavinciModel::ZeroCopyImpl(const void *src_addr, const DataBuffer &data_b if (!CheckDynamicBatchZeroCopyAddr(addr, dynamic_input_addrs, fix_input_addrs)) { continue; } - __builtin_prefetch(addr); - rtError_t rt_err = rtMemcpy(addr, sizeof(void *), &dst_addr, sizeof(void *), RT_MEMCPY_HOST_TO_DEVICE); - if (rt_err != RT_ERROR_NONE) { - GELOGE(FAILED, "ZeroCopyImpl: rtMemcpy failed"); - return FAILED; + + if (is_async_mode_) { + rtError_t rt_err = + rtMemcpyAsync(addr, kAddrLen, &dst_addr, kAddrLen, RT_MEMCPY_HOST_TO_DEVICE_EX, rt_model_stream_); + if (rt_err != RT_ERROR_NONE) { + GELOGE(FAILED, "ZeroCopyImpl: rtMemcpyAsync failed"); + return FAILED; + } + } else { + __builtin_prefetch(addr); + rtError_t rt_err = rtMemcpy(addr, kAddrLen, &dst_addr, kAddrLen, RT_MEMCPY_HOST_TO_DEVICE); + if (rt_err != RT_ERROR_NONE) { + GELOGE(FAILED, "ZeroCopyImpl: rtMemcpy failed"); + return FAILED; + } } GELOGI("[IMAS]refresh in/out addr new:%p, old:%p", dst_addr, src_addr); } @@ -2999,7 +2975,7 @@ const char *DavinciModel::GetRegisterStub(const string &binfile, const string &s /// @brief Constant Op Init. /// @return Status /// -Status DavinciModel::InitConstant(const ConstOpDescPtr &op_desc) const { +Status DavinciModel::InitConstant(const OpDescPtr &op_desc) { auto v_weights = ModelUtils::GetWeights(op_desc); auto v_output_size = ModelUtils::GetOutputSize(op_desc); auto v_output_addr = ModelUtils::GetOutputDataAddrs(runtime_param_, op_desc); @@ -3023,17 +2999,24 @@ Status DavinciModel::InitConstant(const ConstOpDescPtr &op_desc) const { /// the logic of GetShapeSize is wrong, the scaler tensor's GetShapeSize is zero /// and that of unknown shape is zero too. /// unknown shape will not appear here, so we can use zero judge a tensor is scaler or not - int64_t elem_num = tensor_shape.GetShapeSize() == 0 ? 
1 : tensor_shape.GetShapeSize(); + int64_t elem_num = tensor_shape.GetShapeSize(); + if (elem_num == 0 && tensor_shape.GetDims().size() == 0) { + elem_num = 1; + } uint64_t *buff = reinterpret_cast(tensor->MutableData().data()); GE_CHK_BOOL_RET_STATUS(ge::CheckInt64Uint32MulOverflow(elem_num, kBytes) == SUCCESS, FAILED, "Shape size is invalid"); - int64_t offset = elem_num * kBytes; + uint64_t offset = static_cast(elem_num * kBytes); - uint64_t hbm_raw_data_base_addr = reinterpret_cast(v_output_addr[0]) + offset; + uint64_t hbm_raw_data_base_addr = + reinterpret_cast(reinterpret_cast(v_output_addr[0])) + offset; for (int64_t i = elem_num - 1; i >= 0; --i) { buff[i] = hbm_raw_data_base_addr + (buff[i] - buff[0]); } } + GELOGI("[IMAS]InitConstant memcpy graph_%u type[V] name[%s] output[%d] memaddr[%p] mem_size[%u] datasize[%zu]", + runtime_param_.graph_id, op_desc->GetName().c_str(), 0, v_output_addr[0], v_output_size[0], + tensor->GetData().size()); GE_CHK_RT_RET(rtMemcpy(v_output_addr[0], v_output_size[0], tensor->GetData().data(), tensor->GetData().size(), RT_MEMCPY_HOST_TO_DEVICE)); @@ -3143,45 +3126,48 @@ void DavinciModel::CleanTbeHandle() { /// @brief insert active_stream_indication_ /// @return Status /// -Status DavinciModel::MarkActiveStream(const OpDescPtr &op_desc) { - GE_CHECK_NOTNULL(op_desc); - std::string type = op_desc->GetType(); - GE_IF_BOOL_EXEC( - type == STREAMSWITCH, std::vector active_stream_list; - GE_LOGI_IF(!ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_list), - "GetInt ACTIVE_STREAM_LIST failed."); - if (active_stream_list.size() != kTrueBranchStreamNum) { - GELOGE(INTERNAL_ERROR, "Stream num of switch true branch must be %u.", kTrueBranchStreamNum); - return INTERNAL_ERROR; - } uint32_t true_stream_id = active_stream_list.front(); - active_stream_indication_.insert(true_stream_id); - GELOGI("flowctrl_op_index_map node:%s, true_stream_id=%u.", op_desc->GetName().c_str(), true_stream_id);); - GE_IF_BOOL_EXEC( - type == STREAMACTIVE, if (op_desc->HasAttr(ATTR_NAME_SWITCH_BRANCH_NODE_LABEL)) { - std::vector active_stream_list; - GE_CHK_BOOL_EXEC(AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_list), - return INTERNAL_ERROR, "StreamActiveOp get attr ACTIVE_STREAM failed."); - - for (size_t j = 0; j < active_stream_list.size(); ++j) { - active_stream_indication_.insert(active_stream_list[j]); - GELOGI("flowctrl_op_index_map node:%s, active_stream_id=%u.", op_desc->GetName().c_str(), - active_stream_list[j]); - } - }); - - if (type == STREAMSWITCHN) { +Status DavinciModel::InitStreamActive(const OpDescPtr &op_desc) { + if (op_desc->HasAttr(ATTR_NAME_SWITCH_BRANCH_NODE_LABEL)) { std::vector active_stream_list; - if (!AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_list)) { - GELOGE(INTERNAL_ERROR, "StreamSwitchNOp get attr ACTIVE_STREAM failed."); - return INTERNAL_ERROR; - } + GE_CHK_BOOL_EXEC(AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_list), + return INTERNAL_ERROR, "StreamActiveOp get attr ACTIVE_STREAM failed."); for (size_t j = 0; j < active_stream_list.size(); ++j) { active_stream_indication_.insert(active_stream_list[j]); - GELOGI("StreamSwitchNOp node:%s, active_stream_id=%u.", op_desc->GetName().c_str(), active_stream_list[j]); - }; + GELOGI("flowctrl_op_index_map node:%s, active_stream_id=%u.", op_desc->GetName().c_str(), active_stream_list[j]); + } + } + + return SUCCESS; +} + +Status DavinciModel::InitStreamSwitch(const OpDescPtr &op_desc) { 
+ std::vector active_stream_list; + GE_LOGI_IF(!ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_list), + "GetInt ACTIVE_STREAM_LIST failed."); + if (active_stream_list.size() != kTrueBranchStreamNum) { + GELOGE(INTERNAL_ERROR, "Stream num of switch true branch must be %u.", kTrueBranchStreamNum); + return INTERNAL_ERROR; + } + + uint32_t true_stream_id = active_stream_list.front(); + active_stream_indication_.insert(true_stream_id); + GELOGI("flowctrl_op_index_map node:%s, true_stream_id=%u.", op_desc->GetName().c_str(), true_stream_id); + + return SUCCESS; +} + +Status DavinciModel::InitStreamSwitchN(const OpDescPtr &op_desc) { + std::vector active_stream_list; + if (!AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_list)) { + GELOGE(INTERNAL_ERROR, "StreamSwitchNOp get attr ACTIVE_STREAM failed."); + return INTERNAL_ERROR; + } + + for (size_t j = 0; j < active_stream_list.size(); ++j) { + active_stream_indication_.insert(active_stream_list[j]); + GELOGI("StreamSwitchNOp node:%s, active_stream_id=%u.", op_desc->GetName().c_str(), active_stream_list[j]); } - GELOGI("Flow control: active_stream_indication_ size = %zu.", active_stream_indication_.size()); return SUCCESS; } @@ -3205,12 +3191,11 @@ bool DavinciModel::IsBroadCastOpData(const ge::NodePtr &var_node) { /// @ingroup domi_ome /// @brief Init model stream for NN model. /// @param [in] stream user input model stream. -/// @param [in] async_mode is asynchronize mode. /// @return Status /// -Status DavinciModel::InitModelStream(rtStream_t stream, bool async_mode) { +Status DavinciModel::InitModelStream(rtStream_t stream) { // asynchronize mode, use user input stream. - if (async_mode) { + if (is_async_mode_) { rt_model_stream_ = stream; is_inner_model_stream_ = false; return SUCCESS; @@ -3245,16 +3230,12 @@ Status DavinciModel::InitModelStream(rtStream_t stream, bool async_mode) { /// Status DavinciModel::NnExecute(rtStream_t stream, bool async_mode, const InputData &input_data, OutputData &output_data) { - GELOGI("Model Run begin, model id:%u, data index:%u, flag:%d.", model_id_, input_data.index, async_mode); - GE_CHK_STATUS(InitModelStream(stream, async_mode), "Init model stream failed."); - - GELOGI("do rtModelExecute task sink, model id:%u", input_data.model_id); + is_async_mode_ = async_mode; + GELOGI("Model Run begin, model id:%u, data index:%u, flag:%d.", model_id_, input_data.index, is_async_mode_); + GE_CHK_STATUS_RET(InitModelStream(stream), "Init model stream failed."); - auto enable_dump = false; auto dump_path = PropertiesManager::Instance().GetDumpOutputPath(); - if (!dump_path.empty()) { - enable_dump = true; - } + auto enable_dump = !dump_path.empty(); auto dump_op_env = std::getenv("DUMP_OP"); if (dump_op_env != nullptr) { @@ -3275,9 +3256,9 @@ Status DavinciModel::NnExecute(rtStream_t stream, bool async_mode, const InputDa output_use_zero_copy_ = false; } - // Asynchronous mode depends on zero copy. - if (async_mode && !input_use_zero_copy_ && !output_use_zero_copy_ && !task_list_.empty()) { - GELOGE(FAILED, "Asynchronous mode but zero copy disabled."); + // Empty task, Just copy input to output, need direct copy. 
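
A note on the DT_STRING constant path in the hunk above: each element of the tensor arrives holding a host offset, and the loop rewrites every element as the device raw-data base plus that element's distance from element 0; walking the array in reverse keeps buff[0] intact until it is rewritten last, since it anchors the subtraction. A self-contained sketch of the same rebasing (function and variable names are illustrative, not from the GE sources):

    #include <cstdint>
    #include <cstdio>

    // Rebase host string offsets onto a device base address. The raw character
    // data sits right after the elem_num pointer slots, so the device base is
    // v_output_addr[0] + elem_num * sizeof(uint64_t) in the hunk above.
    void RebaseStringAddrs(uint64_t *buff, int64_t elem_num, uint64_t hbm_raw_data_base_addr) {
      // Walk backwards so buff[0], the anchor of the subtraction, changes last.
      for (int64_t i = elem_num - 1; i >= 0; --i) {
        buff[i] = hbm_raw_data_base_addr + (buff[i] - buff[0]);
      }
    }

    int main() {
      uint64_t buff[3] = {100, 116, 140};  // host offsets; buff[0] is string 0
      RebaseStringAddrs(buff, 3, 0x20000);
      std::printf("%llx %llx %llx\n", (unsigned long long)buff[0],
                  (unsigned long long)buff[1], (unsigned long long)buff[2]);
      // prints 20000 20010 20028
      return 0;
    }
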
+ if (task_list_.empty() && (input_use_zero_copy_ || output_use_zero_copy_)) { + GELOGE(FAILED, "Empty task, Just copy input to output, need direct copy."); return FAILED; } @@ -3298,15 +3279,16 @@ Status DavinciModel::NnExecute(rtStream_t stream, bool async_mode, const InputDa GELOGI("rtModelExecute end"); } - GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), SetProfileTime(MODEL_AFTER_PROC_START)); - ret = output_use_zero_copy_ ? SyncDataAndDump() : CopyOutputData(input_data.index, output_data); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return INTERNAL_ERROR, "Copy Output data to user failed."); - GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), SetProfileTime(MODEL_AFTER_PROC_END)); + if (!is_async_mode_) { + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), SetProfileTime(MODEL_AFTER_PROC_START)); + ret = output_use_zero_copy_ ? SyncDataAndDump() : CopyOutputData(input_data.index, output_data); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return INTERNAL_ERROR, "Copy Output data to user failed."); + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), SetProfileTime(MODEL_AFTER_PROC_END)); + } // report model time data GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), (void)SinkTimeProfile(input_data)); GELOGI("Model run end, model id:%u", model_id_); - GEEVENT("Model Run thread end, model_id:%u", model_id_); return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/davinci_model.h b/src/ge/graph/load/new_model_manager/davinci_model.h index 9ce02a42..76edd4a4 100644 --- a/src/ge/graph/load/new_model_manager/davinci_model.h +++ b/src/ge/graph/load/new_model_manager/davinci_model.h @@ -340,13 +340,6 @@ class DavinciModel { vector &output_desc, std::vector &inputFormats, std::vector &output_formats); - /// - /// @ingroup domi_ome - /// @brief copy input data to model - /// @return Status - /// - Status CopyInputDataToModel(const std::vector &data, uint32_t data_op_index, bool device_data); - Status ReturnResult(uint32_t data_id, const bool rslt_flg, const bool seq_end_flg, OutputData *output_data); Status ReturnNoOutput(uint32_t data_id); @@ -413,20 +406,6 @@ class DavinciModel { /// uint32_t GetDeviceId() const { return device_id_; } - /// - /// @ingroup domi_ome - /// @brief Set Train Mode - /// @return void - /// - void SetTrainMode(bool mode) { is_train_mode_ = mode; } - - /// - /// @ingroup domi_ome - /// @brief Get Train Mode - /// @return bool true - /// - bool GetTrainMode() { return is_train_mode_; } - GeModelPtr GetGeModel() { return ge_model_; } const RuntimeParam &GetRuntimeParam() { return runtime_param_; } @@ -519,15 +498,14 @@ class DavinciModel { /// /// @ingroup ge /// @brief Copy Data addr to model for direct use. - /// @param [in] const vector &addrs: model input memory addr list. - /// @param [in] const vector &sizes: model input memory size list. + /// @param [in] const std::map> &data_info: model memory addr/size list. /// @param [in] const std::vector &blobs: user input data list. 
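
Backing up to the guard at the start of this hunk: the old check rejected asynchronous execution without zero copy, while the new one rejects the opposite corner, a model whose task list is empty but which still has zero-copy inputs or outputs, since such a model can only satisfy a request through direct host copies. A toy version of the inverted predicate (names hypothetical):

    #include <cstdio>
    #include <vector>

    // True when the model must fail fast: no tasks to run, yet zero copy was
    // requested on either side, so a direct copy path is the only option.
    bool EmptyTaskNeedsDirectCopy(const std::vector<int> &task_list,
                                  bool input_zero_copy, bool output_zero_copy) {
      return task_list.empty() && (input_zero_copy || output_zero_copy);
    }

    int main() {
      std::printf("%d\n", EmptyTaskNeedsDirectCopy({}, true, false));   // 1 -> FAILED
      std::printf("%d\n", EmptyTaskNeedsDirectCopy({1}, true, false));  // 0 -> ok
      return 0;
    }
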
/// @param [in] bool is_dynamic_input: whether is dynamic input, true: is dynamic input; false: not is dynamic input /// @param [in] ZeroCopyMode zero_copy_mode: input zero copy or output zero copy /// @param [in] string batch_label: batch label for multi-batch scenes /// @return SUCCESS handle successfully / others handle failed /// - Status ZeroCopyBlobs(const std::vector &addr_list, const std::vector &size_list, + Status ZeroCopyBlobs(const std::map> &data_info, const std::vector &blobs, bool is_dynamic_input, ZeroCopyMode zero_copy_mode, string batch_label); @@ -610,11 +588,9 @@ class DavinciModel { /// @brief Data Op Initialize. /// @param [in] NodePtr: Data Op. /// @param [in/out] data_op_index: NetOutput addr size info. - /// @param [in/out] input_data_info: Data index and addr info {index, {size, addr}}. /// @return Status /// - Status InitDataOp(const NodePtr &node, uint32_t &data_op_index, - std::map> &input_data_info); + Status InitDataOp(const NodePtr &node, uint32_t &data_op_index); /// /// @ingroup ge @@ -633,19 +609,27 @@ class DavinciModel { Status InitNetOutput(const OpDescPtr &op_desc); /// - /// @ingroup ge - /// @brief Make Input and Output addr for feature use. - /// @param [in] input_data_info: Data index and addr info {index, {size, addr}}. - /// @return Status - /// - Status CombineDataInfo(const std::map> &input_data_info); - - /// /// @ingroup domi_ome /// @brief Constant Op Init. /// @return Status /// - Status InitConstant(const ConstOpDescPtr &op_desc) const; + Status InitConstant(const OpDescPtr &op_desc); + + Status InitVariable(const OpDescPtr &op_desc); + + Status InitEndGraph(const OpDescPtr &op_desc); + + /// @ingroup ge + /// @brief LabelSet Op Initialize. + /// @param [in] op_desc: LabelSet Op descriptor. + /// @return Status + Status InitLabelSet(const OpDescPtr &op_desc); + + Status InitStreamSwitch(const OpDescPtr &op_desc); + + Status InitStreamActive(const OpDescPtr &op_desc); + + Status InitStreamSwitchN(const OpDescPtr &op_desc); /// /// @ingroup domi_ome @@ -662,7 +646,7 @@ class DavinciModel { /// @brief Init model stream for NN model. /// @return Status /// - Status InitModelStream(rtStream_t stream, bool async_mode); + Status InitModelStream(rtStream_t stream); /// /// @ingroup ge @@ -678,12 +662,16 @@ class DavinciModel { /// Status BindInputQueue(); + Status CpuTaskModelZeroCopy(std::vector &mbuf_list, + std::map> &outside_addrs); + /// /// @ingroup ge /// @brief ACL, Bind NetOutput Op addr to output queue. /// @return: 0 for success / others for fail /// Status BindOutputQueue(); + Status CpuModelPrepareOutput(uintptr_t addr, uint32_t size); /// /// @ingroup ge @@ -693,13 +681,6 @@ class DavinciModel { Status BindActiveStream(); /// - /// @ingroup domi_ome - /// @brief insert active_stream_indication_ - /// @return Status - /// - Status MarkActiveStream(const OpDescPtr &op_desc); - - /// /// @ingroup ge /// @brief definiteness queue schedule, bind input queue to task. /// @param [in] queue_id: input queue id from user. @@ -707,7 +688,7 @@ class DavinciModel { /// @param [in] size: Data Op output tensor size. /// @return: 0 for success / others for fail /// - Status CpuModelDequeue(uint32_t queue_id, uintptr_t addr, uint32_t size); + Status CpuModelDequeue(uint32_t queue_id); /// /// @ingroup ge @@ -734,6 +715,8 @@ class DavinciModel { /// Status CpuWaitEndGraph(); + Status BindEnqueue(); + Status CpuModelEnqueue(uint32_t queue_id, uintptr_t out_mbuf); /// /// @ingroup ge /// @brief definiteness queue schedule, repeat run model. 
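
The queue-schedule declarations above split binding from data movement: CpuModelDequeue now takes only a queue id, CpuTaskModelZeroCopy patches task args against outside addresses, and BindEnqueue/CpuModelEnqueue handle the output side. As a rough analogue only, with standard containers standing in for the runtime mbuf queues and every name hypothetical, the intended dequeue, zero-copy, enqueue sequence looks like:

    #include <cstdint>
    #include <cstdio>
    #include <queue>
    #include <vector>

    // Toy dequeue -> zero-copy -> enqueue sequence; no runtime APIs involved.
    int main() {
      std::queue<std::uintptr_t> input_queue;   // filled by the caller at runtime
      std::queue<std::uintptr_t> output_queue;  // drained by the caller at runtime
      input_queue.push(0x1000);

      // 1. Dequeue: take the next input mbuf (no address fixed at bind time).
      std::uintptr_t mbuf = input_queue.front();
      input_queue.pop();

      // 2. Zero copy: patch the task's arg slot so kernels use the mbuf directly.
      std::vector<std::uintptr_t> task_args(1, 0);
      task_args[0] = mbuf;

      // 3. Enqueue: publish the (here: pass-through) buffer on the output queue.
      output_queue.push(task_args[0]);
      std::printf("out mbuf: 0x%llx\n", static_cast<unsigned long long>(output_queue.front()));
      return 0;
    }
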
@@ -783,10 +766,8 @@ class DavinciModel { vector variable_op_list_; - vector output_size_list_; // Init by NetOutput Input Tensor - vector output_addr_list_; // Init by NetOutput Input Tensor - vector input_size_list_; // Init by Data Output Tensor - vector input_addr_list_; // Init by Data Output Tensor + std::map> input_data_info_; // Init by Data Output Tensor + std::map> output_data_info_; // Init by NetOutput Input Tensor // output op: save cce op actual needed memory size vector output_memory_size_list_; @@ -813,6 +794,7 @@ class DavinciModel { vector event_list_; vector label_list_; + set label_id_indication_; std::mutex outside_addrs_mutex_; std::map> input_outside_addrs_; @@ -830,6 +812,8 @@ class DavinciModel { bool is_inner_model_stream_; + bool is_async_mode_; // For NN execute, Async mode use rtMemcpyAsync on rt_model_stream_. + // ACL queue schedule, save queue ids for Init. std::vector cpu_task_list_; std::vector input_queue_ids_; // input queue ids created by caller. @@ -847,8 +831,6 @@ class DavinciModel { uint32_t device_id_; - bool is_train_mode_; - std::mutex flowctrl_op_index_internal_map_mutex_; std::map flowctrl_op_index_internal_map_; std::set active_stream_indication_; diff --git a/src/ge/graph/load/new_model_manager/model_manager.cc b/src/ge/graph/load/new_model_manager/model_manager.cc index a1fefff2..1b6b30c2 100644 --- a/src/ge/graph/load/new_model_manager/model_manager.cc +++ b/src/ge/graph/load/new_model_manager/model_manager.cc @@ -358,26 +358,17 @@ Status ModelManager::DataInputTensor(uint32_t model_id, const std::vectorGetDataList()) { - GE_CHECK_NOTNULL(op); - GE_CHECK_GE(inputs.size(), 1); - GE_CHECK_GE(inputs.size() - 1, index); - + for (size_t i = 0; i < inputs.size(); ++i) { DataBuffer data; - data.data = inputs[index].data.data; - data.length = inputs[index].data.length; + data.data = inputs[i].data.data; + data.length = inputs[i].data.length; input_data.blobs.push_back(data); - index++; } - CHECK_FALSE_EXEC(input_data.blobs.size() >= inputs.size(), - GELOGW("cur_inputs size = %zu, inputs size = %zu.", input_data.blobs.size(), inputs.size());); - OutputData output_data; output_data.model_id = model_id; output_data.index = 0; - for (size_t i = 0; i < outputs.size(); i++) { + for (size_t i = 0; i < outputs.size(); ++i) { DataBuffer data; data.data = outputs[i].data.data; data.length = outputs[i].data.length; @@ -675,6 +666,15 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model break; } davinci_model->SetId(model_id); + + int32_t device_id = 0; + rtError_t rt_ret = rtGetDevice(&device_id); + if (rt_ret != RT_ERROR_NONE || device_id < 0) { + GELOGE(RT_FAILED, "Call rtGetDevice failed, ret = 0x%X, device_id = %d.", rt_ret, device_id); + return FAILED; + } + davinci_model->SetDeviceId(device_id); + ret = davinci_model->Init(dev_ptr, mem_size, weight_ptr, weight_size); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, break, "DavinciInit failed."); diff --git a/src/ge/graph/load/new_model_manager/model_utils.cc b/src/ge/graph/load/new_model_manager/model_utils.cc index 360a537f..dd2d20f6 100644 --- a/src/ge/graph/load/new_model_manager/model_utils.cc +++ b/src/ge/graph/load/new_model_manager/model_utils.cc @@ -53,27 +53,6 @@ bool ModelUtils::IsOutput(ConstOpDescPtr op_desc) { /// /// @ingroup domi_ome -/// @brief Check is the Input need trans code. 
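
The member hunk above folds four parallel size/address vectors into two maps keyed by the input or output index, so a lookup yields both values together and they can no longer drift apart. A minimal illustration of the new shape, assuming the {index -> {size, addr}} layout stated in the comments:

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <utility>

    // index -> {size, addr}, the layout of input_data_info_ / output_data_info_.
    using DataInfo = std::map<uint32_t, std::pair<int64_t, void *>>;

    int main() {
      static char buf[64];
      DataInfo input_data_info;
      input_data_info[0] = {static_cast<int64_t>(sizeof(buf)), buf};  // input 0

      auto it = input_data_info.find(0);
      if (it != input_data_info.end()) {
        std::printf("input[0]: size=%lld addr=%p\n",
                    static_cast<long long>(it->second.first), it->second.second);
      }
      return 0;
    }
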
-/// @return bool -/// -bool ModelUtils::IsInputTensorNeedTrans(ConstOpDescPtr op_desc, size_t tensor_index) { - GE_CHECK_NOTNULL_EXEC(op_desc, return false); - const auto &input_desc = op_desc->MutableInputDesc(static_cast(tensor_index)); - const auto &output_desc = op_desc->MutableOutputDesc(static_cast(tensor_index)); - GE_CHECK_NOTNULL_EXEC(input_desc, return false); - GE_CHECK_NOTNULL_EXEC(output_desc, return false); - - if ((output_desc->GetFormat() == FORMAT_NC1HWC0) && (output_desc->GetDataType() == DT_INT8)) { - // AIPP input, add attribute in data op to tag aipp - return false; - } - - return (input_desc->GetFormat() != output_desc->GetFormat()) || - (input_desc->GetDataType() != output_desc->GetDataType()); -} - -/// -/// @ingroup domi_ome /// @brief Get input size. /// @return vector /// @@ -398,6 +377,8 @@ vector ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co GE_CHK_STATUS(TensorUtils::GetDataOffset(tensor_desc, data_offset)); uint8_t *weight_addr = static_cast(weight_base + data_offset - logic_weight_base); v_input_data_addr.push_back(weight_addr); + GELOGI("[IMAS]GetInputDataAddrs graph_%u type[C] name[%s] input[%zu] memaddr[%p]", model_param.graph_id, + op_desc->GetName().c_str(), i, weight_addr); }); non_const_index++; continue; @@ -411,7 +392,10 @@ vector ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co non_const_index++; GE_IF_BOOL_EXEC(var_size != 0 && ge::VarManager::Instance(session_id)->IsVarAddr(input_offset), uint8_t *variable_addr = var_base + input_offset - logic_var_base; - v_input_data_addr.push_back(variable_addr); continue;); + v_input_data_addr.push_back(variable_addr); + GELOGI("[IMAS]GetInputDataAddrs graph_%u type[V] name[%s] input[%lu] memaddr[%p]", + model_param.graph_id, op_desc->GetName().c_str(), i, variable_addr); + continue;); bool input_tensor = false; GE_IF_BOOL_EXEC(TensorUtils::GetInputTensor(op_desc->GetOutputDesc(i), input_tensor) != GRAPH_SUCCESS, @@ -421,12 +405,14 @@ vector ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co uint8_t *mem_addr = nullptr; // l1 fusion if (has_mem_type_attr && v_memory_type[i] != RT_MEMORY_HBM) { - mem_addr = reinterpret_cast(input_offset); + mem_addr = reinterpret_cast(reinterpret_cast(input_offset)); v_input_data_addr.push_back(mem_addr); } else { mem_addr = static_cast(mem_base + input_offset - logic_mem_base); v_input_data_addr.push_back(mem_addr); } + GELOGI("[IMAS]GetInputDataAddrs graph_%u type[F] name[%s] input[%zu] memaddr[%p]", model_param.graph_id, + op_desc->GetName().c_str(), i, mem_addr); } return v_input_data_addr; @@ -487,12 +473,14 @@ vector ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C uint8_t *mem_addr = nullptr; // l1 fusion if (has_mem_type_attr && v_memory_type[i] != RT_MEMORY_HBM) { - mem_addr = reinterpret_cast(v_output_offset[i]); + mem_addr = reinterpret_cast(reinterpret_cast(v_output_offset[i])); v_output_data_addr.push_back(mem_addr); } else { mem_addr = static_cast(mem_base + v_output_offset[i] - logic_mem_base); v_output_data_addr.push_back(mem_addr); } + GELOGI("[IMAS]GetOutputDataAddrs graph_%u type[F] name[%s] output[%zu] memaddr[%p]", model_param.graph_id, + op_desc->GetName().c_str(), i, mem_addr); } return v_output_data_addr; } @@ -530,7 +518,7 @@ vector ModelUtils::GetWorkspaceDataAddrs(const RuntimeParam &model_param if (has_mem_type_attr && v_memory_type[i] != RT_MEMORY_HBM) { v_workspace_data_addr.push_back(reinterpret_cast(v_workspace_offset[i])); GELOGI("L1Fusion: op: %s, GetWorkspaceDataAddrs 
mem_addr[workspace index %zu]:%p", op_desc->GetName().c_str(), i, - reinterpret_cast(v_workspace_offset[i])); + reinterpret_cast(reinterpret_cast(v_workspace_offset[i]))); } else { int64_t workspace_offset = v_workspace_offset[i]; int64_t workspace_bytes = v_workspace_bytes[i]; @@ -558,6 +546,7 @@ Status ModelUtils::ConvertVirtualAddressToPhysical(uint8_t *virtual_address, uin return RT_FAILED; } + GELOGD("virtual_address=%p, physical_address=%p", virtual_address, physical_address); return SUCCESS; } } // namespace ge diff --git a/src/ge/graph/load/new_model_manager/model_utils.h b/src/ge/graph/load/new_model_manager/model_utils.h index 1a15c930..479cc431 100644 --- a/src/ge/graph/load/new_model_manager/model_utils.h +++ b/src/ge/graph/load/new_model_manager/model_utils.h @@ -42,13 +42,6 @@ class ModelUtils { /// /// @ingroup domi_ome - /// @brief Check is the Input need trans code. - /// @return bool - /// - static bool IsInputTensorNeedTrans(ConstOpDescPtr op_desc, size_t tensor_index); - - /// - /// @ingroup domi_ome /// @brief Get input size. /// @return vector /// diff --git a/src/ge/graph/load/new_model_manager/task_info/end_graph_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/end_graph_task_info.cc index cb30092c..75acf548 100644 --- a/src/ge/graph/load/new_model_manager/task_info/end_graph_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/end_graph_task_info.cc @@ -38,6 +38,7 @@ Status EndGraphTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin } model_ = davinci_model->GetRtModelHandle(); + GELOGI("InitEndGraphTaskInfo Init Success, model:%p, stream:%p", model_, stream_); return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc index 52511f03..3fa5eee2 100644 --- a/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc @@ -125,6 +125,7 @@ Status HcclTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_m return RT_FAILED; } + GELOGD("hccl_stream addr is=%p", stream); hccl_stream_list_.push_back(stream); davinci_model->PushHcclStream(stream); } @@ -245,6 +246,8 @@ void HcclTaskInfo::GetPrivateDefByTaskDef(const domi::TaskDef &task) { GELOGE(RT_FAILED, "Call rtMemcpy Fail, ret = 0x%X.", ret); return; } + + GELOGI("The first address of the custom info, privateDef=%p.", private_def_); } } } diff --git a/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc index 88e8a1bb..faaa3f82 100644 --- a/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc @@ -41,6 +41,7 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin } auto kernel_ex_def = task_def.kernel_ex(); + const RuntimeParam &rts_param = davinci_model->GetRuntimeParam(); // 1. 
Copy context from kernelExDef.private to workspace uint32_t op_index = kernel_ex_def.op_index(); @@ -50,12 +51,12 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin return INTERNAL_ERROR; } - if (CopyTaskInfo(kernel_ex_def, davinci_model->GetRuntimeParam(), op_desc) != SUCCESS) { + if (CopyTaskInfo(kernel_ex_def, rts_param, op_desc) != SUCCESS) { GELOGE(FAILED, "copy task info to workspace failed."); return FAILED; } - vector workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(davinci_model->GetRuntimeParam(), op_desc); + const vector workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc); if (workspace_data_addrs.empty()) { GELOGE(FAILED, "workspace_data_addrs is empty."); return FAILED; @@ -79,16 +80,16 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin uint64_t step_id_addr = 0; OpDescPtr step_id_node = davinci_model->GetVariableOp(NODE_NAME_GLOBAL_STEP); if (step_id_node != nullptr) { - vector v_step_id_addr = ModelUtils::GetOutputDataAddrs(davinci_model->GetRuntimeParam(), step_id_node); + vector v_step_id_addr = ModelUtils::GetOutputDataAddrs(rts_param, step_id_node); if (!v_step_id_addr.empty()) { step_id_addr = static_cast(reinterpret_cast(v_step_id_addr[0])); } } // 3. Set workspaceaddr, inputOutputDataAddr - uint64_t workspace_base_addr = reinterpret_cast(workspace_data_addrs[0]); - vector input_addrs = ModelUtils::GetInputDataAddrs(davinci_model->GetRuntimeParam(), op_desc); - vector output_addrs = ModelUtils::GetOutputDataAddrs(davinci_model->GetRuntimeParam(), op_desc); + uint64_t workspace_base_addr = reinterpret_cast(reinterpret_cast(workspace_data_addrs[0])); + const vector input_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc); + const vector output_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc); vector io_addrs; io_addrs.insert(io_addrs.end(), input_addrs.begin(), input_addrs.end()); io_addrs.insert(io_addrs.end(), output_addrs.begin(), output_addrs.end()); @@ -132,7 +133,13 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin rt_ret = rtMemcpy(kernel_buf_, sizeof(STR_FWK_OP_KERNEL), static_cast(&fwk_op_kernel), sizeof(STR_FWK_OP_KERNEL), RT_MEMCPY_HOST_TO_DEVICE); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy error, ret: Ox%X", rt_ret); return FAILED;) - davinci_model->SetZeroCopyAddr(op_desc, io_addrs, input_output_addr_); + + vector virtual_io_addrs; // use virtual address for zero copy key. 
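
Pausing here on the recurring zero-copy change: the launch args may now hold physical addresses, so the zero-copy table is keyed by the virtual input/output addresses instead, apparently fetched through an extra boolean on GetInputDataAddrs/GetOutputDataAddrs (that signature is an assumption read off these call sites). The key-building step in isolation:

    #include <cstdio>
    #include <vector>

    // Concatenate virtual input and output addresses into one key list, the
    // pattern repeated before each SetZeroCopyAddr call in the hunks here.
    std::vector<void *> BuildZeroCopyKeys(const std::vector<void *> &virtual_in,
                                          const std::vector<void *> &virtual_out) {
      std::vector<void *> keys;
      keys.reserve(virtual_in.size() + virtual_out.size());
      keys.insert(keys.end(), virtual_in.begin(), virtual_in.end());
      keys.insert(keys.end(), virtual_out.begin(), virtual_out.end());
      return keys;
    }

    int main() {
      int a = 0, b = 0;
      auto keys = BuildZeroCopyKeys({&a}, {&b});
      std::printf("keys: %zu\n", keys.size());  // 2
      return 0;
    }
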
+ const vector virtual_in_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc, false); + const vector virtual_out_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc, false); + virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_in_addrs.begin(), virtual_in_addrs.end()); + virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_out_addrs.begin(), virtual_out_addrs.end()); + davinci_model->SetZeroCopyAddr(op_desc, virtual_io_addrs, input_output_addr_); kernel_buf_size_ = sizeof(STR_FWK_OP_KERNEL); davinci_model_ = davinci_model; diff --git a/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.h b/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.h index 9aab55e7..a6419f9f 100644 --- a/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.h +++ b/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.h @@ -25,6 +25,7 @@ class KernelExTaskInfo : public TaskInfo { public: KernelExTaskInfo() : task_id_(0), + stream_id_(0), dump_flag_(RT_KERNEL_DEFAULT), kernel_buf_size_(0), davinci_model_(nullptr), diff --git a/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc index 407efd63..47956cf2 100644 --- a/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc @@ -221,13 +221,13 @@ Status KernelTaskInfo::SuperKernelLaunch() { return RT_FAILED; } // Call the fuse API - skt::SuperKernel *superKernel; + skt::SuperKernel *superKernel = nullptr; if (factory->FuseKernels(skt_kernel_list, skt_arg_list, skt_info_.last_block_dim, superKernel) != SUCCESS) { GELOGE(RT_FAILED, "SuperKernelLaunch: fuse call failed"); return RT_FAILED; } // Launch a super kernel - if (superKernel->Launch(skt_info_.last_stream, true) != SUCCESS) { + if (superKernel->Launch(skt_info_.last_stream, RT_KERNEL_DUMPFLAG) != SUCCESS) { GELOGE(RT_FAILED, "SuperKernelLaunch: launch failed"); return RT_FAILED; } @@ -341,6 +341,7 @@ Status KernelTaskInfo::Distribute() { rtError_t rt_ret = RT_ERROR_NONE; char *skt_enable_env = getenv("SKT_ENABLE"); int64_t env_flag = (skt_enable_env != nullptr) ? strtol(skt_enable_env, nullptr, 10) : 0; + bool call_skt = ((env_flag != 0) || is_l1_fusion_enable_); if (kernel_type_ == cce::ccKernelType::AI_CPU) { // blockDim is reserved parameter, set to 1 rt_ret = rtCpuKernelLaunchWithFlag(reinterpret_cast(so_name_.c_str()), @@ -348,11 +349,10 @@ Status KernelTaskInfo::Distribute() { nullptr, stream_, dump_flag_); } else { /* default: not skt launch */ - bool call_skt = ((env_flag != 0) || is_l1_fusion_enable_); GELOGI( - "KernelTaskInfo Distribute Start, sktenable:%ld taskid:%u sktid:%u last_sktid:%u stubfunc_name:%s " + "KernelTaskInfo Distribute Start, sktenable:%d taskid:%u sktid:%u last_sktid:%u stubfunc_name:%s " "stubfunc:%p blockdim:%u stream:%p", - env_flag, task_id_, skt_id_, skt_info_.last_task_id, stub_func_name_.c_str(), stub_func_, block_dim_, stream_); + call_skt, task_id_, skt_id_, skt_info_.last_task_id, stub_func_name_.c_str(), stub_func_, block_dim_, stream_); // l1 fusion enable and env flag open (kCloseSkt for skt debug) if (call_skt && (env_flag != kCloseSkt)) { GE_RETURN_IF_ERROR(SuperKernelDistribute()); @@ -371,7 +371,7 @@ Status KernelTaskInfo::Distribute() { GELOGI( "KernelTaskInfo Distribute Success. 
sktenable:%d taskid:%d sktid:%d stubfunc_name:%s stubfunc:%p " "blockdim:%d stream:%p", - env_flag, task_id_, skt_id_, stub_func_name_.c_str(), stub_func_, block_dim_, stream_); + call_skt, task_id_, skt_id_, stub_func_name_.c_str(), stub_func_, block_dim_, stream_); return SUCCESS; } @@ -423,12 +423,12 @@ Status KernelTaskInfo::InitTVMTask(DavinciModel *davinci_model, uint16_t offset, stub_func_ = const_cast(bin_file_key); } - const vector input_data_addrs = ModelUtils::GetInputDataAddrs(davinci_model->GetRuntimeParam(), op_desc); - const vector output_data_addrs = ModelUtils::GetOutputDataAddrs(davinci_model->GetRuntimeParam(), op_desc); - const vector workspace_data_addrs = - ModelUtils::GetWorkspaceDataAddrs(davinci_model->GetRuntimeParam(), op_desc); - vector tensor_device_addrs; + const RuntimeParam &rts_param = davinci_model->GetRuntimeParam(); + const vector input_data_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc); + const vector output_data_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc); + const vector workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc); + vector tensor_device_addrs; tensor_device_addrs.insert(tensor_device_addrs.end(), input_data_addrs.begin(), input_data_addrs.end()); tensor_device_addrs.insert(tensor_device_addrs.end(), output_data_addrs.begin(), output_data_addrs.end()); tensor_device_addrs.insert(tensor_device_addrs.end(), workspace_data_addrs.begin(), workspace_data_addrs.end()); @@ -468,7 +468,13 @@ Status KernelTaskInfo::InitTVMTask(DavinciModel *davinci_model, uint16_t offset, reinterpret_cast(reinterpret_cast(args_) + offset + sizeof(void *) * input_data_addrs.size()); } - davinci_model_->SetZeroCopyAddr(op_desc, tensor_device_addrs, static_cast(args_) + offset); + vector virtual_io_addrs; // use virtual address for zero copy key. 
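
The Distribute() hunk hoists the super-kernel decision above the branch so both log lines can report it: the super-kernel path is taken when the SKT_ENABLE environment variable parses non-zero or L1 fusion is enabled. Extracted as a standalone predicate (a sketch, not the GE function):

    #include <cstdio>
    #include <cstdlib>

    // call_skt as computed in KernelTaskInfo::Distribute above: env flag OR'd
    // with the per-model L1 fusion switch.
    bool CallSuperKernel(bool is_l1_fusion_enable) {
      const char *skt_enable_env = std::getenv("SKT_ENABLE");
      long env_flag = (skt_enable_env != nullptr) ? std::strtol(skt_enable_env, nullptr, 10) : 0;
      return (env_flag != 0) || is_l1_fusion_enable;
    }

    int main() {
      std::printf("call_skt=%d\n", CallSuperKernel(false));  // depends on SKT_ENABLE
      return 0;
    }
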
+ const vector virtual_in_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc, false); + const vector virtual_out_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc, false); + virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_in_addrs.begin(), virtual_in_addrs.end()); + virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_out_addrs.begin(), virtual_out_addrs.end()); + davinci_model_->SetZeroCopyAddr(op_desc, virtual_io_addrs, static_cast(args_) + offset); + // update origin l2 data string sm_desc = kernel_def.sm_desc(); char *sm_contrl = nullptr; @@ -516,6 +522,7 @@ Status KernelTaskInfo::InitAICPUCustomTask(const std::mapsecond; + const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam(); const domi::KernelContext &context = kernel_def.context(); const uint32_t kCustomAicpuArgsLen = 5; @@ -534,11 +541,8 @@ Status KernelTaskInfo::InitAICPUCustomTask(const std::map(const_cast(context.args_offset().data())))[i]; } - const std::vector input_data_addrs = - ModelUtils::GetInputDataAddrs(davinci_model_->GetRuntimeParam(), op_desc); - const std::vector output_data_addrs = - ModelUtils::GetOutputDataAddrs(davinci_model_->GetRuntimeParam(), op_desc); - + const std::vector input_data_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc); + const std::vector output_data_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc); Status ret = StoreInputOutputTensor(input_data_addrs, output_data_addrs, ModelUtils::GetInputDescs(op_desc), ModelUtils::GetOutputDescs(op_desc)); @@ -583,15 +587,15 @@ Status KernelTaskInfo::InitAICPUCustomTask(const std::map(args + ctx_.argsOffset[0])) = - reinterpret_cast(custom_info_.input_descs); // arg 0 + reinterpret_cast(reinterpret_cast(custom_info_.input_descs)); // arg 0 *(reinterpret_cast(args + ctx_.argsOffset[1])) = - reinterpret_cast(custom_info_.input_addrs); // arg 1 + reinterpret_cast(reinterpret_cast(custom_info_.input_addrs)); // arg 1 *(reinterpret_cast(args + ctx_.argsOffset[2])) = - reinterpret_cast(custom_info_.output_descs); // arg 2 + reinterpret_cast(reinterpret_cast(custom_info_.output_descs)); // arg 2 *(reinterpret_cast(args + ctx_.argsOffset[3])) = - reinterpret_cast(custom_info_.output_addrs); // arg 3 + reinterpret_cast(reinterpret_cast(custom_info_.output_addrs)); // arg 3 *(reinterpret_cast(args + ctx_.argsOffset[4])) = - reinterpret_cast(custom_info_.attr_handle); // arg 4 + reinterpret_cast(reinterpret_cast(custom_info_.attr_handle)); // arg 4 rt_ret = rtMalloc(&args_, args_size_, RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { @@ -606,8 +610,10 @@ Status KernelTaskInfo::InitAICPUCustomTask(const std::mapSetZeroCopyAddr(op_desc, input_data_addrs, custom_info_.input_addrs); - davinci_model_->SetZeroCopyAddr(op_desc, output_data_addrs, custom_info_.output_addrs); + const vector virtual_in_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc, false); + const vector virtual_out_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc, false); + davinci_model_->SetZeroCopyAddr(op_desc, virtual_in_addrs, custom_info_.input_addrs); + davinci_model_->SetZeroCopyAddr(op_desc, virtual_out_addrs, custom_info_.output_addrs); return SUCCESS; } @@ -714,8 +720,10 @@ Status KernelTaskInfo::InitAicpuTask(const std::map &op_lis } OpDescPtr op_desc = iter->second; - vector input_addrs = ModelUtils::GetInputDataAddrs(davinci_model_->GetRuntimeParam(), op_desc); - vector output_addrs = ModelUtils::GetOutputDataAddrs(davinci_model_->GetRuntimeParam(), op_desc); + const RuntimeParam &rts_param = 
davinci_model_->GetRuntimeParam(); + + vector input_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc); + vector output_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc); vector io_addrs; io_addrs.insert(io_addrs.end(), input_addrs.begin(), input_addrs.end()); io_addrs.insert(io_addrs.end(), output_addrs.begin(), output_addrs.end()); @@ -752,7 +760,13 @@ Status KernelTaskInfo::InitAicpuTask(const std::map &op_lis sizeof(void *) * input_addrs.size()); } - davinci_model_->SetZeroCopyAddr(op_desc, io_addrs, static_cast(args_) + sizeof(aicpu::AicpuParamHead)); + vector virtual_io_addrs; // use virtual address for zero copy key. + const vector virtual_in_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc, false); + const vector virtual_out_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc, false); + virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_in_addrs.begin(), virtual_in_addrs.end()); + virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_out_addrs.begin(), virtual_out_addrs.end()); + davinci_model_->SetZeroCopyAddr(op_desc, virtual_io_addrs, + static_cast(args_) + sizeof(aicpu::AicpuParamHead)); return SUCCESS; } @@ -977,7 +991,7 @@ Status KernelTaskInfo::SetFlowtable(std::string &flowtable, const domi::KernelDe *(reinterpret_cast( args + (reinterpret_cast(const_cast(context.args_offset().data())))[0])) = - reinterpret_cast(flowtable_); + reinterpret_cast(reinterpret_cast(flowtable_)); } return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc new file mode 100644 index 00000000..9c5e4c29 --- /dev/null +++ b/src/ge/graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc @@ -0,0 +1,149 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.h" + +#include "framework/common/debug/ge_log.h" +#include "graph/load/new_model_manager/davinci_model.h" + +namespace ge { +Status MemcpyAddrAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { + GELOGI("MemcpyAddrAsyncTaskInfo Init Start."); + if (davinci_model == nullptr) { + GELOGE(PARAM_INVALID, "davinci_model is null!"); + return PARAM_INVALID; + } + + Status ret = SetStream(task_def.stream_id(), davinci_model->GetStreamList()); + if (ret != SUCCESS) { + return ret; + } + + auto memcpy_async_def = task_def.memcpy_async(); + + uint64_t logic_dst = memcpy_async_def.dst(); + uint64_t logic_src = memcpy_async_def.src(); + + dst_max_ = memcpy_async_def.dst_max(); + + uint64_t update_base_addr = 0; + ret = GetUpdateBaseAddr(davinci_model, logic_src, update_base_addr); + if (ret != SUCCESS) { + return ret; + } + src_ = reinterpret_cast<void *>(update_base_addr + logic_src); + if (src_ == nullptr) { + GELOGE(PARAM_INVALID, "src_ is null!"); + return PARAM_INVALID; + } + + uint64_t mem_base = reinterpret_cast<uintptr_t>(davinci_model->MemBase()); + uint64_t logic_mem_base = davinci_model->GetRtBaseAddr(); + dst_ = reinterpret_cast<void *>(mem_base + (logic_dst - logic_mem_base)); + if (dst_ == nullptr) { + GELOGE(PARAM_INVALID, "dst_ is null!"); + return PARAM_INVALID; + } + + count_ = memcpy_async_def.count(); + kind_ = memcpy_async_def.kind(); + + // malloc args memory: two pointer slots, the staged src then the staged dst + size_t args_size = sizeof(void *); + rtError_t rt_ret = rtMalloc(&args_, args_size * 2, RT_MEMORY_HBM); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + return RT_FAILED; + } + + // copy origin src + GELOGI("src_args:%p, destMax:%zu, src_:%p, count=%zu, kind=%u", args_, args_size, src_, args_size, + RT_MEMCPY_HOST_TO_DEVICE); + rt_ret = rtMemcpy(args_, args_size, &src_, args_size, RT_MEMCPY_HOST_TO_DEVICE); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api for src failed, ret: 0x%X", rt_ret); + return RT_FAILED; + } + + // copy origin dst + GELOGI("dst_args:%p, destMax:%zu, dst_:%p, count=%zu, kind=%u", + reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(args_) + args_size), args_size, dst_, args_size, + RT_MEMCPY_HOST_TO_DEVICE); + rt_ret = rtMemcpy(reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(args_) + args_size), args_size, &dst_, + args_size, RT_MEMCPY_HOST_TO_DEVICE); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api for dst failed, ret: 0x%X", rt_ret); + return RT_FAILED; + } + + GELOGI("InitMemcpyAddrAsyncTaskInfo, logic_src:%p, logic_dst:%p, src:%p, dst:%p, src_args:%p, dst_args:%p", + reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(logic_src)), + reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(logic_dst)), src_, dst_, args_, + reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(args_) + args_size)); + + return SUCCESS; +} + +Status MemcpyAddrAsyncTaskInfo::Distribute() { + GELOGI("MemcpyAddrAsyncTaskInfo Distribute Start."); + GELOGI("Distribute MemcpyAddrAsync, dst_max:%lu, count:%lu, kind:%u.", dst_max_, count_, kind_); + + rtError_t rt_ret = rtMemcpyAsync(reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(args_) + sizeof(void *)), + dst_max_, args_, count_, static_cast<rtMemcpyKind_t>(kind_), stream_); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + return RT_FAILED; + } + + return SUCCESS; +} + +Status MemcpyAddrAsyncTaskInfo::GetUpdateBaseAddr(DavinciModel *davinci_model, uint64_t update_addr, + uint64_t &base_addr) { + GE_CHECK_NOTNULL(davinci_model); + uint64_t data_base_addr =
reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(davinci_model->MemBase())) - davinci_model->GetRtBaseAddr(); + uint64_t weight_base_addr = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(davinci_model->WeightsMemBase())) - + davinci_model->GetRtWeightAddr(); + uint64_t var_base_addr = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(davinci_model->VarMemBase())) - + davinci_model->GetRtVarAddr(); + + uint64_t data_base_addr_start = davinci_model->GetRtBaseAddr(); + uint64_t data_base_addr_end = davinci_model->GetRtBaseAddr() + davinci_model->TotalMemSize(); + uint64_t weight_base_addr_start = davinci_model->GetRtWeightAddr(); + uint64_t weight_base_addr_end = davinci_model->GetRtWeightAddr() + davinci_model->TotalWeightsMemSize(); + uint64_t variable_base_addr_start = davinci_model->GetRtVarAddr(); + uint64_t variable_base_addr_end = davinci_model->GetRtVarAddr() + davinci_model->TotalVarMemSize(); + + if ((data_base_addr_start <= update_addr) && (update_addr <= data_base_addr_end)) { + base_addr = data_base_addr; + GELOGI("The update_addr is data address."); + } else if ((weight_base_addr_start <= update_addr) && (update_addr <= weight_base_addr_end)) { + base_addr = weight_base_addr; + GELOGI("The update_addr is weight address."); + } else if ((variable_base_addr_start <= update_addr) && (update_addr <= variable_base_addr_end)) { + base_addr = var_base_addr; + GELOGI("The update_addr is variable address."); + } else if (update_addr != 0) { + base_addr = 0; + GELOGE(PARAM_INVALID, "The update_addr is abnormal."); + return PARAM_INVALID; + } + return SUCCESS; +} + +REGISTER_TASK_INFO(RT_MODEL_TASK_MEMCPY_ADDR_ASYNC, MemcpyAddrAsyncTaskInfo); +} // namespace ge diff --git a/src/ge/graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.h b/src/ge/graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.h new file mode 100644 index 00000000..9252e43a --- /dev/null +++ b/src/ge/graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.h @@ -0,0 +1,55 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_MEMCPY_ADDR_ASYNC_TASK_INFO_H_ +#define GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_MEMCPY_ADDR_ASYNC_TASK_INFO_H_ +#include "graph/load/new_model_manager/task_info/task_info.h" + +namespace ge { +class MemcpyAddrAsyncTaskInfo : public TaskInfo { + public: + MemcpyAddrAsyncTaskInfo() : dst_(nullptr), dst_max_(0), src_(nullptr), args_(nullptr), count_(0), kind_(0) {} + + ~MemcpyAddrAsyncTaskInfo() override { + src_ = nullptr; + dst_ = nullptr; + + if (args_ != nullptr) { + rtError_t ret = rtFree(args_); + if (ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", ret); + } + } + + args_ = nullptr; + } + + Status Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) override; + + Status Distribute() override; + + private: + Status GetUpdateBaseAddr(DavinciModel *davinci_model, uint64_t update_addr, uint64_t &base_addr); + + void *dst_; + uint64_t dst_max_; + void *src_; + void *args_; + uint64_t count_; + uint32_t kind_; +}; +} // namespace ge +#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_MEMCPY_ADDR_ASYNC_TASK_INFO_H_ diff --git a/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc index cdd9eb37..c783c718 100644 --- a/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc @@ -51,6 +51,9 @@ Status MemcpyAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *da count_ = memcpy_async_def.count(); kind_ = memcpy_async_def.kind(); + GELOGI("MemcpyAsyncTaskInfo Init Success, logic_src:%p, logic_dst:%p, src:%p, dst:%p", + reinterpret_cast(reinterpret_cast(logic_src)), + reinterpret_cast(reinterpret_cast(logic_dst)), src_, dst_); return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/task_info/stream_active_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/stream_active_task_info.cc index 3d73b9cb..21c80c83 100644 --- a/src/ge/graph/load/new_model_manager/task_info/stream_active_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/stream_active_task_info.cc @@ -63,6 +63,8 @@ Status StreamActiveTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *d active_stream_ = davinci_model->GetStreamList()[active_stream_index_list[internal_index]]; active_stream_id_ = stream_active_def.active_stream_id(); + GELOGI("InitStreamActiveTaskInfo Init Success, index:%u, activeStream:%p, activeStreamID:%u.", internal_index, + active_stream_, active_stream_id_); return SUCCESS; } @@ -74,6 +76,8 @@ Status StreamActiveTaskInfo::Distribute() { GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_FAILED; } + + GELOGI("StreamActiveTaskInfo Distribute Success. 
activeStreamID:%p.", active_stream_); return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.cc index c14a0e1f..a54bf012 100644 --- a/src/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.cc @@ -95,6 +95,10 @@ Status StreamSwitchTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *d } data_type_ = static_cast(data_type); } + + GELOGI("InitStreamSwitchTaskInfo Init Success, cond:%d, trueStream:%p, trueStreamID:%u, datatype:%d.", cond_, + true_stream_, true_stream_id_, data_type_); + return SUCCESS; } @@ -105,6 +109,8 @@ Status StreamSwitchTaskInfo::Distribute() { GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_FAILED; } + + GELOGI("StreamSwitchTaskInfo Distribute Success. cond:%d, stream:%p, datatype:%d.", cond_, true_stream_, data_type_); return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc index 38dbd8b3..b8fc77ac 100644 --- a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc +++ b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc @@ -19,17 +19,17 @@ namespace ge { namespace skt { -Status SuperKernel::Launch(rtStream_t stream, bool dump_flag) { +Status SuperKernel::Launch(rtStream_t stream, uint32_t dump_flag) { const void *func_stub_ = this->GetFuncStub(); - const void *args[] = {this->GetNavTablePtr(), (const void *)this->GetNavTableSize()}; + const void *args[] = {this->GetNavTablePtr(), + reinterpret_cast(reinterpret_cast(this->GetNavTableSize()))}; - void *device_args_addr = nullptr; - rtError_t rt_ret = rtMalloc((void **)&(device_args_addr), sizeof(args), RT_MEMORY_HBM); + rtError_t rt_ret = rtMalloc((void **)&(device_args_addr_), sizeof(args), RT_MEMORY_HBM); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc failied. error: 0x%X", rt_ret); return FAILED;) - rt_ret = rtMemcpy((void *)device_args_addr, sizeof(args), (void *)args, sizeof(args), RT_MEMCPY_HOST_TO_DEVICE); + rt_ret = rtMemcpy((void *)device_args_addr_, sizeof(args), (void *)args, sizeof(args), RT_MEMCPY_HOST_TO_DEVICE); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy failied. error: 0x%X", rt_ret); return FAILED;) - rt_ret = rtKernelLaunchWithFlag((void *const)func_stub_, block_dim_, device_args_addr, sizeof(args), NULL, stream, + rt_ret = rtKernelLaunchWithFlag((void *const)func_stub_, block_dim_, device_args_addr_, sizeof(args), NULL, stream, dump_flag); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelLaunchWithFlag failied. 
error: 0x%X", rt_ret); return FAILED;) diff --git a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.h b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.h index b662d97b..1c31acd1 100644 --- a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.h +++ b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.h @@ -25,6 +25,7 @@ namespace ge { namespace skt { class SuperKernel { private: + void *device_args_addr_ = nullptr; const void *func_stub_; void *dev_nav_table_; uint64_t nav_table_size_; @@ -33,8 +34,18 @@ class SuperKernel { public: SuperKernel(const void *stub, void *ptr, uint64_t sz, uint32_t dim) : func_stub_(stub), dev_nav_table_(ptr), nav_table_size_(sz), block_dim_(dim) {} - ~SuperKernel() {} - Status Launch(rtStream_t stream, bool dump_flag); + ~SuperKernel() { + // free memory when all releasing + if (device_args_addr_ != nullptr) { + GE_CHK_RT(rtFree(device_args_addr_)); + GELOGI("SKT: super_kernel args addr free."); + } + if (dev_nav_table_ != nullptr) { + GE_CHK_RT(rtFree(dev_nav_table_)); + GELOGI("SKT: super_kernel args addr free."); + } + } + Status Launch(rtStream_t stream, uint32_t dump_flag); const void *GetFuncStub() const { return func_stub_; } const void *GetNavTablePtr() const { return dev_nav_table_; } uint64_t GetNavTableSize() const { return nav_table_size_; } diff --git a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc index ab3f68f1..63107f5e 100644 --- a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc +++ b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc @@ -30,26 +30,26 @@ Status SuperKernelFactory::Init() { rt_ret = rtGetFunctionByName(this->sk_stub_name_.c_str(), &this->func_stub_); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetFunctionByName " - "failied. stub_func: %s", + "failed. stub_func: %s", this->sk_stub_name_.c_str()); return FAILED;) rt_ret = rtGetAddrByFun(this->func_stub_, &this->func_ptr_); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failied. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failed. error: 0x%X", rt_ret); return FAILED;) if (this->use_physical_address_ != nullptr) { void *skt_func = nullptr; rt_ret = rtKernelConfigTransArg(this->func_ptr_, sizeof(uint64_t), 0, &skt_func); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failied. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failed. 
error: 0x%X", rt_ret); return FAILED;) GELOGD( "SKT: fuseKernels super_kernel_template subFunc %p, device func " "address %p, device physic PC %p", - (uint64_t)this->func_stub_, (uint64_t)this->func_ptr_, (uint64_t)skt_func); + this->func_stub_, this->func_ptr_, skt_func); } else { GELOGD( "SKT: fuseKernels super_kernel_template subFunc %p, device func " "address %p", - (uint64_t)this->func_stub_, (uint64_t)this->func_ptr_); + this->func_stub_, this->func_ptr_); } } is_init_ = true; @@ -94,63 +94,66 @@ Status SuperKernelFactory::FuseKernels(const std::vector &stub_func_list uint64_t nav_table_size = 2 * stub_func_list.size() * sizeof(int64_t); rtError_t rt_ret; + void *hbm_nav_table_addr = nullptr; if (this->use_physical_address_ != nullptr) { for (unsigned i = 0; i < stub_func_list.size(); i++) { void *sub_device_func = nullptr; rt_ret = rtGetAddrByFun(stub_func_list[i], &sub_device_func); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failied. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failed. error: 0x%X", rt_ret); return FAILED;) void *sub_device_func_pys = nullptr; void *args_addr_pys = nullptr; rt_ret = rtKernelConfigTransArg(sub_device_func, sizeof(uint64_t), 0, &sub_device_func_pys); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failied. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failed. error: 0x%X", rt_ret); return FAILED;) rt_ret = rtKernelConfigTransArg(args_addr_list[i], sizeof(uint64_t), 0, &args_addr_pys); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failied. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failed. error: 0x%X", rt_ret); return FAILED;) GELOGD( "SKT: fuseKernels subFunc %p, device func address %p, device " "physic func address %p", - stub_func_list[i], (uint64_t)sub_device_func, (uint64_t)sub_device_func_pys); - nav_table[i * 2] = (uint64_t)sub_device_func_pys / 4; - GELOGD("SKT: CALL offet %p", nav_table[i * 2]); - nav_table[i * 2 + 1] = (uint64_t)args_addr_pys; + stub_func_list[i], sub_device_func, sub_device_func_pys); + // store two uint64_t address + // address divided by 4 because of 32bits encoding, call offset will *4 when calculating + nav_table[i * 2] = reinterpret_cast(reinterpret_cast(sub_device_func_pys)) / 4; + GELOGD("SKT: CALL offset %p", nav_table[i * 2]); + nav_table[i * 2 + 1] = reinterpret_cast(reinterpret_cast(args_addr_pys)); + GELOGD("SKT: fuseKernels args base address %p", nav_table[i * 2 + 1]); } - void *hbm_nav_table_addr = nullptr; void *hbm_nav_table_addr_pys = nullptr; rt_ret = rtMalloc((void **)&hbm_nav_table_addr, nav_table_size, RT_MEMORY_HBM); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc failied. error: 0x%X", rt_ret); return FAILED;) + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc failed. error: 0x%X", rt_ret); return FAILED;) rt_ret = rtMemcpy((void *)hbm_nav_table_addr, nav_table_size, (void *)nav_table, nav_table_size, RT_MEMCPY_HOST_TO_DEVICE); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy failied. error: 0x%X", rt_ret); return FAILED;) + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy failed. 
error: 0x%X", rt_ret); return FAILED;) rt_ret = rtKernelConfigTransArg(hbm_nav_table_addr, sizeof(uint64_t), 0, &hbm_nav_table_addr_pys); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failied. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failed. error: 0x%X", rt_ret); return FAILED;) - GELOGD("SKT: hbm_nav_table_addr %p, hbm_nav_table_addr_pys %p", (uint64_t)hbm_nav_table_addr, - (uint64_t)hbm_nav_table_addr_pys); + GELOGD("SKT: hbm_nav_table_addr %p, hbm_nav_table_addr_pys %p", hbm_nav_table_addr, hbm_nav_table_addr_pys); // Create the necessary metadata for the super kernel h = new SuperKernel(this->func_stub_, hbm_nav_table_addr_pys, nav_table_size, block_dim); } else { for (unsigned i = 0; i < stub_func_list.size(); i++) { void *sub_device_func = nullptr; rt_ret = rtGetAddrByFun(stub_func_list[i], &sub_device_func); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failied. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failed. error: 0x%X", rt_ret); return FAILED;) - GELOGD("SKT: fuseKernels subFunc %p, device func address %p", stub_func_list[i], (uint64_t)sub_device_func); - nav_table[i * 2] = (uint64_t)sub_device_func / 4; + GELOGD("SKT: fuseKernels subFunc %p, device func address %p", stub_func_list[i], sub_device_func); + // store two uint64_t address + // address divided by 4 because of 32bits encoding, call offset will *4 when calculating + nav_table[i * 2] = reinterpret_cast(reinterpret_cast(sub_device_func)) / 4; GELOGD("SKT: CALL offet %p", nav_table[i * 2]); - nav_table[i * 2 + 1] = (uint64_t)args_addr_list[i]; + nav_table[i * 2 + 1] = reinterpret_cast(reinterpret_cast(args_addr_list[i])); GELOGD("SKT: fuseKernels args base address %p", nav_table[i * 2 + 1]); } - void *hbm_nav_table_addr = nullptr; rt_ret = rtMalloc((void **)&hbm_nav_table_addr, nav_table_size, RT_MEMORY_HBM); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc failied. error: 0x%X", rt_ret); return FAILED;) + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc failed. error: 0x%X", rt_ret); return FAILED;) rt_ret = rtMemcpy((void *)hbm_nav_table_addr, nav_table_size, (void *)nav_table, nav_table_size, RT_MEMCPY_HOST_TO_DEVICE); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy failied. error: 0x%X", rt_ret); return FAILED;) + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy failed. 
error: 0x%X", rt_ret); return FAILED;) // Create the necessary metadata for the super kernel h = new SuperKernel(this->func_stub_, hbm_nav_table_addr, nav_table_size, block_dim); } diff --git a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.h b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.h index 7b59d4bf..7ceb5cfa 100644 --- a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.h +++ b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.h @@ -31,12 +31,12 @@ class SuperKernelFactory { const char *use_physical_address_ = getenv("GE_USE_PHYSICAL_ADDRESS"); bool is_init_ = false; SuperKernelFactory(){}; + ~SuperKernelFactory(){}; public: SuperKernelFactory(SuperKernelFactory const &) = delete; void operator=(SuperKernelFactory const &) = delete; static SuperKernelFactory &GetInstance(); - SuperKernelFactory(const std::string &sk_stub_name_, const std::string &bin_file); Status Init(); Status Uninitialize(); Status FuseKernels(const std::vector &stub_func_list, const std::vector &args_addr_list, diff --git a/src/ge/graph/manager/graph_manager.cc b/src/ge/graph/manager/graph_manager.cc index f6fc8389..d4680d94 100644 --- a/src/ge/graph/manager/graph_manager.cc +++ b/src/ge/graph/manager/graph_manager.cc @@ -33,6 +33,7 @@ #include "framework/common/debug/ge_log.h" #include "framework/common/ge_inner_error_codes.h" #include "framework/common/ge_types.h" +#include "graph/manager/util/rt_context_util.h" #include "graph/common/transop_util.h" #include "graph/debug/ge_attr_define.h" #include "graph/ge_context.h" @@ -117,6 +118,7 @@ Status GraphManager::Initialize(const std::map &options) { } graph_map_.clear(); + cache_helper_map_.clear(); init_flag_ = true; thread_run_flag_ = true; @@ -180,6 +182,7 @@ Status GraphManager::Finalize() { } } graph_map_.clear(); + cache_helper_map_.clear(); // graph context if (graph_context_ != nullptr) { @@ -426,6 +429,13 @@ Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vectorSetSubGraph(merged_compute_graph); // set subgraphlist to graphnode graph_node->SetSubGraph(sub_graph_list); + // when set incre build, save om model and var manager + auto save_ret = SaveCacheAfterBuild(graph_node->GetGraphId(), merged_compute_graph, ge_model); + if (save_ret != SUCCESS) { + GELOGW("Fail to save cache."); + } + // release rts generate context + RtContextUtil::GetInstance().DestroyrtContexts(); GE_TIMESTAMP_END(PreRun, "GraphManager::PreRun"); GEEVENT("[GEPERFTRACE] GE PreRun End"); return ret; @@ -444,10 +454,14 @@ Status GraphManager::StartForRunGraph(const GraphNodePtr &graph_node, const std: return PARAM_INVALID; } GeModelPtr ge_model = nullptr; - ret = PreRun(graph_node, inputs, ge_models, ge_model, session_id); + // check need incre build. 
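
The incremental-build wiring above is worth restating as a contract: StartForRunGraph now consults IncreBuild first and only falls back to PreRun when it fails, while IncreBuild itself loads the model from the om cache on a hit and saves cache bookkeeping on a miss. A condensed sketch of that control flow, with simplified stand-in signatures (the real functions take graph nodes and cache helpers):

    #include <cstdio>

    enum Status { SUCCESS = 0, FAILED = 1 };
    using Step = Status (*)();

    // Cache-first build: serve from cache on a clean hit, otherwise record
    // cache info and report FAILED so the caller runs the full build.
    Status TryIncreBuild(bool cache_hit, Step load_from_cache, Step save_before_build) {
      if (cache_hit) {
        if (load_from_cache() == SUCCESS) {
          return SUCCESS;  // model and var manager recovered from the om cache
        }
        std::fprintf(stderr, "load from cache failed, abandon\n");
      }
      if (save_before_build() != SUCCESS) {  // best effort: failure only warns
        std::fprintf(stderr, "save cache before build failed\n");
      }
      return FAILED;  // caller falls back to the full PreRun path
    }

    static Status AlwaysOk() { return SUCCESS; }

    int main() {
      std::printf("hit: %d, miss: %d\n", TryIncreBuild(true, AlwaysOk, AlwaysOk),
                  TryIncreBuild(false, AlwaysOk, AlwaysOk));  // hit: 0, miss: 1
      return 0;
    }
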
+ ret = IncreBuild(graph_node, ge_model); if (ret != SUCCESS) { - GELOGE(ret, "PreRun Failed."); - return ret; + ret = PreRun(graph_node, inputs, ge_models, ge_model, session_id); + if (ret != SUCCESS) { + GELOGE(ret, "PreRun Failed."); + return ret; + } } ret = LoadGraph(ge_model, graph_node); if (ret != SUCCESS) { @@ -492,6 +506,90 @@ Status GraphManager::LoadGraph(const GeModelPtr &ge_model, const GraphNodePtr &g return SUCCESS; } +Status GraphManager::LoadFromCache(const GraphNodePtr &graph_node, const ModelCacheHelperPtr &cache_helper, + GeModelPtr &ge_model) { + auto graph_id = graph_node->GetGraphId(); + auto ret = cache_helper->LoadOmModelFromCache(ge_model); + if (ret != SUCCESS) { + GELOGW("Fail to load om model from cache."); + if (cache_helper->ClearCache(graph_id) != SUCCESS) { + GELOGW("Fail to clear cache of graph %u.", graph_id); + } + return FAILED; + } + ret = cache_helper->RecoverVarManagerFromCache(); + if (ret != SUCCESS) { + GELOGW("Fail to recover VarManager from cache."); + if (cache_helper->ClearCache(graph_id) != SUCCESS) { + GELOGW("Fail to clear cache of graph %u.", graph_id); + } + return FAILED; + } + ComputeGraphPtr compute_graph_in_model = GraphUtils::GetComputeGraph(ge_model->GetGraph()); + if (compute_graph_in_model == nullptr) { + GELOGW("Error occurred when get compute graph from om, abandon."); + return FAILED; + } else { + graph_node->SetComputeGraph(compute_graph_in_model); + graph_node->SetGeModel(ge_model); + GELOGI("Load model and graph form cache om file."); + } + return SUCCESS; +} + +Status GraphManager::SaveCacheBeforeBuild(uint32_t graph_id, const ModelCacheHelperPtr &cache_helper) { + auto ret = cache_helper->SaveCacheInfoToCache(); + if (ret != SUCCESS) { + GELOGW("Fail to save cache info of graph[%d] to cache.", graph_id); + return FAILED; + } + ret = cache_helper->SaveVarManagerToCache(true); + if (ret != SUCCESS) { + GELOGW("Fail to save var manager to cache."); + cache_helper->ClearCache(graph_id); + return FAILED; + } + GELOGI("Cache files have been saved."); + return SUCCESS; +} + +Status GraphManager::SaveCacheAfterBuild(uint32_t graph_id, ge::ComputeGraphPtr graph, GeModelPtr &ge_model) { + std::shared_ptr instance_ptr = ge::GELib::GetInstance(); + if ((instance_ptr == nullptr) || !instance_ptr->InitFlag()) { + GELOGW("GELib not initialized."); + return FAILED; + } + + if (instance_ptr->IsIncreBuild()) { + auto iter = cache_helper_map_.find(graph_id); + if (iter == cache_helper_map_.end()) { + GELOGW("Can not find ModelCacheHelper of graph[%u]", graph_id); + return FAILED; + } else { + ModelCacheHelperPtr cache_helper = iter->second; + auto ret = cache_helper->RefreshComputeGraph(graph); + if (ret != SUCCESS) { + cache_helper->ClearCache(graph_id); + GELOGW("Fail to refresh cache helper's compute graph"); + return FAILED; + } + ret = cache_helper->SaveVarManagerToCache(false); + if (ret != SUCCESS) { + cache_helper->ClearCache(graph_id); + GELOGW("Fail to save VarManager to cache"); + return FAILED; + } + ret = cache_helper->SaveOmModelToCache(ge_model); + if (ret != SUCCESS) { + cache_helper->ClearCache(graph_id); + GELOGW("Fail to save om model to cache"); + return FAILED; + } + } + } + return SUCCESS; +} + Status GraphManager::InnerRunGraph(GraphNodePtr &graph_node, const GraphId &graph_id, const std::vector &inputs, std::vector &outputs) { Status ret = graph_executor_.SetCondition(&sync_run_mutex_, &condition_, graph_run_listener_); @@ -551,6 +649,9 @@ Status GraphManager::RunGraph(const GraphId &graph_id, const std::vector 
ge_models; if (options_.local_fmk_op_flag) { @@ -583,7 +684,7 @@ Status GraphManager::RunGraph(const GraphId &graph_id, const std::vectorGetSubGraph(); if (IsCheckpointGraph(checkPointGraph)) { - ret = CheckpointHandle(graph_id, outputs); + ret = CheckpointHandle(graph_id, checkPointGraph, outputs); if (ret != SUCCESS) { GELOGE(ret, "[RunGraph] CheckpointHandle failed!"); } @@ -667,6 +768,15 @@ Status GraphManager::SaveParams(ge::GeModel &model, const std::string &type, con return SUCCESS; } +void GraphManager::RemoveModelCacheHelper(const GraphId &graph_id) { + auto iter = cache_helper_map_.find(graph_id); + if (iter != cache_helper_map_.end()) { + cache_helper_map_.erase(iter); + } else { + GELOGW("[GraphManager] cache helper does not exist, graph_id = %u", graph_id); + } +} + Status GraphManager::RemoveGraph(const GraphId &graph_id) { auto it = graph_map_.find(graph_id); if (it == graph_map_.end()) { @@ -716,6 +826,9 @@ Status GraphManager::RemoveGraph(const GraphId &graph_id) { } var_acc_ctrl_.RemoveGraph(graph_id); graph_map_.erase(it); + + RemoveModelCacheHelper(graph_id); + auto ge_model = graph_node->GetGeModel(); if (ge_model != nullptr) { GELOGI("Unload model %u.", ge_model->GetModelId()); @@ -1106,21 +1219,15 @@ Status GraphManager::SummaryHandle(const GraphId &graph_id, std::vector &outputs) { +Status GraphManager::CheckpointHandle(const GraphId &graph_id, const ComputeGraphPtr &compute_graph, + const std::vector<GeTensor> &outputs) { GELOGI("[GraphManager] CheckpointHandle, outputsSize=%zu.", outputs.size()); std::vector<InputOutputDescInfo> outputs_desc = graph_executor_.GetOutputsDesc(); GELOGI("[GraphManager] CheckpointHandle, outputsDescSize=%zu.", outputs_desc.size()); - // find graph - GraphNodePtr graph_node = nullptr; - Status ret = GetGraphNode(graph_id, graph_node); - if (ret != SUCCESS) { - GELOGE(ret, "[CheckpointHandle] graph not exist, graph_id = %u.", graph_id); - return ret; - } - ComputeGraphPtr compute_graph_ptr = GraphUtils::GetComputeGraph(*(graph_node->GetGraph())); + std::map<string, Tensor> save_results; NodePtr netoutput = nullptr; - for (const auto &node : compute_graph_ptr->GetDirectNode()) { + for (const auto &node : compute_graph->GetDirectNode()) { if (node->GetType() == kNetOutput) { netoutput = node; break; @@ -1248,6 +1355,8 @@ bool GraphManager::CheckTransOpForCheckpointGraph(NodePtr &node) { return true; } +static inline bool CheckConstantOpForCheckpointGraph(NodePtr &node) { return node->GetOutDataNodes().empty(); } + bool GraphManager::IsCheckpointGraph(ComputeGraphPtr &compute_graph) { if (compute_graph == nullptr) { GELOGE(GE_GRAPH_PARAM_NULLPTR, "[IsCheckpointGraph] computeGraph is nullptr."); @@ -1268,6 +1377,10 @@ bool GraphManager::IsCheckpointGraph(ComputeGraphPtr &compute_graph) { if (!CheckTransOpForCheckpointGraph(node)) { return false; } + } else if (op->GetType() == CONSTANTOP) { + if (!CheckConstantOpForCheckpointGraph(node)) { + return false; + } } else if (op->GetType() != kSend && op->GetType() != kRecv) { GELOGI("this node is not allowed in checkpoint sub graph, node_type: %s, node_name: %s.", op->GetType().c_str(), op->GetName().c_str()); @@ -1439,8 +1552,6 @@ Status GraphManager::OptimizeAfterMergeSubGraph(ge::ComputeGraphPtr &compute_gra names_to_passes.emplace_back("ReshapeRemovePass", &trans_op_nearby_allreduce_fusion_pass); ReshapeRemovePass reshape_remove_pass; names_to_passes.emplace_back("ReshapeRemovePass", &reshape_remove_pass); - ReplaceWithEmptyConstPass replace_with_empty_const_pass; - names_to_passes.emplace_back("ReplaceWithEmptyConstPass", 
&replace_with_empty_const_pass); ConstantFoldingPass constant_folding_pass; names_to_passes.emplace_back("ConstantFoldingPass", &constant_folding_pass); DimensionAdjustPass dimension_adjust_pass; @@ -1632,6 +1743,51 @@ Status GraphManager::RunGraphAsync(const GraphId &graph_id, const std::vector instance_ptr = ge::GELib::GetInstance(); + if (instance_ptr != nullptr && instance_ptr->IsIncreBuild()) { + auto iter = cache_helper_map_.find(graph_id); + if (iter == cache_helper_map_.end()) { + ModelCacheHelperPtr cache_helper = MakeShared<ModelCacheHelper>(session_id, graph_id, compute_graph); + if (cache_helper != nullptr) { + cache_helper_map_.emplace(std::make_pair(graph_id, cache_helper)); + } else { + GELOGW("Cache helper make shared failed, graph_id = %u.", graph_id); + } + } + } +} + +Status GraphManager::IncreBuild(const GraphNodePtr &graph_node, GeModelPtr &ge_model) { + std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance(); + if (instance_ptr == nullptr || !instance_ptr->IsIncreBuild()) { + return FAILED; + } + const uint32_t graph_id = graph_node->GetGraphId(); + auto iter = cache_helper_map_.find(graph_id); + if (iter == cache_helper_map_.end()) { + GELOGW("Can not find ModelCacheHelper of graph[%u]", graph_id); + return FAILED; + } + ModelCacheHelperPtr cache_helper = iter->second; + if (cache_helper->IsModelCacheHit()) { + GEEVENT("Model cache hit."); + Status ret = LoadFromCache(graph_node, cache_helper, ge_model); + if (ret == SUCCESS) { + return SUCCESS; + } else { + GELOGW("Error occurred when loading from cache, abandon."); + } + } else { + GEEVENT("Model cache miss."); + } + if (SaveCacheBeforeBuild(graph_node->GetGraphId(), cache_helper) != SUCCESS) { + GELOGW("Error occurred when saving cache."); + } + return FAILED; +} + void GraphManager::PreRunThread(GraphManager *graph_manager) { if (prctl(PR_SET_NAME, ("GE_PreRun")) != 0) { GELOGW("Set thread name failed."); @@ -1685,6 +1841,8 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { return; } } + // when incremental build is enabled, create and register the cache helper + graph_manager->AddModelCacheHelperToMap(args.graph_id, args.session_id, compute_graph_tmp); std::vector<GeModelPtr> ge_models; @@ -1707,12 +1865,15 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { return; } - ret = graph_manager->PreRun(graph_node, ge_inputs, ge_models, ge_model, args.session_id); - if (ret != SUCCESS) { - graph_node->SetRunFlag(false); - ReturnError(graph_manager, args.callback, ret, "PreRun failed, thread exit."); - graph_node->Unlock(); - return; + // check whether the model can be restored from the incremental build cache
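+ // Same fallback as in StartForRunGraph above: only when IncreBuild cannot restore the model from cache does the thread fall through to a full PreRun.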
+ if (graph_manager->IncreBuild(graph_node, ge_model) != SUCCESS) { + ret = graph_manager->PreRun(graph_node, ge_inputs, ge_models, ge_model, args.session_id); + if (ret != SUCCESS) { + graph_node->SetRunFlag(false); + ReturnError(graph_manager, args.callback, ret, "PreRun failed, thread exit."); + graph_node->Unlock(); + return; + } } graph_node->SetBuildFlag(true); graph_manager->var_acc_ctrl_.SetGraphBuildEnd(graph_node->GetGraphId()); diff --git a/src/ge/graph/manager/graph_manager.h b/src/ge/graph/manager/graph_manager.h index 5a296b91..92ea48c5 100644 --- a/src/ge/graph/manager/graph_manager.h +++ b/src/ge/graph/manager/graph_manager.h @@ -27,6 +27,7 @@ #include "common/blocking_queue.h" #include "common/ge_inner_error_codes.h" +#include "common/helper/model_cache_helper.h" #include "external/graph/types.h" #include "ge/ge_api_types.h" #include "graph/build/graph_builder.h" @@ -211,7 +212,8 @@ class GraphManager { Status SummaryHandle(const GraphId &graph_id, std::vector<GeTensor> &outputs); - Status CheckpointHandle(const GraphId &graph_id, const std::vector &outputs); + Status CheckpointHandle(const GraphId &graph_id, const ComputeGraphPtr &compute_graph, + const std::vector<GeTensor> &outputs); // call the callback function of ME to push summary result data to ME Status PushSummaryData2ME(const GraphId &graph_id, const std::map<std::string, ge::Tensor> &summary_data); @@ -260,6 +262,13 @@ class GraphManager { bool IsGraphNeedBuild(const GraphNodePtr &graph_node); + Status LoadFromCache(const GraphNodePtr &graph_node, const ModelCacheHelperPtr &cache_helper, GeModelPtr &ge_model); + Status SaveCacheBeforeBuild(uint32_t graph_id, const ModelCacheHelperPtr &cache_helper); + Status SaveCacheAfterBuild(uint32_t graph_id, ComputeGraphPtr graph, GeModelPtr &ge_model); + void AddModelCacheHelperToMap(const GraphId &graph_id, uint64_t session_id, ComputeGraphPtr &compute_graph); + Status IncreBuild(const GraphNodePtr &graph_node, GeModelPtr &ge_model); + void RemoveModelCacheHelper(const GraphId &graph_id); + static void PreRunThread(GraphManager *graph_manager); static void RunThread(GraphManager *graph_manager); static void StopQueue(GraphManager *graph_manager); @@ -274,6 +283,8 @@ class GraphManager { std::map<GraphId, GraphNodePtr> graph_map_; + std::map<GraphId, ModelCacheHelperPtr> cache_helper_map_; + // for run graph synchronous return std::mutex sync_run_mutex_; std::condition_variable condition_; diff --git a/src/ge/graph/manager/graph_var_manager.cc b/src/ge/graph/manager/graph_var_manager.cc index d5ffbd03..f40ca7ce 100644 --- a/src/ge/graph/manager/graph_var_manager.cc +++ b/src/ge/graph/manager/graph_var_manager.cc @@ -64,6 +64,10 @@ ge::Status VarResource::GetVarAddr(const std::string &var_name, const ge::GeTens return SUCCESS; } +void VarResource::GetAllVarAddrMgr(std::unordered_map<std::string, VarAddrMgr> &var_addr_mgr_map) { + var_addr_mgr_map = var_addr_mgr_map_; +} + void VarResource::SetVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t *dev_ptr, rtMemType_t memory_type) { std::string var_key = VarKey(var_name, tensor_desc); @@ -170,6 +174,14 @@ void VarResource::SaveBroadCastInfo(uint32_t graph_id, const VarBroadCastInfo &b var_broad_cast_info_[graph_id][broad_cast_info.var_name] = broad_cast_info; } +ge::Status VarResource::GetBroadCastInfo(uint32_t graph_id, const string &var_name, VarBroadCastInfo &broad_cast_info) { + if (var_broad_cast_info_.count(graph_id) == 0 || var_broad_cast_info_[graph_id].count(var_name) == 0) { + return FAILED; + } + broad_cast_info = var_broad_cast_info_[graph_id][var_name]; + return SUCCESS; +} + ge::Status 
VarResource::SyncVarData2BroadCast(uint32_t graph_id, const std::string &var_name, const ge::ConstOpDescPtr &var_op_desc, uint8_t *base_ptr) { if (var_op_desc == nullptr) { @@ -282,11 +294,17 @@ Status MemResource::AssignVarMem(const std::string &var_name, uint64_t size, uin // align 512 BYTE var_mem_size_ = var_mem_size_ + kSessionMemAlignSize; + GELOGI( + "[IMAS]AssignVarMem Set session_%lu name[%s] output[%d] " + "offset to [%zu] size[%lu] realsize[%lu].", + session_id, var_name.c_str(), 0, mem_offset, (var_mem_size_ - mem_offset), real_size); return SUCCESS; } int64_t MemResource::GetVarMemSize() const { return var_mem_size_; } +void MemResource::UpdateVarMemSize(int64_t mem_size) { var_mem_size_ = mem_size; } + VarManager::VarManager(uint64_t session_id) : version_(SessionVersion::OTHER_VERSION), session_id_(session_id), @@ -363,6 +381,21 @@ ge::Status VarManager::SetVarAddr(const std::string &var_name, const ge::GeTenso return ge::SUCCESS; } +ge::Status VarManager::SaveVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t *address, + rtMemType_t memory_type) { + GELOGI("VarManager::SaveVarAddr var_name = %s, data_type = %s, data_format = %s.", var_name.c_str(), + ge::TypeUtils::DataTypeToSerialString(tensor_desc.GetDataType()).c_str(), + ge::TypeUtils::FormatToSerialString(tensor_desc.GetFormat()).c_str()); + + std::lock_guard lock(mutex_); + if (var_resource_ == nullptr) { + GELOGW("VarManager has not been init."); + return ge::INTERNAL_ERROR; + } + var_resource_->SaveVarAddr(var_name, tensor_desc, address, memory_type); + return ge::SUCCESS; +} + ge::Status VarManager::GetVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t **dev_ptr, rtMemType_t &memory_type) { std::lock_guard lock(mutex_); @@ -388,6 +421,10 @@ ge::Status VarManager::GetVarAddr(const std::string &var_name, const ge::GeTenso return GetVarAddr(var_name, tensor_desc, dev_ptr, memory_type); } +void VarManager::GetAllVarAddrMgr(std::unordered_map<std::string, VarAddrMgr> &var_addr_mgr_map) { + var_resource_->GetAllVarAddrMgr(var_addr_mgr_map); +} + int64_t VarManager::GetVarMemSize(rtMemType_t memory_type) { std::lock_guard lock(mutex_); MemResource *mem_resource = nullptr; @@ -405,14 +442,36 @@ int64_t VarManager::GetVarMemSize(rtMemType_t memory_type) { return mem_resource->GetVarMemSize(); } +Status VarManager::UpdateVarMemSize(rtMemType_t memory_type, int64_t mem_size) { + std::lock_guard lock(mutex_); + MemResource *mem_resource = nullptr; + auto iter = mem_resource_map_.find(memory_type); + if (iter == mem_resource_map_.end()) { + mem_resource = new (std::nothrow) MemResource(); + if (mem_resource == nullptr) { + GELOGE(ge::INTERNAL_ERROR, "Alloc MemResource failed, memory_type = %u.", memory_type); + return ge::INTERNAL_ERROR; + } else { + mem_resource_map_[memory_type] = mem_resource; + } + } else { + mem_resource = iter->second; + } + + if (mem_resource == nullptr) { + GELOGE(ge::INTERNAL_ERROR, "MemResource is invalid."); + return FAILED; + } + mem_resource->UpdateVarMemSize(mem_size); + return SUCCESS; +} + ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, rtMemType_t memory_type) { std::lock_guard lock(mutex_); - GELOGI( - "VarManager::AssignVarMem var_name = %s, data_type = %s, data_format = " - "%s.", - var_name.c_str(), ge::TypeUtils::DataTypeToSerialString(tensor_desc.GetDataType()).c_str(), - ge::TypeUtils::FormatToSerialString(tensor_desc.GetFormat()).c_str()); + GELOGI("VarManager::AssignVarMem var_name = %s, 
data_type = %s, data_format = %s.", var_name.c_str(), + ge::TypeUtils::DataTypeToSerialString(tensor_desc.GetDataType()).c_str(), + ge::TypeUtils::FormatToSerialString(tensor_desc.GetFormat()).c_str()); int64_t tensor_desc_size = 0; size_t mem_offset = 0; @@ -475,14 +534,13 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen if (cur_tensor_desc.GetFormat() != tensor_desc.GetFormat() || cur_tensor_desc.GetDataType() != tensor_desc.GetDataType() || cur_tensor_desc.GetShape().GetDims() != tensor_desc.GetShape().GetDims()) { - GELOGI( - "var %s assigned new memory (format, data type, shape) (%s, %s, " - "%zu) from (%s, %s, %zu)", - var_name.c_str(), ge::TypeUtils::DataTypeToSerialString(tensor_desc.GetDataType()).c_str(), - ge::TypeUtils::FormatToSerialString(tensor_desc.GetFormat()).c_str(), tensor_desc.GetShape().GetDims().size(), - ge::TypeUtils::DataTypeToSerialString(cur_tensor_desc.GetDataType()).c_str(), - ge::TypeUtils::FormatToSerialString(cur_tensor_desc.GetFormat()).c_str(), - cur_tensor_desc.GetShape().GetDims().size()); + GELOGI("var %s assigned new memory (format, data type, shape) (%s, %s, %zu) from (%s, %s, %zu)", var_name.c_str(), + ge::TypeUtils::DataTypeToSerialString(tensor_desc.GetDataType()).c_str(), + ge::TypeUtils::FormatToSerialString(tensor_desc.GetFormat()).c_str(), + tensor_desc.GetShape().GetDims().size(), + ge::TypeUtils::DataTypeToSerialString(cur_tensor_desc.GetDataType()).c_str(), + ge::TypeUtils::FormatToSerialString(cur_tensor_desc.GetFormat()).c_str(), + cur_tensor_desc.GetShape().GetDims().size()); var_resource_->SetVarAddr(var_name, tensor_desc, reinterpret_cast(reinterpret_cast(mem_offset)), memory_type); } @@ -550,6 +608,16 @@ ge::Status VarManager::SaveBroadCastInfo(uint32_t graph_id, const VarBroadCastIn return SUCCESS; } +ge::Status VarManager::GetBroadCastInfo(uint32_t graph_id, const string &var_name, VarBroadCastInfo &broad_cast_info) { + std::lock_guard lock(mutex_); + + if (var_resource_ == nullptr) { + GELOGW("VarManager has not been init."); + return ge::INTERNAL_ERROR; + } + return var_resource_->GetBroadCastInfo(graph_id, var_name, broad_cast_info); +} + ge::Status VarManager::RenewCurVarDesc(const std::string &var_name, ge::OpDescPtr op_desc) { std::lock_guard lock(mutex_); GELOGD("VarManager::RenewCurVarDesc var_name = %s.", var_name.c_str()); @@ -672,6 +740,7 @@ Status VarManager::SetMemoryMallocSize(const map &options) { GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "Parse graph memory manager malloc max size failed."); return ge::GE_GRAPH_OPTIONS_INVALID; } + GELOGI("The max size for graph mem is set to %zu", graph_mem_max_size_); } it = options.find(VARIABLE_MEMORY_MAX_SIZE); diff --git a/src/ge/graph/manager/graph_var_manager.h b/src/ge/graph/manager/graph_var_manager.h index a2b974e4..8b551e06 100644 --- a/src/ge/graph/manager/graph_var_manager.h +++ b/src/ge/graph/manager/graph_var_manager.h @@ -101,6 +101,8 @@ class VarResource { ge::Status GetVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t **dev_ptr, rtMemType_t &memory_type); + void GetAllVarAddrMgr(std::unordered_map &var_addr_mgr_map); + void SetVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t *dev_ptr, rtMemType_t rtMemType_t); @@ -113,6 +115,8 @@ class VarResource { void SaveBroadCastInfo(uint32_t graph_id, const VarBroadCastInfo &broad_cast_info); + ge::Status GetBroadCastInfo(uint32_t graph_id, const string &var_name, VarBroadCastInfo &broad_cast_info); + ge::Status 
SyncVarData2BroadCast(uint32_t graph_id, const std::string &var_name, const ge::ConstOpDescPtr &var_op_desc, uint8_t *base_ptr); @@ -175,6 +179,8 @@ class MemResource { int64_t GetVarMemSize() const; + void UpdateVarMemSize(int64_t mem_size); + private: uint64_t total_size_; uint64_t var_mem_size_; @@ -196,9 +202,14 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY VarManager { ge::Status SetVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t *dev_ptr, rtMemType_t memory_type); + ge::Status SaveVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t *address, + rtMemType_t memory_type); + ge::Status GetVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t **dev_ptr, rtMemType_t &memory_type); + void GetAllVarAddrMgr(std::unordered_map &var_addr_mgr_map); + ge::Status GetVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t **dev_ptr); ge::Status SyncVarData(uint32_t graph_id, const std::string &var_name, ge::ConstOpDescPtr var_op_desc, @@ -206,6 +217,8 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY VarManager { ge::Status SaveBroadCastInfo(uint32_t graph_id, const VarBroadCastInfo &broad_cast_info); + ge::Status GetBroadCastInfo(uint32_t graph_id, const string &var_name, VarBroadCastInfo &broad_cast_info); + ge::Status SyncBroadCastData2Var(uint32_t graph_id, const std::string &var_name, ge::ConstOpDescPtr var_op_desc, uint8_t *base_ptr); @@ -251,6 +264,8 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY VarManager { int64_t GetVarMemSize(rtMemType_t memory_type); + Status UpdateVarMemSize(rtMemType_t memory_type, int64_t mem_size); + bool IsVarExist(const std::string &var_name, const ge::GeTensorDesc &tensor_desc); bool IsVarExist(const std::string &var_name); diff --git a/src/ge/graph/partition/graph_partition.cc b/src/ge/graph/partition/graph_partition.cc index feced331..bc8c9b9b 100644 --- a/src/ge/graph/partition/graph_partition.cc +++ b/src/ge/graph/partition/graph_partition.cc @@ -238,6 +238,14 @@ Status ge::GraphPartitioner::MergeSubGraph(ge::ComputeGraphPtr &output_merged_co return FAILED; } GE_TIMESTAMP_END(MergeGraphTopologicalSorting, "GraphPartitioner::MergeGraphTopologicalSorting"); + // flush all nodes' engine of merged graph + GE_TIMESTAMP_START(MergeGraphEnginePlacerRun); + graph_info_.engine_placer_.SetComputeGraph(output_merged_compute_graph); + if (graph_info_.engine_placer_.Run() != SUCCESS) { + GELOGE(GE_GRAPH_INIT_FAILED, "[GraphPartitioner]: engine_placer run failed"); + return FAILED; + } + GE_TIMESTAMP_END(MergeGraphEnginePlacerRun, "GraphPartitioner::MergeGraphEnginePlacerRun"); GELOGI("Graph merge ends."); return SUCCESS; } diff --git a/src/ge/graph/passes/atomic_addr_clean_pass.cc b/src/ge/graph/passes/atomic_addr_clean_pass.cc index 6c312efa..e95f0680 100644 --- a/src/ge/graph/passes/atomic_addr_clean_pass.cc +++ b/src/ge/graph/passes/atomic_addr_clean_pass.cc @@ -200,7 +200,18 @@ bool AtomicAddrCleanPass::IsAtomicOp(const NodePtr &node) { vector op_info_vec = ops_kernel_manager.GetOpsKernelInfo(op_desc->GetType()); for (const auto &op_info : op_info_vec) { if (op_info.isAtomic) { - GELOGI("Recognized atomic op %s from HCCL engine.", op_desc->GetName().c_str()); + GELOGI("Recognized atomic op %s from DNN_HCCL engine.", op_desc->GetName().c_str()); + // check peer input is DATA + for (auto &in_data_anchor : node->GetAllInDataAnchors()) { + if (in_data_anchor->GetPeerOutAnchor() != nullptr && + 
in_data_anchor->GetPeerOutAnchor()->GetOwnerNode() != nullptr) { + auto peer_in_node = in_data_anchor->GetPeerOutAnchor()->GetOwnerNode(); + if (peer_in_node->GetType() == DATA) { + GELOGI("Recognized atomic op %s from DNN_HCCL engine and input is DATA.", op_desc->GetName().c_str()); + return false; + } + } + } hcom_node_vec_.push_back(node); return true; } diff --git a/src/ge/graph/passes/folding_kernel/cast_kernel.cc b/src/ge/graph/passes/folding_kernel/cast_kernel.cc index bcd26f70..99944c20 100644 --- a/src/ge/graph/passes/folding_kernel/cast_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/cast_kernel.cc @@ -49,9 +49,11 @@ Status CastKernel::Compute(const OpDescPtr op_desc_ptr, const std::vectorGetData().data(); - if (op_desc_ptr == nullptr || src_data == nullptr) { - GELOGE(PARAM_INVALID, "Parameter's invalid, Input opDescPtr or src_data is nullptr."); + // src_data == nullptr is supported + if (op_desc_ptr == nullptr) { + GELOGE(PARAM_INVALID, "Parameter's invalid, Input opDescPtr is nullptr."); return PARAM_INVALID; } GeTensorDesc op_desc = op_desc_ptr->GetOutputDesc(0); @@ -73,7 +75,7 @@ Status CastKernel::Compute(const OpDescPtr op_desc_ptr, const std::vectorGetData().GetSize()); + // const_weight_ptr->GetData().GetSize() == 0 is supported auto src_data_size = src_shape.GetShapeSize(); if (src_data_size == 0 && static_cast(const_weight_ptr->GetData().GetSize()) == GetSizeByDataType(src_data_type)) { @@ -113,7 +115,6 @@ Status CastKernel::Compute(const OpDescPtr op_desc_ptr, const std::vectorSetData(trans_result.data.get(), trans_result.length) != SUCCESS) { GELOGW("Compute: SetData failed"); - return FAILED; } v_output.push_back(output_ptr); return SUCCESS; diff --git a/src/ge/graph/passes/folding_kernel/kernel_utils.cc b/src/ge/graph/passes/folding_kernel/kernel_utils.cc index 9448b232..2002643a 100644 --- a/src/ge/graph/passes/folding_kernel/kernel_utils.cc +++ b/src/ge/graph/passes/folding_kernel/kernel_utils.cc @@ -113,12 +113,26 @@ bool KernelUtils::CheckSizeForTransOp(const ge::ConstGeTensorPtr &const_weight_p GELOGI("Const real value Size:%zu, op_desc Shape Size:%ld, data_type:%s.", data_size, cal_size, TypeUtils::DataTypeToSerialString(data_type).c_str()); - if ((shape_size != 0) || (length != 0 && (data_size / static_cast(length) != 1))) { - if (!(data_size == static_cast(cal_size) && data_size != 0)) { + if (shape_size != 0) { + // Standard tensor + if (data_size != static_cast(cal_size) || data_size == 0) { + GELOGW("Const input data size is not equal with tensor desc shape"); + return false; + } + } else if (data_shape.GetDimNum() != 0) { + // Empty tensor, has zero in shape vector + if (data_size != 0) { + GELOGW("Const input data size is not equal with tensor desc shape"); + return false; + } + } else { + // Scalar tensor, has only one element in tensor + if (length != 0 && (data_size / static_cast(length) != 1)) { GELOGW("Const input data size is not equal with tensor desc shape"); return false; } } + return true; } diff --git a/src/ge/graph/passes/folding_kernel/kernel_utils.h b/src/ge/graph/passes/folding_kernel/kernel_utils.h index 9eadf4ca..17b645aa 100644 --- a/src/ge/graph/passes/folding_kernel/kernel_utils.h +++ b/src/ge/graph/passes/folding_kernel/kernel_utils.h @@ -29,6 +29,7 @@ namespace ge { class KernelUtils { public: KernelUtils() = delete; + ~KernelUtils() = delete; static Status CheckDimensionNodeInfo(const NodePtr &node_ptr); static bool CheckFormatSupported(const NodePtr &node_ptr); static bool CheckSizeForTransOp(const ConstGeTensorPtr 
&const_weight_ptr, const OpDescPtr &op_desc_ptr); @@ -41,7 +42,7 @@ class KernelUtils { * @param [out] output the tensor for save sequence of numbers * @author */ - template<typename T> + template <typename T> static Status GenData(const int64_t data_num, const T value, const GeTensorPtr &output) { if (data_num > 0) { if (!CheckInt64MulOverflow(data_num, static_cast<int64_t>(sizeof(T)))) { @@ -69,12 +70,12 @@ class KernelUtils { } /** - * Calculate dimension - * @param [in] dims save the tensor of the dimension - * @param [in] vec_dim results of each dimension - * @param [out] data_num total size of data - * @author - */ + * Calculate dimension + * @param [in] dims save the tensor of the dimension + * @param [in] vec_dim results of each dimension + * @param [out] data_num total size of data + * @author + */ template <typename T> static Status CalcDims(const ConstGeTensorPtr dims, std::vector<int64_t> &vec_dim, int64_t &data_num) { data_num = 1; diff --git a/src/ge/graph/passes/folding_kernel/pack_kernel.cc b/src/ge/graph/passes/folding_kernel/pack_kernel.cc index c79acd76..5db3b394 100644 --- a/src/ge/graph/passes/folding_kernel/pack_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/pack_kernel.cc @@ -67,8 +67,8 @@ Status PackKernel::ValidateKernelParams(const ge::OpDescPtr &op_desc_ptr, return PARAM_INVALID; } if (!(AttrUtils::GetInt(op_desc_ptr, PACK_ATTR_NAME_NUM, n_))) { - GELOGE(PARAM_INVALID, "Attr %s is not exist.", PACK_ATTR_NAME_NUM.c_str()); - return PARAM_INVALID; + n_ = 0; + GELOGD("Attr %s is not set, default value %ld is used.", PACK_ATTR_NAME_NUM.c_str(), n_); } if (!(AttrUtils::GetInt(op_desc_ptr, ATTR_NAME_AXIS, axis_))) { GELOGE(PARAM_INVALID, "Attr %s is not exist.", ATTR_NAME_AXIS.c_str()); @@ -105,11 +105,7 @@ Status PackKernel::ValidateInputs(const ge::OpDescPtr &op_desc_ptr, const std::v GELOGW("Input %ld of pack kernel %s is null.", i, op_desc_ptr->GetName().c_str()); return PARAM_INVALID; } - // check if tensor contains data - if (input[i]->GetData().size() == 0) { - GELOGW("Inputs %ld do not have value.", i); - return NOT_CHANGED; - } + if (i == 0) { // get first input shape shape = input[0]->GetTensorDesc().GetShape(); @@ -127,8 +123,8 @@ Status PackKernel::ValidateInputs(const ge::OpDescPtr &op_desc_ptr, const std::v auto dst_shape = tensor_desc.GetShape(); int64_t num = 1; for (auto dim : dst_shape.GetDims()) { - if (dim < 1) { - GELOGW("Invalid zero dim in the shape %s", formats::ShapeToString(shape).c_str()); + if (dim < 0) { + GELOGW("Invalid dim %ld in the shape %s", dim, formats::ShapeToString(shape).c_str()); return NOT_CHANGED; } num *= dim; @@ -141,6 +137,12 @@ Status PackKernel::ValidateInputs(const ge::OpDescPtr &op_desc_ptr, const std::v GELOGW("Shape of input %ld is not equal with input 0.", i); return NOT_CHANGED; } + + // check whether tensor data size is zero or not + if (input[i]->GetData().size() == 0 && num != 0) { + GELOGW("Inputs %ld do not have value.", i); + return NOT_CHANGED; + } } return SUCCESS; } @@ -167,6 +169,13 @@ void PackKernel::ExpandDims(const int64_t axis, const std::vector &input, ge::GeTensorPtr &output_ptr) { + output_ptr->MutableTensorDesc().SetShape(final_shape); + output_ptr->MutableTensorDesc().SetDataType(DataType(data_type_)); + if (final_shape.GetShapeSize() == 0 && final_shape.GetDims().size() != 0) { + // means the shape contains a zero, so the output tensor data is [].
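+ // e.g. packing two inputs of shape [2, 0] along axis 0 yields final_shape [2, 2, 0], whose GetShapeSize() is 0, so there is no data to copy.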
+ return SUCCESS; + } + int64_t times = 1; int64_t unit = 1; // calculate data unit @@ -210,8 +219,6 @@ Status PackKernel::CopyOutputData(const GeShape &final_shape, const std::vector< if (output_ptr->SetData(buf.get(), static_cast(output_size * data_size)) != GRAPH_SUCCESS) { GELOGW("CopyOutputData: SetData failed"); } - output_ptr->MutableTensorDesc().SetShape(final_shape); - output_ptr->MutableTensorDesc().SetDataType(DataType(data_type_)); return SUCCESS; } diff --git a/src/ge/graph/passes/folding_kernel/reduce_prod_kernel.cc b/src/ge/graph/passes/folding_kernel/reduce_prod_kernel.cc index 76a67dac..b7fd11b1 100644 --- a/src/ge/graph/passes/folding_kernel/reduce_prod_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/reduce_prod_kernel.cc @@ -63,10 +63,7 @@ Status ReduceProdKernel::ReduceProdCheck(const ge::OpDescPtr &op_desc_ptr, GELOGE(PARAM_INVALID, "Axis must be at most rank 1, node name: %s", op_desc_ptr->GetName().c_str()); return PARAM_INVALID; } - if (data_tensor->GetData().size() == 0 || axis_tensor->GetData().size() == 0) { - GELOGE(PARAM_INVALID, "ReduceProdKernel data size of inputs is 0, node node: %s", op_desc_ptr->GetName().c_str()); - return PARAM_INVALID; - } + DataType data_type = data_tensor->GetTensorDesc().GetDataType(); if (kReduceProdSupportedType.find(data_type) == kReduceProdSupportedType.end()) { GELOGE(PARAM_INVALID, "ReduceProdKernel data type %s not support, node name: %s", @@ -151,7 +148,6 @@ Status ReduceProdKernel::DataCal(const std::vector &input, static_cast(head_dim_ * end_dim_ * sizeof(int32_t))) != GRAPH_SUCCESS, GELOGW("set data failed"); return INTERNAL_ERROR); - output_ptr->MutableTensorDesc().SetDataType(data_dtype); } return SUCCESS; } @@ -260,19 +256,32 @@ Status ReduceProdKernel::Compute(const ge::OpDescPtr op_desc_ptr, const std::vec if (ret != SUCCESS) { return NOT_CHANGED; } + } else if (input.at(kReduceProdAxisIndex)->GetData().size() == 0) { + // the axis tensor is [], which means the input is passed through unchanged + output_ptr->MutableTensorDesc().SetShape(input.at(kReduceProdDataIndex)->GetTensorDesc().GetShape()); + output_ptr->MutableTensorDesc().SetDataType(input.at(kReduceProdDataIndex)->GetTensorDesc().GetDataType()); + if (output_ptr->SetData(input.at(kReduceProdDataIndex)->GetData()) != GRAPH_SUCCESS) { + GELOGW("Compute: SetData failed"); + } } else { // calculate axis to reduce ret = AxisCal(input); if (ret != SUCCESS) { return NOT_CHANGED; } - // calculate data and data type - ret = DataCal(input, output_ptr); - if (ret != SUCCESS) { - return NOT_CHANGED; - } - // calculate shape + // calculate and set shape ShapeCal(op_desc_ptr, input, output_ptr); + // set data type + output_ptr->MutableTensorDesc().SetDataType(input.at(kReduceProdDataIndex)->GetTensorDesc().GetDataType()); + + // data size == 0 means the input shape contains a zero, so the tensor value is [].
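+ // e.g. a data tensor of shape [3, 0, 2] holds no elements, so only the output shape and data type are set and DataCal is skipped.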
+ if (input.at(kReduceProdDataIndex)->GetData().size() != 0) { + // calculate data and data type + ret = DataCal(input, output_ptr); + if (ret != SUCCESS) { + return NOT_CHANGED; + } + } } // print output tensor information, and will be deleted diff --git a/src/ge/graph/passes/folding_kernel/transdata_kernel.cc b/src/ge/graph/passes/folding_kernel/transdata_kernel.cc index b1bfe92d..d3637169 100644 --- a/src/ge/graph/passes/folding_kernel/transdata_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/transdata_kernel.cc @@ -48,8 +48,9 @@ Status TransdataKernel::ValidateInput(const OpDescPtr &op_desc_ptr, const std::v GELOGE(PARAM_INVALID, "Input const_weight_ptr is nullptr."); return PARAM_INVALID; } - const uint8_t *src_data = const_weight_ptr->GetData().data(); - if (op_desc_ptr == nullptr || src_data == nullptr) { + + // src_data == nullptr is supported + if (op_desc_ptr == nullptr) { GELOGE(PARAM_INVALID, "Input opDescPtr is nullptr."); return PARAM_INVALID; } diff --git a/src/ge/graph/passes/pass_utils.h b/src/ge/graph/passes/pass_utils.h index a8b1cfe3..b889a056 100644 --- a/src/ge/graph/passes/pass_utils.h +++ b/src/ge/graph/passes/pass_utils.h @@ -26,6 +26,7 @@ namespace ge { class PassUtils { public: PassUtils() = delete; + ~PassUtils() = delete; static NodePtr GetInDataNode(const ConstNodePtr &node, int index); diff --git a/src/ge/graph/passes/switch_op_pass.cc b/src/ge/graph/passes/switch_op_pass.cc index 5ed1cb1c..b21f962b 100644 --- a/src/ge/graph/passes/switch_op_pass.cc +++ b/src/ge/graph/passes/switch_op_pass.cc @@ -137,7 +137,7 @@ Status SwitchOpPass::ReplaceSwitchNode(ComputeGraphPtr &graph, NodePtr &switch_n NodePtr out_node = peer_in_anchor->GetOwnerNode(); GE_CHK_STATUS_RET(GetOriginalType(out_node, type), "Get node type fail."); if ((type == MERGE) || (type == REFMERGE)) { - NodePtr memcpy_node = CreateMemcpyAsyncNode(graph, peer_data_anchor); + NodePtr memcpy_node = CreateMemcpyAsyncNode(graph, peer_data_anchor, false); GE_CHK_BOOL_EXEC(memcpy_node != nullptr, return FAILED, "Create memcpy_async node fail."); GE_CHK_STATUS(GraphUtils::AddEdge(peer_data_anchor, memcpy_node->GetInDataAnchor(0)), "MemcpyAsync node add edge fail."); @@ -234,16 +234,18 @@ Status SwitchOpPass::ReplaceMergeNode(ComputeGraphPtr &graph, NodePtr &merge_nod need_label_nodes_.emplace_back(stream_merge); } + bool multi_batch_flag = false; if (merge_op_desc->HasAttr(ATTR_INSERT_BY_MBATCH)) { if (!ge::AttrUtils::SetBool(op_desc, ATTR_INSERT_BY_MBATCH, true)) { GELOGE(FAILED, "Set attr ATTR_INSERT_BY_MBATCH fail, StreamMerge:%s.", node_name.c_str()); return FAILED; } + multi_batch_flag = true; } (void)bypass_nodes_.insert(merge_node); - GE_CHK_STATUS_RET(AddMemcpyAsyncNodes(graph, stream_merge), "StreamMerge add memcpy node fail."); + GE_CHK_STATUS_RET(AddMemcpyAsyncNodes(graph, stream_merge, multi_batch_flag), "StreamMerge add memcpy node fail."); return SUCCESS; } @@ -302,17 +304,20 @@ NodePtr SwitchOpPass::CreateStreamSwitchNode(ComputeGraphPtr &graph, const NodeP /// @brief Add MemcpyAsync Node /// @param [in] graph /// @param [in] in_node +/// @param [in] multi_batch_flag /// @return ge::NodePtr /// -NodePtr SwitchOpPass::CreateMemcpyAsyncNode(ComputeGraphPtr &graph, const OutDataAnchorPtr &out_data_anchor) { +NodePtr SwitchOpPass::CreateMemcpyAsyncNode(ComputeGraphPtr &graph, const OutDataAnchorPtr &out_data_anchor, + bool multi_batch_flag) { GE_CHK_BOOL_EXEC(out_data_anchor != nullptr, return nullptr, "Param of input node is null."); OpDescPtr pre_op_desc = 
out_data_anchor->GetOwnerNode()->GetOpDesc(); GE_CHK_BOOL_EXEC(pre_op_desc != nullptr, return nullptr, "OpDesc of pre node is invalid."); - std::string node_name = pre_op_desc->GetName() + "_" + MEMCPYASYNC; + std::string memcpy_type = multi_batch_flag ? MEMCPYADDRASYNC : MEMCPYASYNC; + std::string node_name = pre_op_desc->GetName() + "_" + memcpy_type; node_name = CheckDuplicateName(node_name); GELOGI("Create MemcpyAsync op:%s.", node_name.c_str()); - OpDescPtr op_desc = MakeShared(node_name, MEMCPYASYNC); + OpDescPtr op_desc = MakeShared(node_name, memcpy_type); if (op_desc == nullptr) { GELOGE(FAILED, "Create op_desc fail, MemcpyAsync:%s.", node_name.c_str()); return nullptr; @@ -432,9 +437,10 @@ NodePtr SwitchOpPass::CreateActiveNode(ComputeGraphPtr &graph, NodePtr &node) { /// @brief Add MemcpyAsync Op as StreamMerge in_node /// @param [in] graph /// @param [in] node +/// @param [in] multi_batch_flag /// @return Status /// -Status SwitchOpPass::AddMemcpyAsyncNodes(ComputeGraphPtr &graph, NodePtr &node) { +Status SwitchOpPass::AddMemcpyAsyncNodes(ComputeGraphPtr &graph, NodePtr &node, bool multi_batch_flag) { GE_CHK_BOOL_EXEC(node != nullptr, return FAILED, "Param of pre node is null."); for (InDataAnchorPtr &in_data_anchor : node->GetAllInDataAnchors()) { OutDataAnchorPtr peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); @@ -447,7 +453,7 @@ Status SwitchOpPass::AddMemcpyAsyncNodes(ComputeGraphPtr &graph, NodePtr &node) continue); GE_IF_BOOL_EXEC(type != MEMCPYASYNC, { - in_node = CreateMemcpyAsyncNode(graph, peer_out_anchor); + in_node = CreateMemcpyAsyncNode(graph, peer_out_anchor, multi_batch_flag); GE_CHK_BOOL_EXEC(in_node != nullptr, return FAILED, "Create MemcpyAsync node fail."); GE_CHK_STATUS(GraphUtils::RemoveEdge(peer_out_anchor, in_data_anchor), "MemcpyAsync node remove edge fail."); GE_CHK_STATUS(GraphUtils::AddEdge(peer_out_anchor, in_node->GetInDataAnchor(0)), diff --git a/src/ge/graph/passes/switch_op_pass.h b/src/ge/graph/passes/switch_op_pass.h index 14cdd22c..7e107e3b 100644 --- a/src/ge/graph/passes/switch_op_pass.h +++ b/src/ge/graph/passes/switch_op_pass.h @@ -103,13 +103,13 @@ class SwitchOpPass : public GraphPass { NodePtr CreateStreamSwitchNode(ComputeGraphPtr &graph, const NodePtr &switch_node, const std::string &suffix, OutDataAnchorPtr &peer_cond_anchor); - NodePtr CreateMemcpyAsyncNode(ComputeGraphPtr &graph, const OutDataAnchorPtr &out_data_anchor); + NodePtr CreateMemcpyAsyncNode(ComputeGraphPtr &graph, const OutDataAnchorPtr &out_data_anchor, bool multi_batch_flag); Status CombineSwitchNode(ComputeGraphPtr &graph); NodePtr CreateActiveNode(ComputeGraphPtr &graph, NodePtr &node); - Status AddMemcpyAsyncNodes(ComputeGraphPtr &graph, NodePtr &stream_merge_node); + Status AddMemcpyAsyncNodes(ComputeGraphPtr &graph, NodePtr &stream_merge_node, bool multi_batch_flag); Status BypassSwitchNode(NodePtr &switch_node, OutDataAnchorPtr &peer_data_anchor, OutDataAnchorPtr &peer_cond_anchor); diff --git a/src/ge/graph/passes/variable_prepare_op_pass.cc b/src/ge/graph/passes/variable_prepare_op_pass.cc index c4ed0405..3a62082a 100644 --- a/src/ge/graph/passes/variable_prepare_op_pass.cc +++ b/src/ge/graph/passes/variable_prepare_op_pass.cc @@ -22,11 +22,14 @@ #include "common/ge/ge_util.h" #include "external/graph/graph.h" #include "framework/common/debug/ge_log.h" +#include "graph/common/omg_util.h" #include "graph/debug/ge_attr_define.h" #include "graph/node.h" #include "graph/utils/tensor_utils.h" namespace ge { +std::map> 
VariablePrepareOpPass::ref_node_without_prototype_map_{ + {REFSWITCH, {{0, 0}, {0, 1}}}}; Status VariablePrepareOpPass::Run(ComputeGraphPtr graph) { GE_CHECK_NOTNULL(graph); for (const auto &node : graph->GetDirectNode()) { @@ -43,9 +46,7 @@ Status VariablePrepareOpPass::Run(ComputeGraphPtr graph) { for (auto &node : graph->GetDirectNode()) { GE_IF_BOOL_EXEC(node->GetOpDesc() == nullptr, continue); - bool is_variable = node->GetOpDesc()->GetType() == VARIABLE; - bool is_deal = has_dealed_variable_.find(node->GetName()) == has_dealed_variable_.end(); - if (is_variable && is_deal) { + if (node->GetOpDesc()->GetType() == VARIABLE) { Status ret = DealVariableNode(node); if (ret != SUCCESS) { GELOGE(ret, "variable add back edge failed"); @@ -149,7 +150,7 @@ NodePtr VariablePrepareOpPass::GetFinalWritableNode(ge::NodePtr &writable_node, } } if (!found_writeable_node) { - GELOGI("final writable node is %s", current_node->GetName().c_str()); + GELOGD("final writable node is %s", current_node->GetName().c_str()); return current_node; } } @@ -159,53 +160,54 @@ Status VariablePrepareOpPass::AddVariableRef(ge::NodePtr &final_writable_node, g GE_CHECK_NOTNULL(final_writable_node); GE_CHECK_NOTNULL(var_node); - NodePtr var_ref_node = CreatVariableRef(final_writable_node, var_node); - GE_CHECK_NOTNULL(var_ref_node); - // add control anchor between var_ref_node and final peer node - // var_ref_node need to execute before other nodes + if (final_writable_node->GetType() == FRAMEWORKOP) { + GELOGD("No need to add variable_ref for frameworkop"); + return SUCCESS; + } + std::stringstream variable_ref_name; + variable_ref_name << "_TO_" << final_writable_node->GetName() << "_REF_" << index; + ge::NodePtr find_node = var_node->GetOwnerComputeGraph()->FindNode(var_node->GetName() + variable_ref_name.str()); + if (find_node != nullptr) { + GELOGD("The corresponding variable_ref [%s] has been added to this connection.", find_node->GetName().c_str()); + return SUCCESS; + } + NodePtr variable_ref_node = CreatVariableRef(var_node->GetName() + variable_ref_name.str(), var_node); + + GELOGI("Add variable_ref between [%s] and [%s]", var_node->GetName().c_str(), variable_ref_node->GetName().c_str()); + GE_CHECK_NOTNULL(variable_ref_node); + // add control anchor between variable_ref and final peer node + // variable_ref_node need to execute before other nodes auto final_writable_outAnchors = final_writable_node->GetAllOutAnchors(); for (auto &final_writable_outAnchor : final_writable_outAnchors) { GE_CHECK_NOTNULL(final_writable_outAnchor); for (auto &final_writable_peerAnchor : final_writable_outAnchor->GetPeerAnchors()) { GE_CHECK_NOTNULL(final_writable_peerAnchor); NodePtr peer_node = final_writable_peerAnchor->GetOwnerNode(); - graphStatus ret = ge::GraphUtils::AddEdge(var_ref_node->GetOutControlAnchor(), peer_node->GetInControlAnchor()); + graphStatus ret = + ge::GraphUtils::AddEdge(variable_ref_node->GetOutControlAnchor(), peer_node->GetInControlAnchor()); if (ret != GRAPH_SUCCESS) { - GELOGE(FAILED, "add control anchor between var_ref_node and final_writable peer_node failed"); + GELOGE(FAILED, "add control anchor between variable_ref and final_writable peer node failed"); return FAILED; } } } - // add edge final node:index ---> var_ref_node:0 graphStatus ret = - ge::GraphUtils::AddEdge(final_writable_node->GetOutDataAnchor(index), var_ref_node->GetInDataAnchor(0)); + ge::GraphUtils::AddEdge(final_writable_node->GetOutDataAnchor(index), variable_ref_node->GetInDataAnchor(0)); if (ret != GRAPH_SUCCESS) { - 
GELOGE(FAILED, "add data anchor between var_ref_node and final_writable peer_node failed"); + GELOGE(FAILED, "add data anchor between variable_ref and final_writable peer node failed"); return FAILED; } return SUCCESS; } -ge::NodePtr VariablePrepareOpPass::CreatVariableRef(ge::NodePtr &final_writable_node, ge::NodePtr &var_node) { - if ((final_writable_node == nullptr) || (var_node == nullptr) || (var_node->GetOwnerComputeGraph() == nullptr)) { - GELOGE(FAILED, "parameter ptr is null."); - return nullptr; - } - GELOGD("Create VarRef Op: final_writable_node: [%s] var_node: [%s]>>>>", final_writable_node->GetName().c_str(), - var_node->GetName().c_str()); - - static uint32_t var_ref_count = 0; - std::stringstream var_ref_name; - var_ref_name << "_to_" << final_writable_node->GetName() << "_REF_" << var_ref_count++; - +ge::NodePtr VariablePrepareOpPass::CreatVariableRef(const std::string &variable_ref_name, ge::NodePtr &var_node) { OpDescPtr var_op_desc = var_node->GetOpDesc(); if (var_op_desc == nullptr) { GELOGE(FAILED, "get var opdesc is nullptr"); return nullptr; } - OpDescPtr var_ref_op_desc = - MakeShared(var_node->GetName() + var_ref_name.str().c_str(), var_op_desc->GetType()); + OpDescPtr var_ref_op_desc = MakeShared(variable_ref_name.c_str(), var_op_desc->GetType()); if (var_ref_op_desc == nullptr) { GELOGE(FAILED, "var_ref opdesc is nullptr"); return nullptr; @@ -217,15 +219,15 @@ ge::NodePtr VariablePrepareOpPass::CreatVariableRef(ge::NodePtr &final_writable_ GE_IF_BOOL_EXEC(var_ref_op_desc->AddInputDesc(var_op_desc->GetOutputDesc(0)) != SUCCESS, GELOGW("add input desc edge failed"); return nullptr); - NodePtr var_ref_node = var_node->GetOwnerComputeGraph()->AddNode(var_ref_op_desc); - GE_IF_BOOL_EXEC(var_ref_node == nullptr, GELOGW("var_ref_node is null"); return nullptr); - has_dealed_variable_.insert(var_node->GetName()); + NodePtr variable_ref_node = var_node->GetOwnerComputeGraph()->AddNode(var_ref_op_desc); + GE_IF_BOOL_EXEC(variable_ref_node == nullptr, GELOGW("variable_ref_node is null"); return nullptr); bool is_set_str = ge::AttrUtils::SetStr(var_ref_op_desc, REF_VAR_SRC_VAR_NAME, var_op_desc->GetName()); if (is_set_str) { - GELOGD("Set node [%s] REF_VAR_SRC_VAR_NAME [%s]", var_ref_node->GetName().c_str(), var_op_desc->GetName().c_str()); + GELOGD("Set node [%s] REF_VAR_SRC_VAR_NAME [%s]", variable_ref_node->GetName().c_str(), + var_op_desc->GetName().c_str()); } - return var_ref_node; + return variable_ref_node; } int VariablePrepareOpPass::GetWritableNodeOutIndex(const NodePtr &node, int input_index) { @@ -240,16 +242,13 @@ int VariablePrepareOpPass::GetWritableNodeOutIndex(const NodePtr &node, int inpu } } - auto node_iter = ref_input_output_map_.find(node_type); - if (node_iter == ref_input_output_map_.end()) { - return -1; - } - - auto index_iter = node_iter->second.find(input_index); - if (index_iter == node_iter->second.end()) { - return -1; + if (node_type == FRAMEWORKOP) { + std::string original_type; + GE_IF_BOOL_EXEC(GetOriginalType(node, original_type) != SUCCESS, GELOGW("Get node original type fail")); + GELOGI("find frameworkop: [%s], original type is %s", node->GetName().c_str(), original_type.c_str()); + return FindRefOutIndex(original_type, input_index, ref_node_without_prototype_map_); } - return index_iter->second; + return FindRefOutIndex(node_type, input_index, ref_input_output_map_); } void VariablePrepareOpPass::GenerateRefTypeAndInputOutputMap(const NodePtr &node) { @@ -301,4 +300,18 @@ Status VariablePrepareOpPass::UpdateAssignOpDesc(const 
ge::NodePtr &node) { } return SUCCESS; } + +int VariablePrepareOpPass::FindRefOutIndex(const std::string &node_type, int input_index, + const std::map<std::string, std::map<int, int>> &ref_map) { + auto node_iter = ref_map.find(node_type); + if (node_iter == ref_map.end()) { + return -1; + } + + auto index_iter = node_iter->second.find(input_index); + if (index_iter == node_iter->second.end()) { + return -1; + } + return index_iter->second; +} } // namespace ge diff --git a/src/ge/graph/passes/variable_prepare_op_pass.h b/src/ge/graph/passes/variable_prepare_op_pass.h index 0fbd311c..fb25d5db 100644 --- a/src/ge/graph/passes/variable_prepare_op_pass.h +++ b/src/ge/graph/passes/variable_prepare_op_pass.h @@ -33,13 +33,15 @@ class VariablePrepareOpPass : public GraphPass { Status DealWritableNode(ge::NodePtr &writable_node, ge::NodePtr &var_node, int out_index); NodePtr GetFinalWritableNode(ge::NodePtr &writable_node, int &out_index); Status AddVariableRef(ge::NodePtr &node, ge::NodePtr &var_node, int index); - NodePtr CreatVariableRef(ge::NodePtr &final_ref_type_node, ge::NodePtr &var_node); + NodePtr CreatVariableRef(const std::string &variable_ref_name, ge::NodePtr &var_node); int GetWritableNodeOutIndex(const NodePtr &node, int input_index); Status UpdateAssignOpDesc(const ge::NodePtr &node); void GenerateRefTypeAndInputOutputMap(const NodePtr &node); + int FindRefOutIndex(const std::string &node_type, int input_index, + const std::map<std::string, std::map<int, int>> &ref_map); std::map<std::string, std::map<int, int>> ref_input_output_map_; - std::unordered_set<std::string> has_dealed_variable_{}; + static std::map<std::string, std::map<int, int>> ref_node_without_prototype_map_; }; } // namespace ge diff --git a/src/ge/graph/preprocess/graph_preprocess.cc b/src/ge/graph/preprocess/graph_preprocess.cc index a33bc8cc..eacec6d1 100644 --- a/src/ge/graph/preprocess/graph_preprocess.cc +++ b/src/ge/graph/preprocess/graph_preprocess.cc @@ -736,6 +736,35 @@ Status ProcessNetoutputNode(NodePtr &node, std::string &output_type) { } return SUCCESS; } + +Status CheckIfNeedSetNdFormat(const NodePtr &node_ptr) { + auto op = node_ptr->GetOpDesc(); + GE_CHECK_NOTNULL(op); + auto inputDescsPtr = op->GetAllInputsDescPtr(); + auto outputDescsPtr = op->GetAllOutputsDescPtr(); + ge::Format format = ge::FORMAT_ND; + // If the user sets a shape with more than 4 dimensions, format inference may set NCHW or NHWC; GE should reset + // it to ND before FE processes the graph, otherwise FE will insert TransData nodes.
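+ // e.g. a 5-D desc such as [8, 3, 224, 224, 16] inferred as NCHW would otherwise be bracketed with TransData nodes by FE; resetting it to ND avoids that.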
+ for (auto &inputDescPtr : inputDescsPtr) { + GE_CHECK_NOTNULL(inputDescPtr); + if ((inputDescPtr->GetShape().GetDims().size() > ge::DIM_DEFAULT_SIZE) && + ((inputDescPtr->GetFormat() == ge::FORMAT_NCHW) || (inputDescPtr->GetFormat() == ge::FORMAT_NHWC))) { + GELOGI("The node inputdesc [%s] format needs to be set to ND", op->GetName().c_str()); + inputDescPtr->SetFormat(format); + inputDescPtr->SetOriginFormat(format); + } + } + for (auto &outputDescPtr : outputDescsPtr) { + GE_CHECK_NOTNULL(outputDescPtr); + if ((outputDescPtr->GetShape().GetDims().size() > ge::DIM_DEFAULT_SIZE) && + ((outputDescPtr->GetFormat() == ge::FORMAT_NCHW) || (outputDescPtr->GetFormat() == ge::FORMAT_NHWC))) { + GELOGI("The node outputdesc [%s] format needs to be set to ND", op->GetName().c_str()); + outputDescPtr->SetFormat(format); + outputDescPtr->SetOriginFormat(format); + } + } + return SUCCESS; +} } // namespace GraphPrepare::GraphPrepare() : compute_graph_(nullptr) {} @@ -826,9 +855,12 @@ Status GraphPrepare::CheckGraph() { Status GraphPrepare::CheckRefInputNode(const NodePtr &node, const std::string &input_name, const std::unordered_set<std::string> &ref_nodes) { + // Acceptable input types are ref nodes, variables, and the Switch operator, which is issued by ME for dynamic + // loss scale and is optimized in SwitchOpPass, since ME does not differentiate between RefSwitch and Switch + // and only issues Switch. static std::unordered_set<std::string> acceptable_types = {ge::VARIABLE, ge::VARIABLEV2, ge::VARHANDLEOP, ge::REFSWITCH, ge::REFMERGE, ge::REFENTER, - ge::REFNEXTITERATION, ge::REFEXIT}; + ge::REFNEXTITERATION, ge::REFEXIT, ge::SWITCH}; GE_CHECK_NOTNULL(node); const auto &op_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); @@ -972,7 +1004,7 @@ Status GraphPrepare::UpdateInput(const std::vector<GeTensor> &user_input) { int64_t desc_shape = desc.GetShape().GetShapeSize(); FMK_INT64_UINT32_MULCHECK(desc_shape, length); int64_t shape_size = desc_shape * length; - GE_IF_BOOL_EXEC(shape_size == 0, shape_size = static_cast<int64_t>(length)); + GE_IF_BOOL_EXEC(shape_size == 0 && desc.GetShape().GetDimNum() == 0, shape_size = static_cast<int64_t>(length)); int64_t size = 0; GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(desc, size) != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "TensorUtils GetSize failed"); @@ -1106,6 +1138,10 @@ Status GraphPrepare::OptimizeAfterInfershapeByAtcParams() { GE_RETURN_IF_ERROR(InsertNewOpUtil::Instance().UpdateDataNodeByAipp(compute_graph_)); for (auto &node_ptr : compute_graph_->GetDirectNode()) { GE_CHECK_NOTNULL(node_ptr); + if (CheckIfNeedSetNdFormat(node_ptr) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Set node [%s] format ND failed", node_ptr->GetName().c_str()); + return FAILED; + } if (node_ptr->GetType() == DATA) { if (ProcessDataNode(node_ptr) != SUCCESS) { GELOGE(INTERNAL_ERROR, "Process data node failed"); @@ -1416,9 +1452,17 @@ Status GraphPrepare::VerifyConstOp(const NodePtr &node) { FMK_INT64_UINT32_MULCHECK(shape_size, length); GELOGI("Const real value Size:%zu, op_desc Shape Size:%ld, data_type:%s.", data_size, shape_size * length, TypeUtils::DataTypeToSerialString(data_type).c_str()); - if ((shape_size != 0) || (data_size / length != 1)) { - GE_CHK_BOOL_EXEC(data_size == static_cast<size_t>(shape_size * length) && data_size != 0, - return GRAPH_PARAM_INVALID, "Const input data size is not equal with tensor desc shape"); + if (shape_size == 0) { + if (ge_tensor_desc.GetShape().GetDims().size() == 0) { + // shape = [] means it's a scalar tensor.
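+ // e.g. an int32 scalar: data_size == 4 and length (the element byte size) == 4, so data_size / length must equal 1.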
+ GE_CHK_BOOL_EXEC(data_size / length == 1, return PARAM_INVALID, "Const is invalid scalar tensor."); + } else { + // shape = [x, y, 0, ...] means it's a tensor with a zero dim, so its value is []. + GE_CHK_BOOL_EXEC(data_size == 0, return PARAM_INVALID, "Const is invalid vector tensor."); + } + } else { + GE_CHK_BOOL_EXEC(data_size == static_cast<size_t>(shape_size * length) && data_size != 0, return PARAM_INVALID, + "Const input data size is not equal with tensor desc shape"); } return SUCCESS; } @@ -1448,8 +1492,8 @@ Status GraphPrepare::CheckUserInput(const std::vector<GeTensor> &user_input) { GeTensorDesc desc(user_input[index].GetTensorDesc()); for (size_t i = 0; i < desc.GetShape().GetDimNum(); ++i) { - if (desc.GetShape().GetDim(i) <= 0) { - GELOGE(GE_GRAPH_INIT_FAILED, "data dim %zu is not supported, need > 0, real:%ld.", i, + if (desc.GetShape().GetDim(i) < 0) { + GELOGE(GE_GRAPH_INIT_FAILED, "data dim %zu is not supported, need >= 0, real:%ld.", i, desc.GetShape().GetDim(i)); return GE_GRAPH_INIT_FAILED; } @@ -1472,8 +1516,6 @@ Status GraphPrepare::InferShapeForPreprocess() { } InferShapePass infer_shape_pass; names_to_passes.emplace_back("InferShapePass", &infer_shape_pass); - ReplaceWithEmptyConstPass replace_with_empty_const_pass; - names_to_passes.emplace_back("ReplaceWithEmptyConstPass", &replace_with_empty_const_pass); DimensionComputePass dimension_compute_pass; names_to_passes.emplace_back("DimensionComputePass", &dimension_compute_pass); ConstantFoldingPass constant_folding_pass; diff --git a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc index fbdcc217..680e40c9 100644 --- a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc +++ b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc @@ -124,22 +124,21 @@ Status InsertNewOpUtil::CheckGraph(const ComputeGraphPtr &graph) { if (node->GetType() != DATA) { continue; } - + size_t next_nodes_cnt = 0; std::vector<NodePtr> aippNodes; for (const auto &anchor : node->GetAllOutDataAnchors()) { for (const auto &inAnchor : anchor->GetPeerInDataAnchors()) { const std::string &nodeType = inAnchor->GetOwnerNode()->GetType(); - - GE_CHK_BOOL_RET_STATUS(aippNodes.size() == 0 || nodeType == AIPP, PARAM_INVALID, - "Can not config part of outputs of Data node to support AIPP, config all of the " - "outputs of Data to support AIPP, or config none of them"); - + next_nodes_cnt++; if (nodeType == AIPP) { aippNodes.push_back(inAnchor->GetOwnerNode()); continue; } } } + GE_CHK_BOOL_RET_STATUS((aippNodes.size() == 0) || (aippNodes.size() == next_nodes_cnt), PARAM_INVALID, + "Can not config part of outputs of Data node to support AIPP, config all " + "of the outputs of Data to support AIPP, or config none of them"); std::unique_ptr<domi::AippOpParams> aippParams(new (std::nothrow) domi::AippOpParams()); GE_CHECK_NOTNULL(aippParams); diff --git a/src/ge/graph/preprocess/multi_batch_copy_graph.cc b/src/ge/graph/preprocess/multi_batch_copy_graph.cc index b93b02f9..523c41cb 100644 --- a/src/ge/graph/preprocess/multi_batch_copy_graph.cc +++ b/src/ge/graph/preprocess/multi_batch_copy_graph.cc @@ -142,7 +142,7 @@ Status CalcShape(const std::vector<int64_t> &batch_shape, GeShape &data_shape) { bool IsAllDimsPositive(const std::vector<int64_t> &dims) { for (auto dim : dims) { - if (dim <= 0) { + if (dim < 0) { return false; } } diff --git a/src/ge/init/gelib.cc b/src/ge/init/gelib.cc index a9ef96c1..1b449521 100644 --- a/src/ge/init/gelib.cc +++ b/src/ge/init/gelib.cc @@ -16,8 +16,8 @@ #include "init/gelib.h" -#include #include +#include #include #include 
#include @@ -142,6 +142,35 @@ Status GELib::InnerInitialize(const map &options) { return SUCCESS; } +void GELib::SetIncreBuild(const map &options) { + auto iter = options.find(OPTION_EXEC_ENABLE_INCRE_BUILD); + if (iter != options.end()) { + const std::string enable_incre_build = "true"; + const std::string disable_incre_build = "false"; + if (iter->second == enable_incre_build) { + is_incre_build_ = true; + GELOGI("Enable incre build."); + auto path_iter = options.find(OPTION_EXEC_INCRE_BUILD_CACHE_PATH); + if (path_iter != options.end()) { + std::string cache_path = path_iter->second; + if (!cache_path.empty() && cache_path[cache_path.size() - 1] != '/') { + cache_path += "/"; + } + incre_build_cache_path_ = cache_path; + } else { + incre_build_cache_path_ = ".ge_cache/"; + } + GELOGD("Using incre build cache path: %s.", incre_build_cache_path_.c_str()); + } else if (iter->second == disable_incre_build) { + is_incre_build_ = false; + GELOGI("Disable incre build."); + } else { + is_incre_build_ = false; + GELOGW("Invalid ENABLE_INCRE_BUILD option, it should be true or false."); + } + } +} + Status GELib::SystemInitialize(const map &options) { Status status = FAILED; auto iter = options.find(OPTION_GRAPH_RUN_MODE); @@ -174,6 +203,8 @@ Status GELib::SystemInitialize(const map &options) { PropertiesManager::Instance().SetDumpStep(dump_step); } } + // check incre build flag + SetIncreBuild(options); if (is_train_mode_) { InitOptions(options); @@ -258,8 +289,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status GELib::InitSystemWithOpt GE_LOGE_IF(ret != SUCCESS, "write job state failed, ret:%u", ret); options.physical_device_id = options.device_id; - // The physical ID is transferred to the logical ID. FMK receives physical ID - // and needs to be converted + // The physical ID is transferred to the logical ID. FMK receives physical ID and needs to be converted uint32_t dev_logic_index = 0; rtError_t rt_ret = rtGetDeviceIndexByPhyId(static_cast(options.device_id), &dev_logic_index); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, @@ -273,8 +303,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status GELib::InitSystemWithOpt GE_CHK_RT_RET(rtSetDevice(options.device_id)); - // In the scenario that the automatic add fusion is set, but there is no - // cleanaddr operator, maybe need to check it + // In the scenario that the automatic add fusion is set, but there is no cleanaddr operator, + // maybe need to check it is_system_inited = true; is_shutdown = false; @@ -287,10 +317,10 @@ Status GELib::SystemShutdownWithOptions(const Options &options) { GELOGI("Training finalize GELib begin."); std::lock_guard lock(status_mutex_); - GE_IF_BOOL_EXEC(is_shutdown || !is_system_inited, GELOGW("System Shutdown with options is already is_shutdown " - "or system does not inited. " - "is_shutdown:%d is_omm_inited:%d", - is_shutdown, is_system_inited); + GE_IF_BOOL_EXEC(is_shutdown || !is_system_inited, + GELOGW("System Shutdown with options is already is_shutdown or system does not inited. 
" + "is_shutdown:%d is_omm_inited:%d", + is_shutdown, is_system_inited); return SUCCESS); GE_CHK_RT(rtDeviceReset(options.device_id)); @@ -324,9 +354,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status GELib::InitSystemWithout static bool is_inited = false; if (is_inited) { - GELOGW( - "System init without options is already inited, don't need to init " - "again."); + GELOGW("System init without options is already inited, don't need to init again."); return SUCCESS; } is_inited = true; diff --git a/src/ge/init/gelib.h b/src/ge/init/gelib.h index 0945907a..3db32dd2 100644 --- a/src/ge/init/gelib.h +++ b/src/ge/init/gelib.h @@ -65,6 +65,12 @@ class GELib { // add head stream to model bool HeadStream() const { return head_stream_; } + // get incre build flag + bool IsIncreBuild() const { return is_incre_build_; } + + // get incre build cache path + const std::string &GetIncreBuildCachePath() const { return incre_build_cache_path_; } + Status InitSystemWithoutOptions(); Status InitSystemWithOptions(Options &options); Status SystemShutdownWithOptions(const Options &options); @@ -76,6 +82,7 @@ class GELib { Status SystemInitialize(const map &options); void RollbackInit(); void InitOptions(const map &options); + void SetIncreBuild(const map &options); DNNEngineManager engineManager_; OpsKernelManager opsManager_; @@ -87,8 +94,9 @@ class GELib { bool is_system_inited = false; bool is_shutdown = false; bool is_use_hcom = false; - + bool is_incre_build_ = false; bool head_stream_ = false; + std::string incre_build_cache_path_; }; } // namespace ge diff --git a/src/ge/single_op/single_op_model.cc b/src/ge/single_op/single_op_model.cc index 1cede863..f2d2da88 100644 --- a/src/ge/single_op/single_op_model.cc +++ b/src/ge/single_op/single_op_model.cc @@ -76,8 +76,11 @@ void SingleOpModel::ParseOpModelParams(ModelHelper &model_helper, SingleOpModelP param.base_addr = ret ? static_cast(value) : 0; ret = ge::AttrUtils::GetInt(model, MODEL_ATTR_TASK_GEN_WEIGHT_ADDR, value); param.weight_addr = ret ? static_cast(value) : 0; + ret = ge::AttrUtils::GetInt(model, ATTR_MODEL_CORE_TYPE, value); + param.core_type = ret ? value : 0; - GELOGI("ParseOpModelParams(), memory_size:%lu, weight_size:%lu.", param.memory_size, param.weight_size); + GELOGI("ParseOpModelParams(), memory_size:%lu, weight_size:%lu. 
core_type = %lu", param.memory_size, + param.weight_size, param.core_type); } Status SingleOpModel::InitModelMem(StreamResource &res) { diff --git a/src/ge/single_op/single_op_model.h b/src/ge/single_op/single_op_model.h index c8880b06..c1a63758 100644 --- a/src/ge/single_op/single_op_model.h +++ b/src/ge/single_op/single_op_model.h @@ -39,13 +39,12 @@ struct SingleOpModelParam { uint8_t *weight_base = nullptr; std::map addr_mapping_; + int64_t core_type = 0; }; class SingleOpModel { public: - SingleOpModel(const std::string &model_name, - const void *model_data, - uint32_t model_size); + SingleOpModel(const std::string &model_name, const void *model_data, uint32_t model_size); ~SingleOpModel() = default; Status Init(); diff --git a/src/ge/single_op/task/tbe_task_builder.cc b/src/ge/single_op/task/tbe_task_builder.cc index b8911d0c..c0f6877f 100644 --- a/src/ge/single_op/task/tbe_task_builder.cc +++ b/src/ge/single_op/task/tbe_task_builder.cc @@ -89,16 +89,17 @@ TbeTaskBuilder::TbeTaskBuilder(const std::string &model_name, const OpDescPtr &o const domi::KernelDef &kernel_def) : op_desc_(op_desc), kernel_def_(kernel_def), stub_name_(model_name + "/" + op_desc->GetName() + "_tvmbin") {} -Status TbeTaskBuilder::DoRegisterBinary(const OpKernelBin &kernel_bin, void **bin_handle) const { +Status TbeTaskBuilder::DoRegisterBinary(const OpKernelBin &kernel_bin, void **bin_handle, + const SingleOpModelParam ¶m) const { rtDevBinary_t binary; binary.version = 0; binary.data = kernel_bin.GetBinData(); binary.length = kernel_bin.GetBinDataSize(); - binary.magic = RT_DEV_BINARY_MAGIC_ELF; + binary.magic = param.core_type == 0 ? RT_DEV_BINARY_MAGIC_ELF : RT_DEV_BINARY_MAGIC_ELF_AIVEC; auto ret = rtDevBinaryRegister(&binary, bin_handle); if (ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "rtDevBinaryRegister failed, bin key = %s, rt ret = %d", stub_name_.c_str(), - static_cast(ret)); + GELOGE(RT_FAILED, "rtDevBinaryRegister failed, bin key = %s, core_type = %ld, rt ret = %d", stub_name_.c_str(), + param.core_type, static_cast(ret)); return RT_FAILED; } @@ -132,13 +133,13 @@ Status TbeTaskBuilder::DoRegisterFunction(void *bin_handle, const char *stub_nam return SUCCESS; } -Status TbeTaskBuilder::DoRegisterKernel(const ge::OpKernelBin &tbe_kernel, const char *bin_file_key, - void **bin_handle) { +Status TbeTaskBuilder::DoRegisterKernel(const ge::OpKernelBin &tbe_kernel, const char *bin_file_key, void **bin_handle, + const SingleOpModelParam ¶m) { std::string kernel_name; GetKernelName(op_desc_, kernel_name); void *handle = nullptr; - auto ret = DoRegisterBinary(tbe_kernel, &handle); + auto ret = DoRegisterBinary(tbe_kernel, &handle, param); if (ret != SUCCESS) { return ret; } @@ -160,7 +161,7 @@ Status TbeTaskBuilder::DoRegisterKernel(const ge::OpKernelBin &tbe_kernel, const return SUCCESS; } -Status TbeTaskBuilder::RegisterKernel(TbeOpTask &task) { +Status TbeTaskBuilder::RegisterKernel(TbeOpTask &task, const SingleOpModelParam ¶m) { KernelBinRegistry ®istry = KernelBinRegistry::GetInstance(); // check if already registered const char *stub_func = registry.GetStubFunc(stub_name_); @@ -190,7 +191,7 @@ Status TbeTaskBuilder::RegisterKernel(TbeOpTask &task) { } void *bin_handle = nullptr; - auto ret = DoRegisterKernel(*tbe_kernel, stub_func, &bin_handle); + auto ret = DoRegisterKernel(*tbe_kernel, stub_func, &bin_handle, param); if (ret == SUCCESS) { holder->SetBinHandle(bin_handle); if (!registry.AddKernel(stub_name_, holder)) { @@ -285,7 +286,7 @@ Status TbeTaskBuilder::BuildTask(TbeOpTask &task, const 
SingleOpModelParam ¶ return ret; } - ret = RegisterKernel(task); + ret = RegisterKernel(task, param); if (ret != SUCCESS) { return ret; } diff --git a/src/ge/single_op/task/tbe_task_builder.h b/src/ge/single_op/task/tbe_task_builder.h index 25441289..5e0965bf 100644 --- a/src/ge/single_op/task/tbe_task_builder.h +++ b/src/ge/single_op/task/tbe_task_builder.h @@ -74,9 +74,10 @@ class TbeTaskBuilder { Status SetKernelArgs(TbeOpTask &task, const SingleOpModelParam ¶m); Status GetSmDesc(void **sm_desc, const SingleOpModelParam ¶m) const; - Status RegisterKernel(TbeOpTask &task); - Status DoRegisterKernel(const OpKernelBin &kernel_bin, const char *bin_file_key, void **bin_handle); - Status DoRegisterBinary(const OpKernelBin &kernel_bin, void **bin_handle) const; + Status RegisterKernel(TbeOpTask &task, const SingleOpModelParam ¶m); + Status DoRegisterKernel(const OpKernelBin &kernel_bin, const char *bin_file_key, void **bin_handle, + const SingleOpModelParam ¶m); + Status DoRegisterBinary(const OpKernelBin &kernel_bin, void **bin_handle, const SingleOpModelParam ¶m) const; Status DoRegisterMeta(void *bin_handle); static Status DoRegisterFunction(void *bin_handle, const char *stub_name, const char *kernel_name); diff --git a/src/proto/fusion_model.proto b/src/proto/fusion_model.proto index 2ff6b77a..6220963c 100644 --- a/src/proto/fusion_model.proto +++ b/src/proto/fusion_model.proto @@ -17,9 +17,10 @@ syntax = "proto3"; import "om.proto"; + package domi; message FusionModelDef { string version = 1; repeated OpDef fusion_op = 2; -} +} \ No newline at end of file diff --git a/src/proto/task.proto b/src/proto/task.proto index 3eb8de5c..8ef5c2e2 100644 --- a/src/proto/task.proto +++ b/src/proto/task.proto @@ -31,7 +31,7 @@ message ModelTaskDef { repeated bytes op = 15; // input/output opdef in bytes - uint64 base_addr = 16; // base addr + uint64 base_addr = 16; // base addr uint64 weight_addr = 17; // weight addr uint32 batch_num = 18; } @@ -58,6 +58,10 @@ message TaskDef { bytes private_def = 34; uint64 ops_kernel_store_ptr = 35; // adjustments to other fields in the future StreamSwitchNDef stream_switch_n = 36; + + LabelSetDef label_set = 37; + LabelGotoExDef label_goto_ex = 38; + LabelSwitchByIndexDef label_switch_by_index = 39; } message KernelDef { @@ -119,6 +123,7 @@ message MemcpyAsyncDef { uint64 src = 3; uint64 count = 4; uint32 kind = 5; + uint32 op_index = 6; } message StreamSwitchDef { @@ -142,3 +147,20 @@ message StreamSwitchNDef { uint32 element_size = 5; uint32 data_type = 6; } + +message LabelSetDef { + uint32 op_index = 1; + uint32 label_id = 2; + uint32 model_id = 3; +} + +message LabelGotoExDef { + uint32 op_index = 1; + uint32 label_id = 2; + uint32 model_id = 3; +} + +message LabelSwitchByIndexDef { + uint32 op_index = 1; + uint32 label_max = 2; +} diff --git a/tests/depends/omg/src/omg_stub.cc b/tests/depends/omg/src/omg_stub.cc index 7197dac7..224d4128 100644 --- a/tests/depends/omg/src/omg_stub.cc +++ b/tests/depends/omg/src/omg_stub.cc @@ -122,6 +122,7 @@ struct OmFileContext { class SubGraphInfo; using SubGraphInfoPtr = std::shared_ptr; +using Graph2SubGraphInfoList = std::unordered_map>; using GeModelPartitionPtr = std::shared_ptr; using ModelPtr = std::shared_ptr; @@ -220,7 +221,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void OmFileSaveHelper::AddParti } class ModelBuilder { public: - ModelBuilder(ge::ComputeGraphPtr compute_graph, const std::vector &subgraphs, + ModelBuilder(ge::ComputeGraphPtr compute_graph, const Graph2SubGraphInfoList &subgraphs, const 
std::map &stream_max_parallel_num, bool hcom_parallel, int mode); virtual ~ModelBuilder(); Status BuildModel(ge::Model &model_def); @@ -235,7 +236,7 @@ class ModelBuilder { ge::Buffer weight_buffer_; }; -ModelBuilder::ModelBuilder(ge::ComputeGraphPtr compute_graph, const std::vector &subgraphs, +ModelBuilder::ModelBuilder(ge::ComputeGraphPtr compute_graph, const Graph2SubGraphInfoList &subgraphs, const std::map &stream_max_parallel_num, bool hcom_parallel, int mode) { weight_buffer_ = ge::Buffer(4100000); } diff --git a/tests/ut/ge/CMakeLists.txt b/tests/ut/ge/CMakeLists.txt index 5ed130c7..fa94bab1 100755 --- a/tests/ut/ge/CMakeLists.txt +++ b/tests/ut/ge/CMakeLists.txt @@ -44,6 +44,7 @@ include_directories(${GE_SOURCE_DIR}/inc/graph) include_directories(${GE_SOURCE_DIR}/inc/framework) include_directories(${GE_SOURCE_DIR}/inc/common) include_directories(${GE_SOURCE_DIR}/third_party/securec/include) +include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib) include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc) include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc/cce) include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc/ops) diff --git a/tests/ut/ge/common/datatype_transfer_unittest.cc b/tests/ut/ge/common/datatype_transfer_unittest.cc index 5f11b272..e0f258a9 100644 --- a/tests/ut/ge/common/datatype_transfer_unittest.cc +++ b/tests/ut/ge/common/datatype_transfer_unittest.cc @@ -368,14 +368,20 @@ TEST_F(UtestDataTypeTransfer, invalid_src_data_type) { EXPECT_EQ(transfer.TransDataType(args, result), UNSUPPORTED); } -TEST_F(UtestDataTypeTransfer, src_shape_empry) { - uint8_t data[1 * 4 * 4 * 1] = {0}; +TEST_F(UtestDataTypeTransfer, src_shape_empty) { + uint8_t data[1*4*4*1] = {0}; + constexpr int64_t kShapeItemNumMAX = 1024UL * 1024UL * 1024UL * 1024UL; DataTypeTransfer transfer; - CastArgs args{reinterpret_cast(data), 0, DT_UINT8, DT_INT32}; + CastArgs args { + reinterpret_cast(data), + 0, + DT_UINT8, + DT_INT32 + }; TransResult result; - EXPECT_EQ(transfer.TransDataType(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransDataType(args, result), SUCCESS); } TEST_F(UtestDataTypeTransfer, unsupprot_trans) { diff --git a/tests/ut/ge/common/format_transfer_nhwc_5d_unittest.cc b/tests/ut/ge/common/format_transfer_nhwc_5d_unittest.cc index 8d1ff256..b4beb6ce 100644 --- a/tests/ut/ge/common/format_transfer_nhwc_5d_unittest.cc +++ b/tests/ut/ge/common/format_transfer_nhwc_5d_unittest.cc @@ -701,7 +701,7 @@ TEST_F(UtestFormatTransferNhwc5d, invalid_src_shape2) { EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); Status status = transfer.TransShape(args.src_format, args.src_shape, args.src_data_type, args.dst_format, args.dst_shape); - EXPECT_EQ(status, PARAM_INVALID); + EXPECT_EQ(status, SUCCESS); } TEST_F(UtestFormatTransferNhwc5d, invalid_src_format) { diff --git a/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc b/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc index f9799b49..e49005e8 100644 --- a/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc +++ b/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc @@ -21,6 +21,7 @@ #define protected public #define private public #include "graph/manager/graph_manager_utils.h" +#include "common/op/attr_value_util.h" #undef protected #undef private @@ -189,18 +190,20 @@ class UtestLogicalStreamAllocator : public testing::Test { bool ExpectStreamEq(SubGraphInfoPtr subgraph, int64_t expect) { return GetStream(subgraph) == expect; } bool ExpectStreamNe(SubGraphInfoPtr 
subgraph, int64_t expect) { return GetStream(subgraph) != expect; } - Status AssignLogicalStreams(vector subgraphs, vector &confs, + Status AssignLogicalStreams(Graph2SubGraphInfoList &subgraph_map, vector &confs, std::map &max_parallel_num, ComputeGraphPtr &whole_graph) { SchedulerConf scheduler_conf; if (confs.empty()) { - for (const auto &subgraph : subgraphs) { - EngineConfPtr conf = make_shared(); - conf->id = subgraph->GetEngineName(); - if (conf->id == "ge_local") { - conf->skip_assign_stream = true; - conf->attach = true; + for (const auto &subgraph_pair : subgraph_map) { + for (const auto &subgraph : subgraph_pair.second) { + EngineConfPtr conf = make_shared(); + conf->id = subgraph->GetEngineName(); + if (conf->id == "ge_local") { + conf->skip_assign_stream = true; + conf->attach = true; + } + scheduler_conf.cal_engines[conf->id] = conf; } - scheduler_conf.cal_engines[conf->id] = conf; } } else { for (auto &conf : confs) { @@ -217,24 +220,33 @@ class UtestLogicalStreamAllocator : public testing::Test { scheduler_confs["scheduler"] = scheduler_conf; LogicalStreamAllocator allocator(scheduler_confs, max_parallel_num); int64_t stream_num = 0; - return allocator.Assign(whole_graph, subgraphs, stream_num); + return allocator.Assign(whole_graph, subgraph_map, stream_num); } - Status AssignLogicalStreams(vector subgraphs, std::map &max_parallel_num, - vector &confs) { - ComputeGraphPtr whole_graph = make_shared("whole_graph"); + Status AssignLogicalStreams(vector subgraphs, + vector &confs, + std::map &max_parallel_num, + ComputeGraphPtr &whole_graph) { + Graph2SubGraphInfoList subgraph_map; + subgraph_map[whole_graph] = subgraphs; + return AssignLogicalStreams(subgraph_map, confs, max_parallel_num, whole_graph); + } + + Status AssignLogicalStreams(vector subgraphs, vector& confs, + std::map& max_parallel_num) { + ComputeGraphPtr whole_graph = make_shared < ComputeGraph > ("whole_graph"); return AssignLogicalStreams(subgraphs, confs, max_parallel_num, whole_graph); } Status AssignLogicalStreams(vector subgraphs, vector confs = vector()) { std::map max_parallel_num; - return AssignLogicalStreams(subgraphs, max_parallel_num, confs); + return AssignLogicalStreams(subgraphs, confs, max_parallel_num); } - Status AssignLogicalStreams(vector subgraphs, std::map &max_parallel_num) { - vector confs; - return AssignLogicalStreams(subgraphs, max_parallel_num, confs); + Status AssignLogicalStreams(vector subgraphs, std::map& max_parallel_num) { + vector < EngineConfPtr > confs; + return AssignLogicalStreams(subgraphs, confs, max_parallel_num); } /// typical case @@ -294,8 +306,8 @@ class UtestLogicalStreamAllocator : public testing::Test { max_parallel_num["aicpu"] = parallel_num; Status status = AssignLogicalStreams({const1, const2, get_next, genmask1, genmask2, domask, subgraph4, subgraph5, - subgraph6, allreduce1, allreduce2, apply1, apply2}, - max_parallel_num, confs); + subgraph6, allreduce1, allreduce2, apply1, apply2}, confs, + max_parallel_num); EXPECT_EQ(status, ge::SUCCESS); EXPECT_EQ(GetStream(get_next), 0); @@ -324,7 +336,7 @@ class UtestLogicalStreamAllocator : public testing::Test { /// E --> F(AllReduce) --- G /// stream id: 2 2 2 /// - void make_graph_with_allreduce(ge::ComputeGraphPtr graph) { + void MakeGraphWithAllreduce(ge::ComputeGraphPtr graph) { ge::OpDescPtr op_a = make_shared("A", DATA); auto desc_temp_ptr = make_shared(); auto desc_temp = *desc_temp_ptr; @@ -337,6 +349,7 @@ class UtestLogicalStreamAllocator : public testing::Test { ge::OpDescPtr op_c = make_shared("C", 
"HcomAllReduce"); op_c->AddInputDesc(desc_temp); + op_c->AddInputDesc(desc_temp); op_c->AddOutputDesc(desc_temp); ge::OpDescPtr op_d = make_shared("D", "testa"); @@ -349,12 +362,21 @@ class UtestLogicalStreamAllocator : public testing::Test { ge::OpDescPtr op_f = make_shared("F", "HcomAllReduce"); op_f->AddInputDesc(desc_temp); + op_f->AddInputDesc(desc_temp); op_f->AddOutputDesc(desc_temp); ge::OpDescPtr op_g = make_shared("G", "testa"); op_g->AddInputDesc(desc_temp); op_g->AddOutputDesc(desc_temp); + ge::OpDescPtr op_h = make_shared("H", "testa"); + op_h->AddInputDesc(desc_temp); + op_h->AddOutputDesc(desc_temp); + + ge::OpDescPtr op_i = make_shared("I", "testa"); + op_i->AddInputDesc(desc_temp); + op_i->AddOutputDesc(desc_temp); + // add node ge::NodePtr node_a = graph->AddNode(op_a); ge::NodePtr node_b = graph->AddNode(op_b); @@ -363,14 +385,18 @@ class UtestLogicalStreamAllocator : public testing::Test { ge::NodePtr node_e = graph->AddNode(op_e); ge::NodePtr node_f = graph->AddNode(op_f); ge::NodePtr node_g = graph->AddNode(op_g); + ge::NodePtr node_h = graph->AddNode(op_h); + ge::NodePtr node_i = graph->AddNode(op_i); // add edge - ge::GraphUtils::AddEdge(node_a->GetOutDataAnchor(0), node_b->GetInDataAnchor(0)); - ge::GraphUtils::AddEdge(node_a->GetOutDataAnchor(0), node_e->GetInDataAnchor(0)); - ge::GraphUtils::AddEdge(node_b->GetOutDataAnchor(0), node_c->GetInDataAnchor(0)); - ge::GraphUtils::AddEdge(node_c->GetOutDataAnchor(0), node_d->GetInDataAnchor(0)); - ge::GraphUtils::AddEdge(node_e->GetOutDataAnchor(0), node_f->GetInDataAnchor(0)); - ge::GraphUtils::AddEdge(node_f->GetOutDataAnchor(0), node_g->GetInDataAnchor(0)); + node_a->GetOutDataAnchor(0)->LinkTo(node_b->GetInDataAnchor(0)); + node_a->GetOutDataAnchor(0)->LinkTo(node_e->GetInDataAnchor(0)); + node_b->GetOutDataAnchor(0)->LinkTo(node_c->GetInDataAnchor(0)); + node_c->GetOutDataAnchor(0)->LinkTo(node_d->GetInDataAnchor(0)); + node_e->GetOutDataAnchor(0)->LinkTo(node_f->GetInDataAnchor(0)); + node_f->GetOutDataAnchor(0)->LinkTo(node_g->GetInDataAnchor(0)); + node_h->GetOutDataAnchor(0)->LinkTo(node_c->GetInDataAnchor(1)); + node_i->GetOutDataAnchor(0)->LinkTo(node_f->GetInDataAnchor(1)); // add stream id node_a->GetOpDesc()->SetStreamId(0); @@ -380,6 +406,14 @@ class UtestLogicalStreamAllocator : public testing::Test { node_e->GetOpDesc()->SetStreamId(2); node_f->GetOpDesc()->SetStreamId(2); node_g->GetOpDesc()->SetStreamId(2); + + // add stream label + string stream_label1 = "1"; + (void) AttrUtils::SetStr(node_c->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label1); + (void) AttrUtils::SetStr(node_d->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label1); + string stream_label2 = "2"; + (void) AttrUtils::SetStr(node_f->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label2); + (void) AttrUtils::SetStr(node_g->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label2); } }; @@ -652,7 +686,7 @@ TEST_F(UtestLogicalStreamAllocator, test_independent) { vector confs = {conf1, conf2}; Status status = - AssignLogicalStreams({subgraph1, subgraph2, subgraph3, subgraph4, subgraph5}, max_parallel_num, confs); + AssignLogicalStreams({subgraph1, subgraph2, subgraph3, subgraph4, subgraph5}, confs, max_parallel_num); EXPECT_EQ(status, ge::SUCCESS); EXPECT_EQ(GetStream(subgraph1), 0); EXPECT_EQ(GetStream(subgraph2), 0); @@ -695,7 +729,7 @@ TEST_F(UtestLogicalStreamAllocator, test_independent_switch_label) { vector confs = {conf1, conf2, conf3}; Status status = - AssignLogicalStreams({subgraph1, subgraph2, subgraph3, subgraph4, subgraph5}, 
max_parallel_num, confs); + AssignLogicalStreams({subgraph1, subgraph2, subgraph3, subgraph4, subgraph5}, confs, max_parallel_num); EXPECT_EQ(status, ge::SUCCESS); EXPECT_EQ(GetStream(subgraph1), 4); EXPECT_EQ(GetStream(subgraph2), 0); @@ -833,9 +867,9 @@ TEST_F(UtestLogicalStreamAllocator, test_reassign_stream) { auto node1_1 = whole_graph->AddNode(node1->GetOpDesc()); auto node1_2 = whole_graph->AddNode(node2->GetOpDesc()); auto node1_3 = whole_graph->AddNode(node3->GetOpDesc()); - GraphUtils::AddEdge(node1_1->GetOutControlAnchor(), node1_2->GetInControlAnchor()); - GraphUtils::AddEdge(node1_2->GetOutDataAnchor(0), node1_3->GetInDataAnchor(0)); - GraphUtils::AddEdge(node1->GetOutControlAnchor(), node2->GetInControlAnchor()); + node1_1->GetOutControlAnchor()->LinkTo(node1_2->GetInControlAnchor()); + node1_2->GetOutDataAnchor(0)->LinkTo(node1_3->GetInDataAnchor(0)); + node1->GetOutControlAnchor()->LinkTo(node2->GetInControlAnchor()); std::map max_parallel_num; vector subgraphs = {subgraph1, const2, subgraph3}; @@ -853,7 +887,7 @@ TEST_F(UtestLogicalStreamAllocator, test_all_reduce_parallel_pass) { ge::ComputeGraphPtr graph = make_shared(""); graph->SetName("TestAllReduceParallelPass"); - make_graph_with_allreduce(graph); + MakeGraphWithAllreduce(graph); std::map max_parallel_num; LogicalStreamPass::Context context; @@ -863,7 +897,13 @@ TEST_F(UtestLogicalStreamAllocator, test_all_reduce_parallel_pass) { LogicalStreamPassPtr allreduce_pass = std::make_shared(); ret = allreduce_pass->Run(graph, subgraphs, context); - EXPECT_EQ(ret, NOT_CHANGED); + EXPECT_EQ(ret, SUCCESS); + + ge::NodePtr node_d = graph->FindNode("D"); + ge::NodePtr node_g = graph->FindNode("G"); + int64_t stream_d = node_d->GetOpDesc()->GetStreamId(); + int64_t stream_g = node_g->GetOpDesc()->GetStreamId(); + EXPECT_EQ(stream_d + stream_g, 11); } } // namespace ge diff --git a/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc b/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc index f8deff7f..a51299b3 100644 --- a/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc +++ b/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc @@ -315,7 +315,7 @@ TEST_F(UtestModelManagerDavinciModel, success_GetInputOutputDescInfo_without_net auto node = compute_graph->AddNode(op_desc); model.data_op_list_.push_back(op_desc); - model.output_size_list_.push_back(32); + model.output_data_info_[0] = {32, (void *)0x70002010}; model.op_list_[0] = op_desc; @@ -419,7 +419,7 @@ TEST_F(UtestModelManagerDavinciModel, success_get_input_output_descInfo_with_net model.op_list_[0] = op_desc; model.output_op_list_.push_back(op_desc); - model.output_size_list_.push_back(32); + model.output_data_info_[0] = {32, (void *)0x70002010}; vector input_shapes; vector output_shapes; @@ -463,7 +463,7 @@ TEST_F(UtestModelManagerDavinciModel, success_get_input_output_desc_info_for_zer model.op_list_[0] = op_desc; model.output_op_list_.push_back(op_desc); - model.output_size_list_.push_back(32); + model.output_data_info_[0] = {32, (void *)0x70002010}; model.output_memory_size_list_.push_back(64); vector input_shapes; @@ -508,7 +508,7 @@ TEST_F(UtestModelManagerDavinciModel, success_get_input_output_desc_info_dim_siz model.op_list_[0] = op_desc; model.output_op_list_.push_back(op_desc); - model.output_size_list_.push_back(32); + model.output_data_info_[0] = {32, (void *)0x70002010}; vector input_shapes; vector output_shapes; @@ -1282,7 +1282,7 @@ TEST_F(UtestModelManagerDavinciModel, 
success_get_output_desc_info_with_netoutpu model.op_list_[0] = op_desc; model.output_op_list_.push_back(op_desc); - model.output_size_list_.push_back(32); + model.output_data_info_[0] = {32, (void *)0x70002010}; model.output_memory_size_list_.push_back(64); vector output_shapes; diff --git a/tests/ut/ge/graph/load/output_net_output_unittest.cc b/tests/ut/ge/graph/load/output_net_output_unittest.cc index 52fdebfa..ca0eb871 100644 --- a/tests/ut/ge/graph/load/output_net_output_unittest.cc +++ b/tests/ut/ge/graph/load/output_net_output_unittest.cc @@ -131,25 +131,6 @@ TEST_F(UtestNetOutput, true_is_output) { delete model_utils; } -// test ModelUtils::IsInputTensorNeedTrans -TEST_F(UtestNetOutput, success_is_output_tensor_need_trans) { - ModelUtils *model_utils = new ModelUtils(); - std::shared_ptr op_desc = std::make_shared(); - OmeTestOpDescBuilder builder(op_desc); - builder.SetType("NetOutput"); - size_t tensor_index = 1; - vector outputs_desc; - std::shared_ptr desc = std::make_shared(); - outputs_desc.push_back(desc); - op_desc->outputs_desc_ = outputs_desc; - op_desc->inputs_desc_ = outputs_desc; - - bool ret = model_utils->IsInputTensorNeedTrans(op_desc, tensor_index); - EXPECT_EQ(false, ret); - - delete model_utils; -} - // test ModelUtils::GetOutputSize TEST_F(UtestNetOutput, success_get_output_size) { vector v_output_size; diff --git a/third_party/fwkacllib/inc/mmpa/mmpa_api.h b/third_party/fwkacllib/inc/mmpa/mmpa_api.h index ce1c9720..f1e30538 100644 --- a/third_party/fwkacllib/inc/mmpa/mmpa_api.h +++ b/third_party/fwkacllib/inc/mmpa/mmpa_api.h @@ -20,7 +20,7 @@ #define LINUX 0 #define WIN 1 -#if(OS_TYPE == LINUX) //lint !e553 +#if(OS_TYPE == LINUX) #ifndef _GNU_SOURCE #define _GNU_SOURCE @@ -84,7 +84,7 @@ #endif -#if(OS_TYPE == WIN) //lint !e553 +#if(OS_TYPE == WIN) #include #include #include "Windows.h" diff --git a/third_party/fwkacllib/inc/ops/all_ops.h b/third_party/fwkacllib/inc/ops/all_ops.h index 36c991ff..37315c74 100644 --- a/third_party/fwkacllib/inc/ops/all_ops.h +++ b/third_party/fwkacllib/inc/ops/all_ops.h @@ -35,7 +35,6 @@ #include "decode_wheels_target.h" #include "elewise_calculation_ops.h" #include "fastrcnn_predictions.h" -#include "fsrdetectionoutput_ops.h" #include "functional_ops.h" #include "get_data_ops.h" #include "hcom_ops.h" @@ -58,7 +57,6 @@ #include "outfeed_ops.h" #include "pad_ops.h" #include "parsing_ops.h" -#include "power_ops.h" #include "quantize_ops.h" #include "ragged_conversion_ops.h" #include "random_ops.h" diff --git a/third_party/fwkacllib/inc/ops/array_ops.h b/third_party/fwkacllib/inc/ops/array_ops.h index 74f8924a..7febad77 100644 --- a/third_party/fwkacllib/inc/ops/array_ops.h +++ b/third_party/fwkacllib/inc/ops/array_ops.h @@ -595,6 +595,9 @@ REG_OP(ExpandDims) *@par Outputs: *y: A tensor. + +*@par Attention: +*This operator cannot be directly called by the acllopExecute API. */ REG_OP(Reshape) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, @@ -848,7 +851,7 @@ REG_OP(Copy) `farmhash::fingerprint64`. *@par Outputs: -y: A two-dimensional `Tensor` of type `tf.uint8`. The first dimension equals to \n +y: A two-dimensional `Tensor` of type `uint8`. The first dimension equals to \n `data`'s first dimension, and the second dimension size depends on the \n fingerprint algorithm. 
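The fingerprint contract documented just above pins down the output shape: with method farmhash::fingerprint64 each row of "data" hashes to one 64-bit value, so the uint8 output is [data.dim(0), 8]. A small sketch of that shape rule (illustration only, not GE code; the method name string is an assumption):

// Output shape rule for the fingerprint operator described above:
// 2-D uint8, first dim = input's first dim, second dim = bytes per
// fingerprint (8 for a 64-bit farmhash fingerprint).
#include <array>
#include <cstdint>
#include <string>

std::array<int64_t, 2> FingerprintOutputShape(int64_t batch, const std::string &method) {
  const int64_t bytes_per_row = (method == "farmhash64") ? 8 : -1;  // -1 marks unsupported methods
  return {batch, bytes_per_row};
}
// e.g. FingerprintOutputShape(32, "farmhash64") -> {32, 8}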
diff --git a/third_party/fwkacllib/inc/ops/data_flow_ops.h b/third_party/fwkacllib/inc/ops/data_flow_ops.h index 0eaee06c..fee5e67d 100644 --- a/third_party/fwkacllib/inc/ops/data_flow_ops.h +++ b/third_party/fwkacllib/inc/ops/data_flow_ops.h @@ -259,7 +259,7 @@ match this name to the matching Unstage Op. REG_OP(Stage) .DYNAMIC_INPUT(values, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, \ DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \ - DT_DOUBLE})) + DT_DOUBLE, DT_UINT32, DT_UINT64})) .ATTR(capacity, Int, 0) .ATTR(memory_limit, Int, 0) .ATTR(container, String, "") @@ -312,7 +312,7 @@ REG_OP(StagePeek) .INPUT(index, TensorType({DT_INT32})) .DYNAMIC_OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT16, \ DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \ - DT_DOUBLE})) + DT_DOUBLE, DT_UINT32, DT_UINT64})) .ATTR(capacity, Int, 0) .ATTR(memory_limit, Int, 0) .ATTR(container, String, "") @@ -363,7 +363,7 @@ REG_OP(StackPop) .INPUT(handle, TensorType({DT_RESOURCE})) .OUTPUT(element, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT16, \ DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \ - DT_DOUBLE})) + DT_DOUBLE, DT_UINT32, DT_UINT64})) .REQUIRED_ATTR(elem_type, Type) .OP_END_FACTORY_REG(StackPop) @@ -388,10 +388,10 @@ REG_OP(StackPush) .INPUT(handle, TensorType({DT_RESOURCE})) .INPUT(element, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT16, \ DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \ - DT_DOUBLE})) + DT_DOUBLE, DT_UINT32, DT_UINT64})) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT16, \ DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \ - DT_DOUBLE})) + DT_DOUBLE, DT_UINT32, DT_UINT64})) .ATTR(swap_memory, Bool, false) .OP_END_FACTORY_REG(StackPush) @@ -540,7 +540,7 @@ REG_OP(ParallelDynamicStitch) *@par Attributes:An optional int that is >= 0. Defaults to "0". *@li capacity: An optional int that is >= 0. Defaults to "0". *@li memory_limit: An optional int that is >= 0. Defaults to "0". -*@li dtypes: A list of tf.DTypes. +*@li dtypes: A list of DTypes. *@li container: An optional string. Defaults to "". *@li shared_name: An optional string. Defaults to "". @@ -563,7 +563,7 @@ REG_OP(MapClear) *@par Attributes: *@li capacity: An optional int that is >= 0. Defaults to "0". *@li memory_limit: An optional int that is >= 0. Defaults to "0". -*@li dtypes: A list of tf.DTypes. +*@li dtypes: A list of DTypes. *@li container: An optional string. Defaults to "". *@li shared_name: An optional string. Defaults to "". @@ -602,7 +602,7 @@ REG_OP(MapIncompleteSize) REG_OP(Unstage) .DYNAMIC_OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT16, \ DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \ - DT_DOUBLE})) + DT_DOUBLE, DT_UINT32, DT_UINT64})) .ATTR(capacity, Int, 0) .ATTR(memory_limit, Int, 0) .ATTR(container, String, "") @@ -630,7 +630,7 @@ DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32. Maximum number of elements in the Staging Area. If > 0, \n inserts on the container will block when the capacity is reached. *@li memory_limit: An optional int that is >= 0. Defaults to "0". -*@li dtypes: A list of tf.DTypes. +*@li dtypes: A list of DTypes. *@li container: An optional string. Defaults to "". \n If non-empty, this queue is placed in the given container. \n Otherwise, a default container is used. @@ -752,7 +752,7 @@ REG_OP(MapUnstageNoKey) *@par Attributes: *@li capacity: An optional int that is >= 0. Defaults to "0". *@li memory_limit: An optional int that is >= 0. Defaults to "0". -*@li dtypes: A list of tf.DTypes that has length >= 1. 
+*@li dtypes: A list of DTypes that has length >= 1. *@li container: An optional string. Defaults to "". *@li shared_name: An optional string. Defaults to "". @@ -789,7 +789,7 @@ REG_OP(MapPeek) *@par Attributes: *@li capacity: An optional int that is >= 0. Defaults to "0". *@li memory_limit: An optional int that is >= 0. Defaults to "0". -*@li dtypes: A list of tf.DTypes. +*@li dtypes: A list of DTypes. *@li container: An optional string. Defaults to "". *@li shared_name: An optional string. Defaults to "". @@ -1183,7 +1183,7 @@ REG_OP(PaddingFIFOQueue) *@brief A queue that produces elements sorted by the first component value. *@par Attributes: -*@li component_types: An optional list of tf.DTypes. Defaults to {}. \n +*@li component_types: An optional list of DTypes. Defaults to {}. \n The type of each component in a value. *@li shapes: A list of shapes for each component of a queue element. The length of this attr must be either 0 or the same as the length of \n @@ -1451,7 +1451,7 @@ REG_OP(OrderedMapUnstageNoKey) *@par Attributes: *@li capacity: An optional int that is >= 0. Defaults to "0". *@li memory_limit: An optional int that is >= 0. Defaults to "0". -*@li dtypes: A list of tf.DTypes that has length >= 1. +*@li dtypes: A list of DTypes that has length >= 1. *@li container: An optional string. Defaults to "". *@li shared_name: An optional string. Defaults to "". @@ -1876,7 +1876,7 @@ REG_OP(SparseAccumulatorApplyGradient) .INPUT(local_step, TensorType({DT_INT64})) .INPUT(indices, TensorType({DT_INT64})) .INPUT(values, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ - DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT,DT_FLOAT16, DT_UINT32, \ + DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLOAT16, DT_UINT32, \ DT_UINT64, DT_COMPLEX64, DT_COMPLEX128, DT_QINT16, DT_QUINT16, \ DT_QINT8, DT_QUINT8, DT_QINT32})) .INPUT(shape, TensorType({DT_INT64}))
diff --git a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h index d5272805..3eff2cbe 100644 --- a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h @@ -886,7 +886,10 @@ REG_OP(BesselI1e) * y: A Tensor of type UnaryDataType. * @attention Constraints: -* @li base > 0 or if base is set to default (-1), base is set to e; +* @li "base" is supposed to be greater than 0. Retaining the default \n +* value "-1" sets "base" to "e". +* @li If the input value of operator Log is within the range (0, 0.01] or \n +* [0.95, 1.05], the output accuracy is subject to change. */ REG_OP(Log) .INPUT(x, TensorType::UnaryDataType()) @@ -2056,6 +2059,7 @@ REG_OP(ArgMinWithValue) * "0": product, "1": sum, "2": max. *@li coeff: A required attribute. Must met all of following rules: * size of "coeff" must be equal to len("x") or is null. +* the absolute value of "coeff" must be less than or equal to 1. */ REG_OP(Eltwise) .DYNAMIC_INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
diff --git a/third_party/fwkacllib/inc/ops/fsrdetectionoutput_ops.h b/third_party/fwkacllib/inc/ops/fsrdetectionoutput_ops.h deleted file mode 100644 index 2b3e206d..00000000 --- a/third_party/fwkacllib/inc/ops/fsrdetectionoutput_ops.h +++ /dev/null @@ -1,67 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef GE_OP_FSRDETECTIONOUTPUT_OPS_H_ -#define GE_OP_FSRDETECTIONOUTPUT_OPS_H_ -#include "graph/operator_reg.h" - -namespace ge { -/** -*@brief Returns detection result. - -*@par Inputs: -* Four inputs, including: -*@li rois: An NCHW tensor of type floa16 or float32, output from operator proposal_d at the preceding layer, used as the input of operator FSRDetectionOutput. -*@li prior_box: An NCHWC0 tensor of type floa16 or float32, specifying the prediction offset, used to update the coordinates [x1, y1, x2, y2] of each ROI. -*@li score: An NCHWC0 tensor of type floa16 or float32, specifying the probability of each class. Class 0 is the background class. -*@li actual_rois_num: An NCHW tensor of type int32, specifying the number of valid boxes per batch. -*@par Attributes: -*@li batch_rois: An optional int32, specifying the number of images to be predicted. Defaults to "1024". The value range is [1, 1024]. -*@li im_info: An optional list of two ints. Defaults to (375, 1024). The value range is [1, 1024]. -*@li num_classes: An optional int32, specifying the number of classes to be predicted. Defaults to "80". The value must be greater than 0. -*@li max_rois_num: An optional int32, specifying the maximum number of ROIs per batch. Defaults to "1024". The value must be a multiple of 16. -*@li score_thresh: An optional float32, specifying the threshold for box filtering. Defaults to 0.45. The value range is [0.0, 1.0]. -*@li nms_thresh: An optional float32, specifying the confidence threshold for box filtering, which is the output "obj" of operator Region. Defaults to 0.7. The value range is (0.0, 1.0). -*@li bbox_reg_weights: An optional list of four ints. Defaults to (1, 1, 1, 1). Must not have value "0". -*@li post_nms_topn: An optional int, specifying the number of output boxes. Defaults to "304". The value must be less than or equal to 1024 and must be a multiple of 16. -*@li kernel_name: An optional string, specifying the operator name. Defaults to "fsr_detection_output". -*@par Outputs: -*box: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence. -*actual_bbox_num: An NCHW tensor of type int32, specifying the number of output boxes. - -*@attention Constraints:\n -*@li totalnum < max_rois_num * batch_rois. -*@li "score" must be with shape (total_num, (num_classes+15)//16, 1, 1, 16), where "total_num" indicates the number of valid input boxes of all images. -*@li "prior_box" must be with shape (total_num, (num_classes*4+15)//16, 1, 1, 16), where "total_num" indicates the number of valid input boxes of all images. 
-*/ -REG_OP(FSRDetectionOutput) - .INPUT(rois, TensorType({DT_FLOAT, DT_FLOAT16})) - .INPUT(prior_box, TensorType({DT_FLOAT, DT_FLOAT16})) - .INPUT(score, TensorType({DT_FLOAT, DT_FLOAT16})) - .INPUT(actual_rois_num, TensorType({DT_INT32})) - .OUTPUT(actual_bbox_num, TensorType({DT_INT32})) - .OUTPUT(box, TensorType({DT_FLOAT, DT_FLOAT16})) - .ATTR(batch_rois, Int, 1024) - .ATTR(im_info, ListInt, {375,1024}) - .ATTR(num_classes, Int, 80) - .ATTR(max_rois_num, Int, 1024) - .ATTR(score_thresh, Float, 0.45) - .ATTR(nms_thresh, Float, 0.7) - .ATTR(bbox_reg_weights, ListInt, {1,1,1,1}) - .ATTR(post_nms_topn, Int, 304) - .OP_END_FACTORY_REG(FSRDetectionOutput) -} -#endif
diff --git a/third_party/fwkacllib/inc/ops/image_ops.h b/third_party/fwkacllib/inc/ops/image_ops.h index 2ac7a70e..aaad03c6 100644 --- a/third_party/fwkacllib/inc/ops/image_ops.h +++ b/third_party/fwkacllib/inc/ops/image_ops.h @@ -525,8 +525,7 @@ REG_OP(ResizeBilinearV2) .INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .INPUT(size, TensorType({DT_INT32})) - .OUTPUT(y, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, - DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .OUTPUT(y, TensorType({DT_FLOAT})) .ATTR(align_corners, Bool, false) .ATTR(half_pixel_centers, Bool, false) .OP_END_FACTORY_REG(ResizeBilinearV2) @@ -925,7 +924,7 @@ images[3] <= 2048. */ REG_OP(ResizeBilinearV2D) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) - .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT})) .ATTR(align_corners, Bool, false) .ATTR(half_pixel_centers, Bool, false) .REQUIRED_ATTR(size, ListInt)
diff --git a/third_party/fwkacllib/inc/ops/math_ops.h b/third_party/fwkacllib/inc/ops/math_ops.h index aa318c94..cc97a337 100644 --- a/third_party/fwkacllib/inc/ops/math_ops.h +++ b/third_party/fwkacllib/inc/ops/math_ops.h @@ -23,6 +23,29 @@ namespace ge { /** +*@brief Computes the output as (shift + scale * x) ^ power. + +*@par Inputs: +* x: A Tensor of type float16 or float32. + +*@par Attributes: +*@li power: Optional. Defaults to 1.0. +*@li scale: Optional. Defaults to 1.0. +*@li shift: Optional. Defaults to 0.0. + +*@par Outputs: +* y: A Tensor. Has the same type and shape as "x". +*/ + +REG_OP(Power) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) + .ATTR(power, Float, 1.0) + .ATTR(scale, Float, 1.0) + .ATTR(shift, Float, 0.0) + .OP_END_FACTORY_REG(Power); + +/** *@brief Compute the lower regularized incomplete Gamma function P(a, x). *@par Inputs:
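The Power operator registered above is fully specified by its brief: y = (shift + scale * x) ^ power, applied elementwise. A host-side reference sketch with the documented attribute defaults (the real kernel runs on the device; this only pins down the math):

// Reference semantics for Power: y = (shift + scale * x) ^ power, elementwise.
#include <cmath>
#include <vector>

std::vector<float> PowerRef(const std::vector<float> &x, float power = 1.0f,
                            float scale = 1.0f, float shift = 0.0f) {
  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    y[i] = std::pow(shift + scale * x[i], power);
  }
  return y;
}
// With the default attributes (power = 1.0, scale = 1.0, shift = 0.0) this is the identity map.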
diff --git a/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h b/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h index 597a8982..dd2ce56c 100644 --- a/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h @@ -492,7 +492,7 @@ REG_OP(DiagPart) *@brief Also known as a "fully-connected" layer, computes an inner product with a set of learned weights, and (optionally) adds biases. *@par Inputs: -* Two inputs, including: +* Four inputs, including: *@li x: A Tensor of type float16, int8. *@li w: A weight matrix of type float16, int8. *@li b: A Tensor of type float16, int32. @@ -501,14 +501,13 @@ REG_OP(DiagPart) *@par Attributes: *@li num_output: Reserved. *@li transpose: A bool, specifying whether to transpose, either "true" or "false". Defaults to "false". -*@li bias_term: A bool, specifying whether to learn and apply a set of additive biases to the filter outputs, either "true" or "false". Defaults to "true". -*@li axis: only support axis is 1. Defaults to "1". -*@li offset_a: A type of Int, Defaults to "1". +*@li axis: Reserved. +*@li offset_x: Reserved. *@par Outputs: *y: The result tensor of type float16, int8. */ -REG_OP(InnerProduct) +REG_OP(FullyConnection) .INPUT(x, TensorType({DT_FLOAT16, DT_INT8})) .INPUT(w, TensorType({DT_FLOAT16, DT_INT8})) .OPTIONAL_INPUT(b, TensorType({DT_FLOAT16, DT_INT32})) @@ -516,10 +515,9 @@ .OUTPUT(y, TensorType({DT_FLOAT16, DT_INT32})) .REQUIRED_ATTR(num_output, Int) .ATTR(transpose, Bool, false) - .ATTR(bias_term, Bool, true) .ATTR(axis, Int, 1) - .ATTR(offset_a, Int, 0) - .OP_END_FACTORY_REG(InnerProduct) + .ATTR(offset_x, Int, 0) + .OP_END_FACTORY_REG(FullyConnection) /** *@brief Computes the confusion matrix from predictions and labels.
diff --git a/third_party/fwkacllib/inc/ops/nn_calculation_ops.h b/third_party/fwkacllib/inc/ops/nn_calculation_ops.h index 1be85a0e..bc492e1b 100644 --- a/third_party/fwkacllib/inc/ops/nn_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_calculation_ops.h @@ -62,7 +62,7 @@ namespace ge { * data is 5D with shape [N, C1, Ho, Wo, C0], * where C is the same as that of the feature map and C0 is 16.\n * Limited by Tiling and L1 / L0 buffer memory: 512 * ceil(Wo, 16) + (480 * -* stride_h + 32 * filter_h) * ceil(Wi, 16) â‰?l1_size and Hf*Wf â‰?l0b_size/512.\n +* stride_h + 32 * filter_h) * ceil(Wi, 16) ≤ l1_size and Hf*Wf ≤ l0b_size/512.\n */ REG_OP(DepthwiseConv2DBackpropFilter) .INPUT(input, TensorType({float16})) @@ -115,7 +115,7 @@ REG_OP(DepthwiseConv2DBackpropFilter) * data is 5D with shape [N, C1, Ho, Wo, C0], * where C is the same as that of the feature map and C0 is 16.\n * Limited by Tiling and L1 / L0 buffer memory: 512 * ceil(Wo, 16) + (480 * -* stride_h + 32 * filter_h) * ceil(Wi, 16) â‰?l1_size and Hf*Wf â‰?l0b_size/512.\n +* stride_h + 32 * filter_h) * ceil(Wi, 16) ≤ l1_size and Hf*Wf ≤ l0b_size/512.\n */ REG_OP(DepthwiseConv2DBackpropFilterD) .INPUT(input, TensorType({float16})) @@ -170,7 +170,7 @@ REG_OP(DepthwiseConv2DBackpropFilterD) * Output backprop is 4D with shape [N, C, Ho, Wo] or [N, Ho, Wo, C], but the * data is 5D with shape [N, C1, Ho, Wo, C0], * where C is the same as that of the feature map and C0 is 16.\n -* Limited by Tiling: max_h_in_l1 â‰?C0, where max_h_in_l1 = (l1_size - Hf * +* Limited by Tiling: max_h_in_l1 ≥ C0, where max_h_in_l1 = (l1_size - Hf * * Wf * C0 * C0 * 2) / (2 * Wo *C0).\n */ REG_OP(DepthwiseConv2DBackpropInput) @@ -223,7 +223,7 @@ REG_OP(DepthwiseConv2DBackpropInput) * Output backprop is 4D with shape [N, C, Ho, Wo] or [N, Ho, Wo, C], but the * data is 5D with shape [N, C1, Ho, Wo, C0], * where C is the same as that of the feature map and C0 is 16.\n -* Limited by Tiling: max_h_in_l1 â‰?C0, where max_h_in_l1 = (l1_size - Hf * +* Limited by Tiling: max_h_in_l1 ≥ C0, where max_h_in_l1 = (l1_size - Hf * * Wf * C0 * C0 * 2) / (2 * Wo *C0).\n */ REG_OP(DepthwiseConv2DBackpropInputD) @@ -439,13 +439,17 @@ REG_OP(Conv2DBackpropInputD) * One optional input: * @li bias: An optional tensor of type int8 *@par Attributes: - * Three attributes: + * Five attributes: * @li strides: A tuple or list of 2 integers. The stride of the sliding window * for H/W dimension. * @li pads: A tuple or list of 4 integers.
The [top, bottom, left, right] * padding on the feature map * @li dilations: A tuple or list of 4 integers. The dilation factor for each * dimension of input. Must be [1, 1, 1, 1]. + * @li groups: Number of blocked connections from input channels to \n output channels. + * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to "NHWC".\n Specify the data format of the input and output data. *@par Outputs: * y: A Tensor. Has the same type as "filter". 4D tensor with shape * [batch, height, width, channels] or [batch, channels, height, width]. @@ -458,6 +462,8 @@ REG_OP(Deconvolution) .ATTR(strides, ListInt, {1, 1, 1, 1}) .ATTR(pads, ListInt, {0, 0, 0, 0}) .ATTR(dilations, ListInt, {1, 1, 1, 1}) + .ATTR(groups, Int, 1) + .ATTR(data_format, String, "NHWC") .OP_END_FACTORY_REG(Deconvolution) /** *@brief Computes the gradients of convolution with respect to the filter @@ -631,7 +637,6 @@ REG_OP(Conv2D) *@par Attributes: *@li strides: A list of 5 ints. Specifies the stride of the sliding window for each dimension of "x". The N and C dimensions must be 1. Has the same format as "x". *@li pads: A list of 6 ints. Supports only padding along the D, H and W dimensions in sequence of head, tail, top, bottom, left and right. -*@li padding_mode: An optional string from: "zeros", "circular". Defaults to "zeros". *@li data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC". Specify the data format of the input and output data. *@li dilations: A list of 5 ints. Specifies the dilation factor for each dimension of "x". The N and C dimensions must be 1. Has the same format as "x". @@ -649,7 +654,6 @@ REG_OP(Conv3D) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .ATTR(strides, ListInt, {1, 1, 1, 1, 1}) .ATTR(pads, ListInt, {0, 0, 0, 0, 0, 0}) - .ATTR(padding_mode, String, "zeros") .ATTR(data_format, String, "NDHWC") .ATTR(dilations, ListInt, {1, 1, 1, 1, 1}) .OP_END_FACTORY_REG(Conv3D) @@ -671,7 +675,7 @@ REG_OP(Conv3D) * @li data_format: An optional string from: "NDHWC", "NCHWD". Defaults to "NDHWC". Specify the data format of the input and output data. *@par Outputs: * y: A Tensor. Has the same type as filter,and has same format as input_size -*/ +*/ REG_OP(Conv3DBackpropInput) .INPUT(input_sizes, TensorType({DT_INT32, DT_INT64})) .INPUT(filters, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) @@ -698,7 +702,7 @@ REG_OP(Conv3DBackpropInput) * @li data_format: An optional string from: "NDHWC", "NCHWD". Defaults to "NDHWC". Specify the data format of the input and output data. *@par Outputs: * y: A Tensor. Has the same type as filter -*/ +*/ REG_OP(Conv3DBackpropInputD) .INPUT(filters, TensorType({DT_FLOAT16})) .INPUT(grads, TensorType({DT_FLOAT16}))
diff --git a/third_party/fwkacllib/inc/ops/nn_detect_ops.h b/third_party/fwkacllib/inc/ops/nn_detect_ops.h index 04cc3028..f1d6e420 100644 --- a/third_party/fwkacllib/inc/ops/nn_detect_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_detect_ops.h @@ -311,6 +311,357 @@ REG_OP(PSROIPooling) .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16})) .OP_END_FACTORY_REG(PSROIPooling) +/** +*@brief Returns detection result. + +*@par Inputs: +* Four inputs, including: +*@li rois: An NCHW tensor of type float16 or float32, output from operator proposal_d at the preceding layer, used as the input of operator FSRDetectionOutput. +*@li prior_box: An NCHWC0 tensor of type float16 or float32, specifying the prediction offset, used to update the coordinates [x1, y1, x2, y2] of each ROI.
+*@li score: An NCHWC0 tensor of type float16 or float32, specifying the probability of each class. Class 0 is the background class. +*@li actual_rois_num: An NCHW tensor of type int32, specifying the number of valid boxes per batch. +*@par Attributes: +*@li batch_rois: An optional int32, specifying the number of images to be predicted. Defaults to "1024". The value range is [1, 1024]. +*@li im_info: An optional list of two ints. Defaults to (375, 1024). The value range is [1, 1024]. +*@li num_classes: An optional int32, specifying the number of classes to be predicted. Defaults to "80". The value must be greater than 0. +*@li max_rois_num: An optional int32, specifying the maximum number of ROIs per batch. Defaults to "1024". The value must be a multiple of 16. +*@li score_thresh: An optional float32, specifying the threshold for box filtering. Defaults to 0.45. The value range is [0.0, 1.0]. +*@li nms_thresh: An optional float32, specifying the confidence threshold for box filtering, which is the output "obj" of operator Region. Defaults to 0.7. The value range is (0.0, 1.0). +*@li bbox_reg_weights: An optional list of four ints. Defaults to (1, 1, 1, 1). Must not have value "0". +*@li post_nms_topn: An optional int, specifying the number of output boxes. Defaults to "304". The value must be less than or equal to 1024 and must be a multiple of 16. +*@li kernel_name: An optional string, specifying the operator name. Defaults to "fsr_detection_output". +*@par Outputs: +*box: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence. +*actual_bbox_num: An NCHW tensor of type int32, specifying the number of output boxes. + +*@attention Constraints:\n +*@li totalnum < max_rois_num * batch_rois. +*@li "score" must be with shape (total_num, (num_classes+15)//16, 1, 1, 16), where "total_num" indicates the number of valid input boxes of all images. +*@li "prior_box" must be with shape (total_num, (num_classes*4+15)//16, 1, 1, 16), where "total_num" indicates the number of valid input boxes of all images. +*/ +REG_OP(FSRDetectionOutput) + .INPUT(rois, TensorType({DT_FLOAT, DT_FLOAT16})) + .INPUT(prior_box, TensorType({DT_FLOAT, DT_FLOAT16})) + .INPUT(score, TensorType({DT_FLOAT, DT_FLOAT16})) + .INPUT(actual_rois_num, TensorType({DT_INT32})) + .OUTPUT(actual_bbox_num, TensorType({DT_INT32})) + .OUTPUT(box, TensorType({DT_FLOAT, DT_FLOAT16})) + .ATTR(batch_rois, Int, 1024) + .ATTR(im_info, ListInt, {375,1024}) + .ATTR(num_classes, Int, 80) + .ATTR(max_rois_num, Int, 1024) + .ATTR(score_thresh, Float, 0.45) + .ATTR(nms_thresh, Float, 0.7) + .ATTR(bbox_reg_weights, ListInt, {1,1,1,1}) + .ATTR(post_nms_topn, Int, 304) + .OP_END_FACTORY_REG(FSRDetectionOutput) + +/** +*@brief Normalizes data. It is called Region on YOLO v2 and Yolo on YOLO v3. + +*@par Inputs: +*x: An NCHW tensor of type float16 or float32. The data is with shape (N, boxes*(coords+obj+classes), H, W), where "obj" indicates the confidence of an object, and only one confidence is supported. Boxes are arranged as xx...xyy...yww...whh...hbb...bc0c0..c0c1c1...c1......cncn...cn. + +*@par Attributes: +*@li boxes: A required int32, specifying the number of anchor boxes. Defaults to "5" for V2 or "3" for V3. +*@li coords: An int32, specifying the number of parameters required for locating an object. The value is fixed at "4", corresponding to (x,y,w,h). +*@li classes: An int32, specifying the number of prediction classes. Defaults to "80". The value range is [1, 1024].
+*@li yolo_version: A string, specifying the YOLO version, either "V2" or "V3". +*@li softmax: A bool, specifying whether to perform softmax, valid only when "yolo_version = V2". +*@li background: A bool, specifying the operation types of the obj and classes, used in conjunction with "softmax" and valid only when "yolo_version = V2". +*@li softmaxtree: A bool. + +*@par Outputs: +*@li coord_data: A float16 or float32 with shape [N, boxes*coords, ceilx(height*width*2+32, 32)/2], where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the coordinates of a detected box. +*@li obj_prob: A float16 or float32 with shape [N, ceilx(boxes*height*width *2+32, 32)/2], where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the confidence. +*@li classes_prob: A float16 or float32 with shape [N, classes, ceilx(boxes*height*width *2+32, 32)/2], where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the prediction classes. + +*@attention Constraints: +*@li This operator applies to YOLO v2 and v3 networks. +*@li The succeeding layer of the Yolo operator must be operator Yolov3DetectionOutput. +*/ +REG_OP(Yolo) + .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(obj_prob, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(classes_prob, TensorType({DT_FLOAT16,DT_FLOAT})) + .ATTR(boxes, Int, 3) + .ATTR(coords, Int, 4) + .ATTR(classes, Int, 80) + .ATTR(yolo_version, String, "V3") + .ATTR(softmax, Bool, false) + .ATTR(background, Bool, false) + .ATTR(softmaxtree, Bool, false) + .OP_END_FACTORY_REG(Yolo) + +/** +*@brief Performs YOLO V2 detection. + +*@par Inputs: +* Four inputs, including: +*@li The outputs of operator Yolo at the preceding layer (that is, one Yolo operator on YOLO v2) are used as the inputs of operator Yolov2DetectionOutput. \n Each Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. +*@li imginfo: A float16, describing the image information including the required image height and width \n and the actual image height and width. +* +*@par Attributes: +*@li biases: A required float. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" +*@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. +*@li coords: Specifies the number of coordinate parameters. Must be 4. +*@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 80]. +*@li relative: An optional bool. Defaults to and must be "true". +*@li obj_threshold: A required float, specifying the confidence threshold for box filtering, which is the output "obj" of operator Yolo. The value range is [0.0, 1.0]. + +*@li post_nms_topn: An optional int32. This attribute is reserved. +*@li score_threshold: A required float, specifying the class score threshold for box filtering, which is the output "class" of operator Yolo. The value range is [0.0, 1.0]. +*@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n +*@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "1024". +* +*@par Outputs: +*@li boxout: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence.
+*@li boxoutnum: An NCHW tensor of type int32, specifying the number of output boxes. + +*@attention Constraints:\n +*@li This operator applies only to the YOLO v2 network. +*@li The preceding layer of operator Yolov2DetectionOutput must be one Yolo operator. + +*@see Yolo() +*/ +REG_OP(YoloV2DetectionOutput) + .INPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(obj_prob, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(classes_prob, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT})) + .REQUIRED_ATTR(biases, ListFloat) + .ATTR(boxes, Int, 5) + .ATTR(coords, Int, 4) + .ATTR(classes, Int, 80) + .ATTR(relative, Bool, true) + .ATTR(obj_threshold, Float, 0.5) + .ATTR(post_nms_topn, Int, 1024) + .ATTR(score_threshold, Float, 0.5) + .ATTR(iou_threshold, Float, 0.45) + .ATTR(pre_nms_topn, Int, 512) + .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(box_out_num, TensorType({DT_INT32})) + .OP_END_FACTORY_REG(YoloV2DetectionOutput) + +/** +*@brief Performs YOLO V2 detection. + +*@par Inputs: +*Six inputs, including: +*@li The outputs of operator Yolo at the preceding layer (that is, one Yolo operator on YOLO v2) are used as the inputs of operator Yolov2DetectionOutput. \n Each Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. +*@li imginfo: A float16, describing the image information including the required image height and width \n and the actual image height and width. +*@li windex: A windex tensor with shape [height, width]. Has the same type as the inputs. [[0,1,2...(width-1)],[0,1,2...(width-1)]...[0,1,2...(width-1)]] consisting of h groups of [0, 1, 2...(width-1)] is formed. \n + +*@li hindex: A hindex tensor with shape [height, width]. Has the same type as the inputs. [[0,0...0],[1,1...1],[2,2...2]...[height-1,height-1...,height-1]]. \n + +* +*@par Attributes: +*@li biases: A required float. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" +*@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. +*@li coords: Specifies the number of coordinate parameters. Must be 4. +*@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 80]. +*@li relative: An optional bool. Defaults to and must be "true". +*@li obj_threshold: A required float, specifying the confidence threshold for box filtering, which is the output "obj" of operator Yolo. The value range is [0.0, 1.0]. +*@li post_nms_topn: An optional int32. This attribute is reserved. +*@li score_threshold: A required float, specifying the class score threshold for box filtering, which is the output "class" of operator Yolo. The value range is [0.0, 1.0]. + +*@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n +*@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "1024". +* +*@par Outputs: +*@li boxout: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence. +*@li boxoutnum: An NCHW tensor of type int32, specifying the number of output boxes. +* +*@attention Constraints:\n +*@li This operator applies only to the YOLO v2 network. +*@li The preceding layer of operator Yolov2DetectionOutput must be one Yolo operator.
+ +*@see Yolo() +*/ +REG_OP(YoloV2DetectionOutputD) + .INPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(obj_prob, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(classes_prob, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(windex, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(hindex, TensorType({DT_FLOAT16,DT_FLOAT})) + .REQUIRED_ATTR(biases, ListFloat) + .ATTR(boxes, Int, 5) + .ATTR(coords, Int, 4) + .ATTR(classes, Int, 80) + .ATTR(relative, Bool, true) + .ATTR(obj_threshold, Float, 0.5) + .ATTR(post_nms_topn, Int, 1024) + .ATTR(score_threshold, Float, 0.5) + .ATTR(iou_threshold, Float, 0.45) + .ATTR(pre_nms_topn, Int, 512) + .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(box_out_num, TensorType({DT_INT32})) + .OP_END_FACTORY_REG(YoloV2DetectionOutputD) +
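The windex/hindex inputs taken by the D-variant detection ops (YoloV2DetectionOutputD above, YoloV3DetectionOutputD below) are plain index grids matching the input descriptions: each windex row counts 0..width-1 and each hindex row repeats its own row index. A sketch of how a caller might build them (plain float buffers here; the real inputs are float16/float32 tensors, and for V3 one grid pair is built per Yolo output):

// windex[h][w] = w, hindex[h][w] = h, flattened row-major.
#include <vector>

void MakeYoloIndexGrids(int height, int width,
                        std::vector<float> &windex, std::vector<float> &hindex) {
  windex.assign(static_cast<size_t>(height) * width, 0.0f);
  hindex.assign(static_cast<size_t>(height) * width, 0.0f);
  for (int h = 0; h < height; ++h) {
    for (int w = 0; w < width; ++w) {
      windex[static_cast<size_t>(h) * width + w] = static_cast<float>(w);
      hindex[static_cast<size_t>(h) * width + w] = static_cast<float>(h);
    }
  }
}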
+
+*@see Yolo()
+*/
+REG_OP(YoloV3DetectionOutput)
+    .INPUT(coord_data_low, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(coord_data_mid, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(coord_data_high, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(obj_prob_low, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(obj_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(obj_prob_high, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(classes_prob_low, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(classes_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(classes_prob_high, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .REQUIRED_ATTR(biases_low, ListFloat)
+    .REQUIRED_ATTR(biases_mid, ListFloat)
+    .REQUIRED_ATTR(biases_high, ListFloat)
+    .ATTR(boxes, Int, 3)
+    .ATTR(coords, Int, 4)
+    .ATTR(classes, Int, 80)
+    .ATTR(relative, Bool, true)
+    .ATTR(obj_threshold, Float, 0.5)
+    .ATTR(post_nms_topn, Int, 1024)
+    .ATTR(score_threshold, Float, 0.5)
+    .ATTR(iou_threshold, Float, 0.45)
+    .ATTR(pre_nms_topn, Int, 512)
+    .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .OUTPUT(box_out_num, TensorType({DT_INT32}))
+    .OP_END_FACTORY_REG(YoloV3DetectionOutput)
+
+/**
+*@brief Performs YOLO V3 detection.
+
+*@par Inputs:
+*Sixteen inputs, including:
+*@li The outputs of operator Yolo at the preceding layer (that is, three Yolo operators on YOLO v3) are used as the inputs of operator Yolov3DetectionOutputD. \n
+A Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo.
+*@li img_info: A float16, describing the image information, including the required image height and width \n
+and the actual image height and width.
+*@li windex: A windex tensor with shape [height, width]. Has the same type as the inputs. [[0,1,2...(width-1)],[0,1,2...(width-1)]...[0,1,2...(width-1)]], consisting of "height" groups of [0, 1, 2...(width-1)], is formed for each of the three Yolo outputs, respectively.
+
+*@li hindex: A hindex tensor with shape [height, width]. Has the same type as the inputs. [[0,0...0],[1,1...1],[2,2...2]...[height-1,height-1...,height-1]] is formed for each of the three Yolo outputs, respectively.
+
+*
+*@par Attributes:
+*@li biases: A required float32 list. "biases = Number of Yolo operators at the preceding layer x 2 x boxes"
+*@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer.
+*@li coords: Specifies the number of coordinate parameters. Must be 4.
+*@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 80].
+*@li relative: An optional bool. Must be "true" (the default).
+*@li obj_threshold: A required float, specifying the confidence threshold for box filtering (based on the output "obj" of operator Yolo). The value range is [0.0, 1.0].
+*@li post_nms_topn: An optional int32. This attribute is reserved.
+*@li score_threshold: A required float, specifying the class score threshold for box filtering (based on the output "class" of operator Yolo). The value range is [0.0, 1.0].
+*@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n
+*@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "512".
+*
+*@par Outputs:
+*@li box_out: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence.
+*@li box_out_num: An NCHW tensor of type int32, specifying the number of output boxes.
+
+*@attention Constraints:\n
+*@li This operator applies only to the YOLO v3 network.
+*@li The preceding layer of operator Yolov3DetectionOutputD must be three Yolo operators.
+*@see Yolo()
+*/
+REG_OP(YoloV3DetectionOutputD)
+    .INPUT(coord_data_low, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(coord_data_mid, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(coord_data_high, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(obj_prob_low, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(obj_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(obj_prob_high, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(classes_prob_low, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(classes_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(classes_prob_high, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(windex1, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(windex2, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(windex3, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(hindex1, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(hindex2, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(hindex3, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .REQUIRED_ATTR(biases_low, ListFloat)
+    .REQUIRED_ATTR(biases_mid, ListFloat)
+    .REQUIRED_ATTR(biases_high, ListFloat)
+    .ATTR(boxes, Int, 3)
+    .ATTR(coords, Int, 4)
+    .ATTR(classes, Int, 80)
+    .ATTR(relative, Bool, true)
+    .ATTR(obj_threshold, Float, 0.5)
+    .ATTR(post_nms_topn, Int, 1024)
+    .ATTR(score_threshold, Float, 0.5)
+    .ATTR(iou_threshold, Float, 0.45)
+    .ATTR(pre_nms_topn, Int, 512)
+    .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .OUTPUT(box_out_num, TensorType({DT_INT32}))
+    .OP_END_FACTORY_REG(YoloV3DetectionOutputD)
+
+/**
+*@brief Spatial Pyramid Pooling, multi-level pooling.
+* Pooling out(n, sigma(c*2^i*2^i)) tensor, i in range[0,pyramid_height).
+
+*@par Inputs:
+*x: An NCHW tensor of type float16 or float32.
+
+*@par Attributes:
+* @li pyramid_height: A required int32.
+* Multi-level pooling out from 2^0 to 2^(pyramid_height-1).
+* @li pool_method: An optional int32, specifying the pooling method: 0-MAX, 1-AVE.
+* Defaults to "0".
+
+*@par Outputs:
+*y: An NCHW tensor of type float16 or float32.
+
+*@attention Constraints:
+* @li pyramid_height: "pyramid_height" must be in the range [0, 7).
+* @li feature_size: the height and width of the input feature map must be in the range [1, 510].
+
+*/
+REG_OP(SPP)
+    .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .REQUIRED_ATTR(pyramid_height, Int)
+    .ATTR(pool_method, Int, 0)
+    .OP_END_FACTORY_REG(SPP)
+
 } // namespace ge
 #endif  // GE_OP_NN_DETECT_OPS_H_
diff --git a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h
index 10f3f369..87cc004c 100644
--- a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h
@@ -487,34 +487,6 @@ REG_OP(Upsample)
     .ATTR(stride_h, Int, 2)
     .ATTR(stride_w, Int, 2)
     .OP_END_FACTORY_REG(Upsample)
-
-/**
-*@brief Spatial Pyramid Pooling, multi-level pooling.
-* Pooling out(n, sigma(c*2^i*2^i)) tensor, i in range[0,pyramid_height).
-
-*@par Inputs:
-*x: An NCHW tensor, support float16 or float32 type.
-
-*@par Attributes:
-* @li pyramid_height: An required int32.
-* Multi-level pooling out from 2^0 to 2^(pyramid_height-1).
-* @li pool_method: An optional int32, pooling method: 0-MAX, 1-AVE.
-* Defaults to "0".
-
-*@par Outputs:
-*y: A NCHW tensor, support float16 or float32 type.
-
-*@attention Constraints:
-* @li pyramid_height: pyramid_heigjt should be in range [0,7).
-* @li feature_size:input feture map h and w should be [1, 510].
-
-*/
-REG_OP(SPP)
-    .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
-    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
-    .REQUIRED_ATTR(pyramid_height, Int)
-    .ATTR(pool_method, Int, 0)
-    .OP_END_FACTORY_REG(SPP)
 } // namespace ge
 
 #endif  // GE_OP_NN_POOLING_OPS_H
diff --git a/third_party/fwkacllib/inc/ops/nn_training_ops.h b/third_party/fwkacllib/inc/ops/nn_training_ops.h
index 922869c3..88d1a913 100644
--- a/third_party/fwkacllib/inc/ops/nn_training_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_training_ops.h
@@ -164,7 +164,8 @@ REG_OP(SparseApplyAdagrad)
 *@li update_slots: An optional bool. Defaults to "True". If "True", the calcution will be different as "False".
 
 *@par Outputs:
-*var: A Tensor. Has the same type and format as input "var".
+*@li var: A Tensor. Has the same type and format as input "var".
+*@li accum: A Tensor. Has the same type and format as input "accum".
 
 */
 REG_OP(SparseApplyAdagradD)
@@ -183,7 +184,7 @@ REG_OP(SparseApplyAdagradD)
 *@brief Updates relevant entries in "var" and "accum" according to the adagrad scheme.
 
 *@par Inputs:
-* Five inputs, including:
+*Six inputs, including:
 *@li var: An NCHW, NHWC, or ND Tensor of type float32.
 *@li accum: An NCHW, NHWC, or ND Tensor of type float32.
 *@li lr: An NCHW, NHWC, or ND Tensor of type float32.
@@ -215,7 +216,7 @@ REG_OP(SparseApplyAdagradV2)
 *@brief Updates relevant entries in "var" and "accum" according to the adagrad scheme.
 
 *@par Inputs:
-* Four inputs, including:
+*Four inputs, including:
 *@li var: An NCHW, NHWC, or ND Tensor of type float32.
 *@li accum: An NCHW, NHWC, or ND Tensor of type float32.
 *@li grad: An NCHW, NHWC, or ND Tensor of type float32.
@@ -228,8 +229,8 @@ REG_OP(SparseApplyAdagradV2)
 *@li update_slots: An optional bool. Defaults to "True". If "True", the calcution will be different as "False".
 
 *@par Outputs:
-*var: A Tensor. Has the same type and format as input "var".
-*accum: A Tensor. Has the same type and format as input "accum".
+*@li var: A Tensor. Has the same type and format as input "var".
+*@li accum: A Tensor. Has the same type and format as input "accum".
 
 */
 REG_OP(SparseApplyAdagradV2D)
@@ -299,6 +300,39 @@ REG_OP(ApplyMomentumCCE)
     .ATTR(use_locking, Bool, false)
     .OP_END_FACTORY_REG(ApplyMomentumCCE)
 
+/**
+*@brief Updates "var" according to the momentum scheme. Set use_nesterov = True if you
+* want to use Nesterov momentum.\n
+* Computing process: \n
+* accum = accum * momentum + grad\n
+* var -= lr * accum
+*
+*@attention Constraints:\n
+* The input tensors must have the same shape.
+*
+*@par Inputs:
+*@li var: A mutable tensor. Should be from a Variable().
+*@li accum: A mutable tensor. Has the same type as "var".
+* Should be from a Variable().
+*@li lr: A scalar. Has the same type as "var".
+*@li grad: A tensor for the gradient. Has the same type as "var".
+*
+*@par Attributes:
+*@li use_nesterov: An optional bool. Defaults to "False".
+* If "True", the tensor passed to compute grad will be
+* var - lr * momentum * accum, so in the end, the var you get is actually
+* var - lr * momentum * accum.
+*
+*@li use_locking: An optional bool. Defaults to "False".\n
+* If "True", updating of the "var" and "accum" tensors is protected by a lock;
+* otherwise the behavior is undefined, but may exhibit less contention.
+*
+*@par Outputs:
+*@li var: A mutable tensor. Has the same type as input "var".
+*@li accum: A mutable tensor. Has the same type as input "accum".
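+*
+* A minimal scalar sketch of the update above (an illustration only, not part
+* of this header; all names are local to the example):
+*
+*   // One momentum step; the Nesterov branch follows the use_nesterov note.
+*   void MomentumStep(float &var, float &accum, float lr, float momentum,
+*                     float grad, bool use_nesterov) {
+*     accum = accum * momentum + grad;
+*     if (use_nesterov) {
+*       var -= grad * lr + accum * momentum * lr;
+*     } else {
+*       var -= lr * accum;
+*     }
+*   }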
+*
+*/
+
 REG_OP(ApplyMomentumD)
     .INPUT(var, TensorType::NumberType())
     .INPUT(accum, TensorType::NumberType())
@@ -354,6 +388,51 @@ REG_OP(ApplyPowerSign)
     .OP_END_FACTORY_REG(ApplyPowerSign)
 
 /**
+*@brief Updates "var" according to the PowerSign update.\n
+* Here t-1 means the previous period.
+* m_t <- beta1 * m_{t-1} + (1 - beta1) * grad\n
+* update <- exp(logbase * sign_decay * sign(grad) * sign(m_t)) * grad\n
+* var <- var - lr * update
+*
+*@attention Constraints:\n
+* The input tensors must have the same shape.
+*
+*@par Inputs:
+*@li var: A mutable tensor. Should be from a Variable().
+*@li m: A mutable tensor. Has the same type as "var".
+* Should be from a Variable().
+*@li lr: A scalar. Has the same type as "var".
+*@li logbase: A scalar. Has the same type as "var".
+*@li sign_decay: A scalar. Has the same type as "var".
+*@li beta: A scalar. Has the same type as "var".
+*@li grad: A tensor for the gradient. Has the same type as "var".
+*
+*@par Attributes:
+* use_locking: An optional bool. Defaults to "False".
+* If "True", updating of the "var" and "m" tensors is protected
+* by a lock; otherwise the behavior is undefined, but may exhibit less
+* contention.
+*
+*@par Outputs:
+*@li var: A mutable tensor. Has the same type as input "var".
+*@li m: A mutable tensor. Has the same type as input "var".
+*
+*
+*/
+REG_OP(ApplyPowerSignD)
+    .INPUT(var, TensorType::NumberType())
+    .INPUT(m, TensorType::NumberType())
+    .INPUT(lr, TensorType::NumberType())
+    .INPUT(logbase, TensorType::NumberType())
+    .INPUT(sign_decay, TensorType::NumberType())
+    .INPUT(beta, TensorType::NumberType())
+    .INPUT(grad, TensorType::NumberType())
+    .OUTPUT(var, TensorType::NumberType())
+    .OUTPUT(m, TensorType::NumberType())
+    .ATTR(use_locking, Bool, false)
+    .OP_END_FACTORY_REG(ApplyPowerSignD)
+
+/**
 *@brief Updates "var" as FOBOS algorithm with fixed learning rate.\n
 * prox_v = var - alpha * delta\n
 * var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
@@ -426,6 +505,46 @@ REG_OP(ApplyAddSign)
     .OP_END_FACTORY_REG(ApplyAddSign)
 
 /**
+*@brief Updates "var" according to the AddSign update.
+
+*@par Inputs:
+*Seven inputs, including:
+* @li var: A mutable Tensor of type TensorType::NumberType().
+* Should be a Variable Tensor.
+* @li m: A mutable Tensor of the same type as "var".
+* Should be a Variable Tensor.
+* @li lr: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.
+* @li alpha: A Tensor of the same type as "var". Must be a scalar.
+* @li sign_decay: A Tensor of the same type as "var". Must be a scalar.
+* @li beta: A Tensor of the same type as "var". Must be a scalar.
+* @li grad: A Tensor of the same type as "var", for the gradient.
+
+
+*@par Attributes:
+*use_locking: An optional bool. Defaults to "False".
+* If "True", updating of the "var" and "m" tensors will be
+* protected by a lock; otherwise the behavior is undefined,
+* but may exhibit less contention.
+
+*@par Outputs:
+*@li var: A mutable Tensor. Has the same type as "var".
+*@li m: A mutable Tensor. Has the same type as "m".
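+*
+* A minimal scalar sketch of one AddSign step (an illustration only, not part
+* of this header; all names are local to the example, and the update rule is
+* the one documented for the non-"D" ApplyAddSign operator):
+*
+*   float SignOf(float x) { return static_cast<float>((x > 0.0f) - (x < 0.0f)); }
+*   void AddSignStep(float &var, float &m, float lr, float alpha,
+*                    float sign_decay, float beta, float grad) {
+*     m = beta * m + (1.0f - beta) * grad;  // m_t
+*     float update = (alpha + sign_decay * SignOf(grad) * SignOf(m)) * grad;
+*     var -= lr * update;
+*   }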
+
+*/
+REG_OP(ApplyAddSignD)
+    .INPUT(var, TensorType::NumberType())
+    .INPUT(m, TensorType::NumberType())
+    .INPUT(lr, TensorType::NumberType())
+    .INPUT(alpha, TensorType::NumberType())
+    .INPUT(sign_decay, TensorType::NumberType())
+    .INPUT(beta, TensorType::NumberType())
+    .INPUT(grad, TensorType::NumberType())
+    .OUTPUT(var, TensorType::NumberType())
+    .OUTPUT(m, TensorType::NumberType())
+    .ATTR(use_locking, Bool, false)
+    .OP_END_FACTORY_REG(ApplyAddSignD)
+
+/**
 *@brief Updates "var" according to the centered RMSProp algorithm.\n
 * The centered RMSProp algorithm uses an estimate of the centered second moment
 * (i.e., the variance) for normalization, as opposed to regular RMSProp, which
 * uses the (uncentered) second moment. This often helps with training, but is
 * slightly more expensive in terms of computation and memory.
@@ -481,6 +600,70 @@ REG_OP(ApplyCenteredRMSProp)
     .OUTPUT(var, TensorType::NumberType())
     .ATTR(use_locking, Bool, false)
     .OP_END_FACTORY_REG(ApplyCenteredRMSProp)
+
+/**
+*@brief Updates "var" according to the centered RMSProp algorithm.\n
+* The centered RMSProp algorithm uses an estimate of the centered second moment
+* (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+* uses the (uncentered) second moment. This often helps with training, but is
+* slightly more expensive in terms of computation and memory.
+*
+* Here t-1 means the previous period.
+* mg <- rho * mg_{t-1} + (1-rho) * grad\n
+* ms <- rho * ms_{t-1} + (1-rho) * grad * grad\n
+* mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)\n
+* var <- var - mom\n
+*
+*@attention Constraints:\n
+*@li In the dense implementation of this algorithm, "mg", "ms", and "mom" will
+* update even if "grad" is zero; in the sparse implementation, "mg", "ms",
+* and "mom" will not update in iterations during which "grad" is zero.
+*@li The input tensors must have the same shape.
+*
+*@par Inputs:
+*@li var: A mutable tensor. Should be from a Variable().
+*@li mg: A mutable tensor. Has the same type as "var".
+* Should be from a Variable().
+*@li ms: A mutable tensor. Has the same type as "var".
+* Should be from a Variable().
+*@li mom: A mutable tensor. Has the same type as "var".
+* Should be from a Variable().
+*@li lr: A scalar. Has the same type as "var".
+*@li rho: A scalar. Has the same type as "var".
+*@li momentum: A tensor. Has the same type as "var".
+*@li epsilon: A scalar. Has the same type as "var".
+*@li grad: A tensor for the gradient. Has the same type as "var".
+*
+*@par Attributes:
+* use_locking: An optional bool. Defaults to "False".
+* If "True", updating of the "var", "ms", and "mom" tensors is protected
+* by a lock; otherwise the behavior is undefined, but may exhibit less
+* contention.
+*
+*@par Outputs:
+*@li var: A mutable Tensor. Has the same type as "var".
+*@li mg: A mutable Tensor. Has the same type as "mg".
+*@li ms: A mutable Tensor. Has the same type as "ms".
+*@li mom: A mutable Tensor. Has the same type as "mom".
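+*
+* A minimal scalar sketch of the dense update sequence above (an illustration
+* only, not part of this header; assumes <cmath> for std::sqrt):
+*
+*   void CenteredRmsPropStep(float &var, float &mg, float &ms, float &mom,
+*                            float lr, float rho, float momentum,
+*                            float epsilon, float grad) {
+*     mg  = rho * mg + (1.0f - rho) * grad;
+*     ms  = rho * ms + (1.0f - rho) * grad * grad;
+*     mom = momentum * mom + lr * grad / std::sqrt(ms - mg * mg + epsilon);
+*     var -= mom;
+*   }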
+ +* +*/ +REG_OP(ApplyCenteredRMSPropD) + .INPUT(var, TensorType::NumberType()) + .INPUT(mg, TensorType::NumberType()) + .INPUT(ms, TensorType::NumberType()) + .INPUT(mom, TensorType::NumberType()) + .INPUT(lr, TensorType::NumberType()) + .INPUT(rho, TensorType::NumberType()) + .INPUT(momentum, TensorType::NumberType()) + .INPUT(epsilon, TensorType::NumberType()) + .INPUT(grad, TensorType::NumberType()) + .OUTPUT(var, TensorType::NumberType()) + .OUTPUT(mg, TensorType::NumberType()) + .OUTPUT(ms, TensorType::NumberType()) + .OUTPUT(mom, TensorType::NumberType()) + .ATTR(use_locking, Bool, false) + .OP_END_FACTORY_REG(ApplyCenteredRMSPropD) /** *@brief Updates "var" by subtracting 'alpha' * 'delta' from it.\n @@ -590,29 +773,29 @@ REG_OP(ApplyAdagradD) * accum += grad * grad \n * var -= lr * grad * (1 / sqrt(accum) + epsilon) * -* @attention Constraints: -* the input tensors must have the same shape. -* * @par Inputs: * @li var: A mutable tensor. Must be one of the data types defined in -* TensorType::NumberType(). Should be from a Variable(). +* TensorType::NumberType(). Should be from a Variable(). * @li accum: A mutable tensor. Has the same type as "var". Should be from a -* Variable(). +* Variable(). * @li lr: A tensor for the learning rate. Has the same type as "var". Should be -* from a Variable(). +* from a Variable(). * @li grad: A tensor for the gradient. Has the same type as "var". Should be -* from a Variable(). +* from a Variable(). * @li epsilon: A scalar. Has the same type as "var". * * @par Attributes: * @li update_slots: An optional bool. Defaults to "True". -* If "True", accum will be updated +* If "True", accum will be updated * @li use_locking: An optional bool. Defaults to "False". -* If "True", updating of the "var" tensor is protected by a lock; -* otherwise the behavior is undefined, but may exhibit less contention. +* If "True", updating of the "var" tensor is protected by a lock; +* otherwise the behavior is undefined, but may exhibit less contention. * * @par Outputs: -* var: A mutable tensor. Has the same type as input "var". +* var: A mutable tensor. Has the same type as input "var". +* +* @attention Constraints: +* The input tensors must have the same shape. * * */ @@ -630,32 +813,32 @@ REG_OP(ApplyAdagradV2) /** * @brief Updates "var" according to the adagradv2 scheme.\n -* accum += grad * grad \n -* var -= lr * grad * (1 / sqrt(accum) + epsilon) -* -* @attention Constraints: -* the input tensors must have the same shape. +* accum += grad * grad \n +* var -= lr * grad * (1 / sqrt(accum) + epsilon) * * @par Inputs: * @li var: A mutable tensor. Must be one of the data types defined in -* TensorType::NumberType(). Should be from a Variable(). +* TensorType::NumberType(). Should be from a Variable(). * @li accum: A mutable tensor. Has the same type as "var". Should be from a -* Variable(). +* Variable(). * @li lr: A tensor for the learning rate. Has the same type as "var". Should be -* from a Variable(). +* from a Variable(). * @li grad: A tensor for the gradient. Has the same type as "var". Should be -* from a Variable(). +* from a Variable(). * * @par Attributes: * @li epsilon: A scalar. Has the same type as "var". * @li update_slots: An optional bool. Defaults to "True". -* If "True", accum will be updated +* If "True", accum will be updated * @li use_locking: An optional bool. Defaults to "False". -* If "True", updating of the "var" tensor is protected by a lock; -* otherwise the behavior is undefined, but may exhibit less contention. 
+* If "True", updating of the "var" tensor is protected by a lock; +* otherwise the behavior is undefined, but may exhibit less contention. * * @par Outputs: -* var: A mutable tensor. Has the same type as input "var". +* var: A mutable tensor. Has the same type as input "var". +* +* @attention Constraints: +* The input tensors must have the same shape. * * */ @@ -950,7 +1133,9 @@ REG_OP(ApplyRMSPropD) *use_locking: An optional bool. Defaults to "False". If "True", updating of the "var" and "accum" *tensors will be protected by a lock; otherwise the behavior is undefined, but may exhibit less *contention. *@par Outputs: -*var: A mutable Tensor. Has the same type as "var". +* @li var: A mutable tensor. Must have the same type as input "var". +* @li ms: A mutable tensor. Must have the same type as input "ms". +* @li mom: A mutable tensor. Must have the same type as input "mom". */ REG_OP(ApplyProximalAdagrad) .INPUT(var, TensorType::NumberType()) @@ -964,6 +1149,39 @@ REG_OP(ApplyProximalAdagrad) .OP_END_FACTORY_REG(ApplyProximalAdagrad) /** +*@brief Update "var" and "accum" according to FOBOS with Adagrad learning rate. + +*@par Inputs: +*Six inputs, including: +* @li var: A mutable Tensor of type TensorType::NumberType(). +* Should be from a Variable(). +* @li accum: A mutable Tensor of the same type as "var". Should be from a Variable(). +* @li lr: A Tensor of the same type as "var", for the scaling factor. Must be a scalar. +* @li l1: A Tensor of the same type as "var", for L1 regulariation. Must be a scalar. +* @li l2: A Tensor of the same type as "var", for L2 regulariation. Must be a scalar. +* @li grad: A Tensor of the same type as "var", for the gradient. + +*@par Attributes: +*use_locking: An optional bool. Defaults to "False". If "True", updating of the "var" and "accum" *tensors will be protected by a lock; otherwise the behavior is undefined, but may exhibit less *contention. + +*@par Outputs: +* @li var: A mutable Tensor. Has the same type as "var". +* @li accum: A mutable Tensor. Has the same type as "var". + +*/ +REG_OP(ApplyProximalAdagradD) + .INPUT(var, TensorType::NumberType()) + .INPUT(accum, TensorType::NumberType()) + .INPUT(lr, TensorType::NumberType()) + .INPUT(l1, TensorType::NumberType()) + .INPUT(l2, TensorType::NumberType()) + .INPUT(grad, TensorType::NumberType()) + .OUTPUT(var, TensorType::NumberType()) + .OUTPUT(accum, TensorType::NumberType()) + .ATTR(use_locking, Bool, false) + .OP_END_FACTORY_REG(ApplyProximalAdagradD) + +/** *@brief Updates entries in 'var' and 'accum' according to the Proximal Adagrad algorithm.\ n * Compared with op ApplyProximalAdagrad, an additional index tensor is input, * Only the indices into the first dimensions of "var" and "accum" are updated. @@ -1006,6 +1224,51 @@ REG_OP(SparseApplyProximalAdagrad) .OP_END_FACTORY_REG(SparseApplyProximalAdagrad) /** +*@brief Updates entries in 'var' and 'accum' according to the Proximal Adagrad algorithm.\ n +* Compared with op ApplyProximalAdagrad, an additional index tensor is input, +* Only the indices into the first dimensions of "var" and "accum" are updated. + +*@par Inputs: +* Seven inputs, including:\n +* @li var: A mutable Tensor.\n +* TensorType::NumberType(). Should be a Variable Tensor. +* @li accum: A mutable Tensor of the same type as "var".\n +* Should be a Variable Tensor. +* @li lr: A Tensor of the same type as "var".\n +* Scaling factor. Must be a scalar. +* @li l1: A Tensor of the same type as "var".\n +* L1 regulariation. Must be a scalar. 
+* @li l2: A Tensor of the same type as "var".\n
+* L2 regularization. Must be a scalar.
+* @li grad: A Tensor. Has the same type as "var". \n
+* The gradient.
+* @li indices: A vector of indices into the first dimension of "var" and "accum".\n
+* TensorType::IndexNumberType().
+
+*@par Attributes:
+*use_locking: An optional bool. Defaults to "False".\n
+* If "True", updating of the var and accum tensors will be protected by a lock; \n
+* If "False", the behavior is undefined, but may exhibit less contention.
+
+*@par Outputs:
+*@li var: A mutable Tensor. Has the same type as "var".
+*@li accum: A mutable Tensor. Has the same type as "var".
+
+*/
+REG_OP(SparseApplyProximalAdagradD)
+    .INPUT(var, TensorType::NumberType())
+    .INPUT(accum, TensorType::NumberType())
+    .INPUT(lr, TensorType::NumberType())
+    .INPUT(l1, TensorType::NumberType())
+    .INPUT(l2, TensorType::NumberType())
+    .INPUT(grad, TensorType::NumberType())
+    .INPUT(indices, TensorType::IndexNumberType())
+    .OUTPUT(var, TensorType::NumberType())
+    .OUTPUT(accum, TensorType::NumberType())
+    .ATTR(use_locking, Bool, false)
+    .OP_END_FACTORY_REG(SparseApplyProximalAdagradD)
+
+/**
 *@brief Updates "var" according to the Ftrl-proximal scheme.
 
 *@par Inputs:
@@ -1045,6 +1308,50 @@ REG_OP(ApplyFtrl)
     .OP_END_FACTORY_REG(ApplyFtrl)
 
 /**
+*@brief Updates "var" according to the Ftrl-proximal scheme.
+
+*@par Inputs:
+*Eight inputs, including:
+* @li var: A mutable Tensor. Must be of type TensorType::NumberType().
+* Should be a Variable Tensor.
+* @li accum: A mutable Tensor of the same type as "var".
+* Should be a Variable Tensor.
+* @li linear: A mutable Tensor of the same type as "var".
+* Should be a Variable Tensor.
+* @li grad: A Tensor of the same type as "var", for the gradient.
+* @li lr: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.
+* @li l1: A Tensor of the same type as "var", for L1 regularization. Must be a scalar.
+* @li l2: A Tensor of the same type as "var", for L2 regularization. Must be a scalar.
+* @li lr_power: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.
+
+*@par Attributes:
+*use_locking: An optional bool. Defaults to "False".
+* If "True", updating of the "var" and "accum" tensors will be
+* protected by a lock; otherwise the behavior is undefined,
+* but may exhibit less contention.
+
+*@par Outputs:
+*@li var: A mutable Tensor. Has the same type as "var".
+*@li accum: A mutable Tensor. Has the same type as "accum".
+*@li linear: A mutable Tensor. Has the same type as "linear".
+
+*/
+REG_OP(ApplyFtrlD)
+    .INPUT(var, TensorType::NumberType())
+    .INPUT(accum, TensorType::NumberType())
+    .INPUT(linear, TensorType::NumberType())
+    .INPUT(grad, TensorType::NumberType())
+    .INPUT(lr, TensorType::NumberType())
+    .INPUT(l1, TensorType::NumberType())
+    .INPUT(l2, TensorType::NumberType())
+    .INPUT(lr_power, TensorType::NumberType())
+    .OUTPUT(var, TensorType::NumberType())
+    .OUTPUT(accum, TensorType::NumberType())
+    .OUTPUT(linear, TensorType::NumberType())
+    .ATTR(use_locking, Bool, false)
+    .OP_END_FACTORY_REG(ApplyFtrlD)
+
+/**
 *@brief Update "var" according to the Ftrl-proximal scheme.
 
 *@par Inputs:
@@ -1086,6 +1393,52 @@ REG_OP(ApplyFtrlV2)
     .OP_END_FACTORY_REG(ApplyFtrlV2)
 
 /**
+*@brief Update "var" according to the Ftrl-proximal scheme.
+
+*@par Inputs:
+*Nine inputs, including:
+* @li var: A mutable Tensor. Must be of type TensorType::NumberType().
+* Should be a Variable Tensor.
+* @li accum: A mutable Tensor of the same type as "var".
+* Should be a Variable Tensor.
+* @li linear: A mutable Tensor of the same type as "var".
+* Should be a Variable Tensor.
+* @li grad: A Tensor of the same type as "var", for the gradient.
+* @li lr: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.
+* @li l1: A Tensor of the same type as "var", for L1 regularization. Must be a scalar.
+* @li l2: A Tensor of the same type as "var", for L2 regularization. Must be a scalar.
+* @li l2_shrinkage: A Tensor of the same type as "var".
+* @li lr_power: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.
+
+*@par Attributes:
+*use_locking: An optional bool. Defaults to "False".
+* If "True", updating of the "var" and "accum" tensors will be
+* protected by a lock; otherwise the behavior is undefined,
+* but may exhibit less contention.
+
+*@par Outputs:
+*@li var: A mutable Tensor. Has the same type as "var".
+*@li accum: A mutable Tensor. Has the same type as "accum".
+*@li linear: A mutable Tensor. Has the same type as "linear".
+
+*/
+REG_OP(ApplyFtrlV2D)
+    .INPUT(var, TensorType::NumberType())
+    .INPUT(accum, TensorType::NumberType())
+    .INPUT(linear, TensorType::NumberType())
+    .INPUT(grad, TensorType::NumberType())
+    .INPUT(lr, TensorType::NumberType())
+    .INPUT(l1, TensorType::NumberType())
+    .INPUT(l2, TensorType::NumberType())
+    .INPUT(l2_shrinkage, TensorType::NumberType())
+    .INPUT(lr_power, TensorType::NumberType())
+    .OUTPUT(var, TensorType::NumberType())
+    .OUTPUT(accum, TensorType::NumberType())
+    .OUTPUT(linear, TensorType::NumberType())
+    .ATTR(use_locking, Bool, false)
+    .OP_END_FACTORY_REG(ApplyFtrlV2D)
+
+/**
 *@brief Updates "var" according to the Adam algorithm.\n
 * lr_t <- text{learning\_rate} * sqrt{1 - beta_2^t} / (1 - beta_1^t)\n
 * m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g\n
@@ -1137,6 +1490,45 @@ REG_OP(ApplyAdam)
     .ATTR(use_nesterov, Bool, false)
     .OP_END_FACTORY_REG(ApplyAdam)
 
+/**
+*@brief Updates "var" according to the Adam algorithm.\n
+* lr_t <- text{learning\_rate} * sqrt{1 - beta_2^t} / (1 - beta_1^t)\n
+* m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g\n
+* v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g\n
+* variable <- variable - lr_t * m_t / (sqrt{v_t} + epsilon)
+*
+*@attention Constraints:\n
+* The input tensors must have the same shape.
+*
+*@par Inputs:
+*@li var: A mutable Tensor of the type TensorType::NumberType().
+* Should be from a Variable().
+*@li m: A mutable Tensor of the same type as "var".
+* Should be from a Variable().
+*@li v: A mutable Tensor of the same type as "var".
+* Should be from a Variable().
+*@li beta1_power: A scalar of the same type as "var".
+*@li beta2_power: A scalar of the same type as "var".
+*@li lr: learning_rate. A scalar of the same type as "var".
+*@li beta1: A scalar of the same type as "var".
+*@li beta2: A scalar of the same type as "var".
+*@li epsilon: A scalar of the same type as "var".
+*@li grad: A Tensor of the same type as "var", for the gradient.
+*
+*@par Attributes:
+*@li use_locking: An optional bool. Defaults to "False".
+* If "True", updating of the "var", "m", and "v" tensors will be protected
+* by a lock; otherwise the behavior is undefined, but may exhibit less
+* contention.
+*@li use_nesterov: An optional bool. Defaults to "False".
+* If "True", uses the Nesterov update.
+*
+*@par Outputs:
+*@li var: A mutable tensor. Has the same type as input "var".
+*@li m: A mutable tensor. Has the same type as input "m".
+*@li v: A mutable tensor. Has the same type as input "v".
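+*
+* A minimal scalar sketch of one Adam step as documented above (an
+* illustration only, not part of this header; assumes <cmath> for std::sqrt):
+*
+*   void AdamStep(float &var, float &m, float &v, float beta1_power,
+*                 float beta2_power, float lr, float beta1, float beta2,
+*                 float epsilon, float grad) {
+*     float lr_t = lr * std::sqrt(1.0f - beta2_power) / (1.0f - beta1_power);
+*     m = beta1 * m + (1.0f - beta1) * grad;
+*     v = beta2 * v + (1.0f - beta2) * grad * grad;
+*     var -= lr_t * m / (std::sqrt(v) + epsilon);
+*   }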
+ +*/ REG_OP(ApplyAdamD) .INPUT(var, TensorType::NumberType()) .INPUT(m, TensorType::NumberType()) @@ -1154,6 +1546,7 @@ REG_OP(ApplyAdamD) .ATTR(use_locking, Bool, false) .ATTR(use_nesterov, Bool, false) .OP_END_FACTORY_REG(ApplyAdamD) + /** *@brief Updates "var" according to the proximal adadelta scheme. @@ -1401,11 +1794,11 @@ REG_OP(LarsV2Update) * @par Inputs: * Nine inputs, including: * @li var: A mutable Tensor. Must be of type TensorType::NumberType(). -* Should be a Variable Tensor. +* Should be a Variable Tensor. * @li accum: A mutable Tensor of the same type as "var". -* Should be a Variable Tensor. +* Should be a Variable Tensor. * @li linear: A mutable Tensor of the same type as "var". -* Should be a Variable Tensor. +* Should be a Variable Tensor. * @li grad: A Tensor of the same type as "var", for the gradient. * @li indices: A vector of indices into the first dimension of var and accum. * @li lr: A Tensor of the same type as "var", for the scaling factor. Must be a scalar. @@ -1415,9 +1808,9 @@ REG_OP(LarsV2Update) * @par Attributes: * use_locking: An optional bool. Defaults to "False". -* If "True", updating of the "var" and "accum" tensors will be -* protected by a lock; otherwise the behavior is undefined, -* but may exhibit less contention. +* If "True", updating of the "var" and "accum" tensors will be +* protected by a lock; otherwise the behavior is undefined, +* but may exhibit less contention. * @par Outputs: * var: A Tensor. Has the same type and format as input "var". @@ -1441,13 +1834,13 @@ REG_OP(SparseApplyFtrl) * @brief Update relevant entries in '*var' according to the Ftrl-proximal scheme. * @par Inputs: -* Nine inputs, including: +* Five inputs, including: * @li var: A mutable Tensor. Must be of type TensorType::NumberType(). -* Should be a Variable Tensor. +* Should be a Variable Tensor. * @li accum: A mutable Tensor of the same type as "var". -* Should be a Variable Tensor. +* Should be a Variable Tensor. * @li linear: A mutable Tensor of the same type as "var". -* Should be a Variable Tensor. +* Should be a Variable Tensor. * @li grad: A Tensor of the same type as "var", for the gradient. * @li indices: A vector of indices into the first dimension of var and accum. * @li lr: A Tensor of the same type as "var", for the scaling factor. Must be a scalar. @@ -1457,14 +1850,14 @@ REG_OP(SparseApplyFtrl) * @par Attributes: * use_locking: An optional bool. Defaults to "False". -* If "True", updating of the "var" and "accum" tensors will be -* protected by a lock; otherwise the behavior is undefined, -* but may exhibit less contention. +* If "True", updating of the "var" and "accum" tensors will be +* protected by a lock; otherwise the behavior is undefined, +* but may exhibit less contention. * @par Outputs: -* var: A Tensor. Has the same type and format as input "var". -* accum: A Tensor. Has the same type and format as input "accum". -* linear: A Tensor. Has the same type and format as input "linear". +* @li var: A Tensor. Has the same type and format as input "var". +* @li accum: A Tensor. Has the same type and format as input "accum". +* @li linear: A Tensor. Has the same type and format as input "linear". */ REG_OP(SparseApplyFtrlD) @@ -1533,13 +1926,13 @@ REG_OP(SparseApplyFtrlV2) * That is for rows we have grad for, we update var, accum and linear * @par Inputs: -* Ten inputs, including: +* Five inputs, including: * @li var: A mutable Tensor. Must be of type TensorType::NumberType(). -* Should be a Variable Tensor. +* Should be a Variable Tensor. 
 * @li accum: A mutable Tensor of the same type as "var".
-* Should be a Variable Tensor.
+* Should be a Variable Tensor.
 * @li linear: A mutable Tensor of the same type as "var".
-* Should be a Variable Tensor.
+* Should be a Variable Tensor.
 * @li grad: A Tensor of the same type as "var", for the gradient.
 * @li indices: A vector of indices into the first dimension of var and accum.
 
@@ -1550,14 +1943,14 @@ REG_OP(SparseApplyFtrlV2)
 * @li l2_shrinkage: A Tensor of the same type as "var", L2 shrinkage regulariation. Must be a scalar.
 * @li lr_power: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.
 * @li use_locking: An optional bool. Defaults to "False".
-* If "True", updating of the "var" and "accum" tensors will be
-* rotected by a lock; otherwise the behavior is undefined,
-* but may exhibit less contention.
+* If "True", updating of the "var" and "accum" tensors will be
+* protected by a lock; otherwise the behavior is undefined,
+* but may exhibit less contention.
 
 * @par Outputs:
-* var: A Tensor. Has the same type and format as input "var".
-* accum: A Tensor. Has the same type and format as input "accum".
-* linear: A Tensor. Has the same type and format as input "linear".
+* @li var: A Tensor. Has the same type and format as input "var".
+* @li accum: A Tensor. Has the same type and format as input "accum".
+* @li linear: A Tensor. Has the same type and format as input "linear".
 
 */
 REG_OP(SparseApplyFtrlV2D)
@@ -1578,6 +1971,109 @@ REG_OP(SparseApplyFtrlV2D)
     .OP_END_FACTORY_REG(SparseApplyFtrlV2D)
 
 /**
+* @brief Updates "var" at the specified indices according to the RMSProp algorithm.
+* mean_square = decay * mean_square + (1-decay) * gradient ** 2\n
+* Delta = learning_rate * gradient / sqrt(mean_square + epsilon)\n
+* ms <- rho * ms_{t-1} + (1-rho) * grad * grad\n
+* mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)\n
+* var <- var - mom\n
+*
+* @par Inputs:
+* @li var: A mutable tensor. Must be one of the data types defined in\n
+* TensorType::NumberType(). Should be from a Variable().
+* @li ms: A mutable tensor. Must have the same type as "var". Should be from a
+* Variable().
+* @li mom: A mutable tensor. Must have the same type as "var". Should be from a
+* Variable().
+* @li lr: A scalar. Must have the same type as "var".
+* @li rho: A scalar. Must have the same type as "var".
+* @li momentum: A scalar. Must have the same type as "var".
+* @li epsilon: A scalar. Must have the same type as "var".
+* @li grad: A tensor, specifying the gradient.
+* @li indices: A vector of indices into the first dimension of "var", "ms" and "mom".
+*
+* @par Attributes:
+* use_locking: An optional "bool". Defaults to "False". If "True", updating of
+* the "var", "ms", and "mom" tensors will be protected by a lock; otherwise the
+* behavior is undefined, but may exhibit less contention.
+*
+* @par Outputs:
+* var: A mutable tensor. Has the same type as input "var".
+*
+* @attention Constraints:
+* @li Note that in this sparse implementation, "ms" and "mom" will not update
+* in iterations during which "grad" is 0.
+* @li The input tensors "var", "ms", and "mom" must have the same shape.
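+*
+* A minimal sketch of the sparse update above (an illustration only, not part
+* of this header; assumes one float per row for brevity, plus <cmath>,
+* <cstdint>, and <vector>):
+*
+*   // Only rows named in "indices" are touched; grad is packed per index.
+*   void SparseRmsPropStep(std::vector<float> &var, std::vector<float> &ms,
+*                          std::vector<float> &mom, float lr, float rho,
+*                          float momentum, float epsilon,
+*                          const std::vector<float> &grad,
+*                          const std::vector<int32_t> &indices) {
+*     for (size_t i = 0; i < indices.size(); ++i) {
+*       const int32_t row = indices[i];
+*       ms[row] = rho * ms[row] + (1.0f - rho) * grad[i] * grad[i];
+*       mom[row] = momentum * mom[row] + lr * grad[i] / std::sqrt(ms[row] + epsilon);
+*       var[row] -= mom[row];
+*     }
+*   }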
+*
+*/
+REG_OP(SparseApplyRMSProp)
+    .INPUT(var, TensorType::NumberType())
+    .INPUT(ms, TensorType::NumberType())
+    .INPUT(mom, TensorType::NumberType())
+    .INPUT(lr, TensorType::NumberType())
+    .INPUT(rho, TensorType::NumberType())
+    .INPUT(momentum, TensorType::NumberType())
+    .INPUT(epsilon, TensorType::NumberType())
+    .INPUT(grad, TensorType::NumberType())
+    .INPUT(indices, TensorType::IndexNumberType())
+    .OUTPUT(var, TensorType::NumberType())
+    .ATTR(use_locking, Bool, false)
+    .OP_END_FACTORY_REG(SparseApplyRMSProp)
+
+/**
+* @brief Updates "var" at the specified indices according to the RMSProp algorithm.
+* A const input will be considered as an attribute.\n
+* mean_square = decay * mean_square + (1-decay) * gradient ** 2\n
+* Delta = learning_rate * gradient / sqrt(mean_square + epsilon)\n
+* ms <- rho * ms_{t-1} + (1-rho) * grad * grad\n
+* mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)\n
+* var <- var - mom
+*
+* @par Inputs:
+* @li var: A mutable tensor. Must be one of the data types defined in
+* TensorType::NumberType(). Should be from a Variable().
+* @li ms: A mutable tensor. Must have the same type as "var". Should be from a
+* Variable().
+* @li mom: A mutable tensor. Must have the same type as "var". Should be from a
+* Variable().
+* @li lr: A scalar. Must have the same type as "var".
+* @li grad: A tensor, specifying the gradient.
+* @li indices: A vector of indices into the first dimension of "var", "ms" and "mom".
+*
+* @par Attributes:
+* @li use_locking: An optional "bool". Defaults to "False". If "True",
+* updating of the "var", "ms", and "mom" tensors will be protected by a lock;
+* otherwise the behavior is undefined, but may exhibit less contention.
+* @li rho: A required scalar. Must have the same type as "var".
+* @li momentum: A required scalar. Must have the same type as "var".
+* @li epsilon: A required scalar. Must have the same type as "var".
+*
+* @par Outputs:
+* @li var: A mutable tensor. Must have the same type as input "var".
+* @li ms: A mutable tensor. Must have the same type as input "ms".
+* @li mom: A mutable tensor. Must have the same type as input "mom".
+*
+* @attention Constraints:
+* @li Note that in this sparse implementation, "ms" and "mom" will not update
+* in iterations during which "grad" is 0.
+* @li The input tensors "var", "ms" and "mom" must have the same shape.
+*/
+REG_OP(SparseApplyRMSPropD)
+    .INPUT(var, TensorType::NumberType())
+    .INPUT(ms, TensorType::NumberType())
+    .INPUT(mom, TensorType::NumberType())
+    .INPUT(lr, TensorType::NumberType())
+    .INPUT(grad, TensorType::NumberType())
+    .INPUT(indices, TensorType::IndexNumberType())
+    .OUTPUT(var, TensorType::NumberType())
+    .OUTPUT(ms, TensorType::NumberType())
+    .OUTPUT(mom, TensorType::NumberType())
+    .REQUIRED_ATTR(rho, Float)
+    .REQUIRED_ATTR(momentum, Float)
+    .REQUIRED_ATTR(epsilon, Float)
+    .ATTR(use_locking, Bool, false)
+    .OP_END_FACTORY_REG(SparseApplyRMSPropD)
+
+/**
 *@brief Clean memory of workspace list.
*@par Attributes: diff --git a/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h b/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h index 992077ad..46d29b8d 100644 --- a/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h +++ b/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h @@ -172,24 +172,6 @@ REG_OP(SigmoidGrad) .OUTPUT(z, TensorType(UnaryDataType)) .OP_END_FACTORY_REG(SigmoidGrad) -REG_OP(Activation) - .INPUT(x, TensorType::ALL()) - .OUTPUT(y, TensorType::ALL()) - /* - 0: sigmod, 1: relu, 2: tanh, 3: clipped ReLU, 4: Elu, - 5: leaky relu, 6: abs, 7: relu1, 8: softsign, 9: softplus - */ - .ATTR(mode, Int, 1) - .ATTR(coef, Float, 0) - .OP_END_FACTORY_REG(Activation) - -REG_OP(ActivationGrad) - .INPUT(dy, TensorType{DT_FLOAT}) - .INPUT(x, TensorType{DT_FLOAT}) - .OUTPUT(dx, TensorType{DT_FLOAT}) - .ATTR(mode, Int, 1) - .OP_END_FACTORY_REG(ActivationGrad) - /** *@brief Computes the binomial normal log likelihood (BNLL) output:\n *if x>0, x+log(1+exp(-x)); otherwise log(1+exp(x)). diff --git a/third_party/fwkacllib/inc/ops/power_ops.h b/third_party/fwkacllib/inc/ops/power_ops.h deleted file mode 100644 index b1f5bc24..00000000 --- a/third_party/fwkacllib/inc/ops/power_ops.h +++ /dev/null @@ -1,49 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - #ifndef GE_OP_POWER_H - #define GE_OP_POWER_H - - #include "../graph/operator_reg.h" - - namespace ge { - -/** -*@brief Computes the output as (shift + scale * x) ^ power. - -*@par Inputs: -* x: A Tensor of type float16 or float32. - -*@par Attributes: -*@li power: Optional. Defaults to 1.0. -*@li scale: Optional. Defaults to 1.0. -*@li shift: Optional. Defaults to 0.0. - -*@par Outputs: -* y: A Tensor. Has the same type and shape as "x". 
-*/
-
- REG_OP(Power)
-     .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
-     .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
-     .ATTR(power, Float, 1.0)
-     .ATTR(scale, Float, 1.0)
-     .ATTR(shift, Float, 0.0)
-     .OP_END_FACTORY_REG(Power);
-
- } // namespace ge
-
- #endif // GE_OP_POWER_H
diff --git a/third_party/fwkacllib/inc/ops/quantize_ops.h b/third_party/fwkacllib/inc/ops/quantize_ops.h
index 235f2645..e44ae888 100644
--- a/third_party/fwkacllib/inc/ops/quantize_ops.h
+++ b/third_party/fwkacllib/inc/ops/quantize_ops.h
@@ -19,22 +19,6 @@
 #include "../graph/operator_reg.h"
 
 namespace ge {
-REG_OP(QuantizedInnerProduct)
-    .INPUT(x, TensorType({DT_UINT8}))
-    .INPUT(w, TensorType({DT_INT8}))
-    .OPTIONAL_INPUT(b, TensorType({DT_INT32}))
-    .OPTIONAL_INPUT(scale_q, TensorType({DT_FLOAT16}))
-    .OPTIONAL_INPUT(offset_q, TensorType({DT_FLOAT16}))
-    .OPTIONAL_INPUT(scale_deq_req, TensorType({DT_FLOAT16}))
-    .OPTIONAL_INPUT(offset_req, TensorType({DT_FLOAT16}))
-    .OUTPUT(y, TensorType({DT_FLOAT16}))
-    .REQUIRED_ATTR(quant_algo, ListInt)
-    .REQUIRED_ATTR(scale_sqrt, ListInt)
-    .REQUIRED_ATTR(num_output, Int)
-    .ATTR(transpose, Bool, false)
-    .ATTR(bias_term, Bool, false)
-    .ATTR(axis, Int, 1)
-    .OP_END_FACTORY_REG(QuantizedInnerProduct)
 
 /**
 * @brief Dequantizes the input tensor into a float tensor.\n
diff --git a/third_party/fwkacllib/inc/ops/ragged_array_ops.h b/third_party/fwkacllib/inc/ops/ragged_array_ops.h
index 245f3551..4f3cf97e 100644
--- a/third_party/fwkacllib/inc/ops/ragged_array_ops.h
+++ b/third_party/fwkacllib/inc/ops/ragged_array_ops.h
@@ -45,12 +45,10 @@ namespace ge {
 
 REG_OP(RaggedGather)
     .DYNAMIC_INPUT(params_nested_splits, TensorType({DT_INT32, DT_INT64}))
-    .INPUT(params_dense_values, TensorType({DT_DOUBLE, DT_FLOAT, DT_FLOAT16, DT_INT8, DT_UINT8, DT_INT16, \
-        DT_UINT16, DT_INT32, DT_UINT32, DT_INT64, DT_UINT64, DT_BOOL}))
+    .INPUT(params_dense_values, TensorType({DT_INT32, DT_INT64}))
     .INPUT(indices, TensorType({DT_INT32, DT_INT64}))
     .DYNAMIC_OUTPUT(output_nested_splits, TensorType({DT_INT32, DT_INT64}))
-    .OUTPUT(output_dense_values, TensorType({DT_DOUBLE, DT_FLOAT, DT_FLOAT16, DT_INT8, DT_UINT8, DT_INT16, \
-        DT_UINT16, DT_INT32, DT_UINT32, DT_INT64, DT_UINT64, DT_BOOL}))
+    .OUTPUT(output_dense_values, TensorType({DT_INT32, DT_INT64}))
     .REQUIRED_ATTR(Tsplits, Type)
     .ATTR(PARAMS_RAGGED_RANK, Int, 1)
     .ATTR(OUTPUT_RAGGED_RANK, Int, 0)
diff --git a/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h b/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h
index 8e07bdc5..7a42e4d9 100644
--- a/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h
+++ b/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h
@@ -50,5 +50,43 @@ REG_OP(RaggedTensorToSparse)
     .ATTR(RAGGED_RANK, Int, 1)
     .ATTR(Tsplits, Type, DT_INT64)
     .OP_END_FACTORY_REG(RaggedTensorToSparse)
+
+/**
+*@brief Creates a dense tensor from a ragged tensor, possibly altering its shape.
+
+*@par Inputs:
+*Four inputs, including:
+*@li shape: A `Tensor`. Must be one of the following types: `int64`, `int32`.
+*@li values: A 1D tensor representing the values of the ragged tensor.
+*@li default_value: A `Tensor`. Must have the same type as `values`.
+*@li row_partition_tensors: A list of at least 1 `Tensor` objects with the same \n
+type in: `int64`, `int32`.
+
+*@par Attributes:
+*@li num_row_partition_tensors: The number of row partition tensors.
+*@li row_partition_types: A list of `strings`. \n
+The types of the row partition tensors. At present, these can be: \n
+* "ROW_SPLITS": the row_splits tensor from the ragged tensor. \n
+* "VALUE_ROWIDS": the value_rowids tensor from the ragged tensor. \n
+* "FIRST_DIM_SIZE": if value_rowids is used for the first dimension, then it \n
+is preceded by "FIRST_DIM_SIZE".
+
+*@par Outputs:
+*@li result: A `Tensor`. Has the same type as `values`.
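+*
+* A worked example (an illustration, not from the original documentation):
+* with row_partition_types = ["ROW_SPLITS"], a single row partition tensor
+* [0, 3, 5], values [1, 2, 3, 4, 5], shape [2, 3], and default_value 0, the
+* rows are values[0:3] and values[3:5], so "result" is [[1, 2, 3], [4, 5, 0]].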
+*/
+REG_OP(RaggedTensorToTensor)
+    .INPUT(shape, TensorType({DT_INT32, DT_INT64}))
+    .INPUT(values, TensorType({DT_BOOL, DT_INT8, DT_UINT8, DT_INT16, DT_UINT16,
+                               DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLOAT16}))
+    .INPUT(default_value, TensorType({DT_BOOL, DT_INT8, DT_UINT8, DT_INT16,
+                                      DT_UINT16, DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLOAT16}))
+    .DYNAMIC_INPUT(row_partition_tensors, TensorType({DT_INT32, DT_INT64}))
+    .OUTPUT(result, TensorType({DT_BOOL, DT_INT8, DT_UINT8, DT_INT16, DT_UINT16,
+                                DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLOAT16}))
+    .REQUIRED_ATTR(num_row_partition_tensors, Int)
+    .REQUIRED_ATTR(row_partition_types, ListString)
+    .OP_END_FACTORY_REG(RaggedTensorToTensor)
+
+
 } // namespace ge
 #endif  // GE_OP_RAGGED_CONVERSION_OPS_H
\ No newline at end of file
diff --git a/third_party/fwkacllib/inc/ops/ragged_math_ops.h b/third_party/fwkacllib/inc/ops/ragged_math_ops.h
index 51797ff8..80669f0f 100644
--- a/third_party/fwkacllib/inc/ops/ragged_math_ops.h
+++ b/third_party/fwkacllib/inc/ops/ragged_math_ops.h
@@ -41,11 +41,11 @@ namespace ge {
 */
 
 REG_OP(RaggedRange)
-    .INPUT(starts, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
-    .INPUT(limits, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
-    .INPUT(deltas, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
+    .INPUT(starts, TensorType({DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
+    .INPUT(limits, TensorType({DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
+    .INPUT(deltas, TensorType({DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
     .OUTPUT(rt_nested_splits, TensorType({DT_INT32, DT_INT64}))
-    .OUTPUT(rt_dense_values, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
+    .OUTPUT(rt_dense_values, TensorType({DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
     .REQUIRED_ATTR(Tsplits, Type)
     .OP_END_FACTORY_REG(RaggedRange)
 
diff --git a/third_party/fwkacllib/inc/ops/rnn.h b/third_party/fwkacllib/inc/ops/rnn.h
index abd98695..7a6aaa9e 100644
--- a/third_party/fwkacllib/inc/ops/rnn.h
+++ b/third_party/fwkacllib/inc/ops/rnn.h
@@ -180,15 +180,15 @@ REG_OP(RNN)
     .OPTIONAL_INPUT(x_static, TensorType({DT_FLOAT16}))
     .OPTIONAL_INPUT(h_0, TensorType({DT_FLOAT16, DT_FLOAT}))
     .INPUT(w_xh, TensorType({DT_FLOAT16}))
+    .INPUT(bias_h, TensorType({DT_FLOAT16, DT_FLOAT}))
     .INPUT(w_sh, TensorType({DT_FLOAT16}))
     .INPUT(w_hh, TensorType({DT_FLOAT16}))
     .INPUT(w_ho, TensorType({DT_FLOAT16}))
-    .INPUT(bias_h, TensorType({DT_FLOAT16, DT_FLOAT}))
     .INPUT(bias_o, TensorType({DT_FLOAT16, DT_FLOAT}))
     .OUTPUT(o, TensorType({DT_FLOAT16, DT_FLOAT}))
     .OUTPUT(h_t, TensorType({DT_FLOAT16, DT_FLOAT}))
-    .ATTR(expose_hidden, Bool, false)
     .ATTR(num_output, Int, 0)
+    .ATTR(expose_hidden, Bool, false)
     .OP_END_FACTORY_REG(RNN)
 
 /**
@@ -220,9 +220,9 @@ REG_OP(BasicRNNCell)
     .OPTIONAL_INPUT(w_xh_x_static, TensorType({DT_FLOAT16, DT_FLOAT}))
     .OPTIONAL_INPUT(h_0, TensorType({DT_FLOAT16, DT_FLOAT}))
     .INPUT(w_xh, TensorType({DT_FLOAT16}))
+    .INPUT(bias_h, TensorType({DT_FLOAT16, DT_FLOAT}))
    .OPTIONAL_INPUT(w_hh, TensorType({DT_FLOAT16}))
     .INPUT(w_ho, TensorType({DT_FLOAT16}))
-    .INPUT(bias_h, TensorType({DT_FLOAT16, DT_FLOAT}))
     .INPUT(bias_o, TensorType({DT_FLOAT16, DT_FLOAT}))
     .OUTPUT(o_t, TensorType({DT_FLOAT16, DT_FLOAT}))
     .OUTPUT(h_t, TensorType({DT_FLOAT16, DT_FLOAT}))
diff --git
a/third_party/fwkacllib/inc/ops/sdca_ops.h b/third_party/fwkacllib/inc/ops/sdca_ops.h index 3f1e938a..15428d2b 100644 --- a/third_party/fwkacllib/inc/ops/sdca_ops.h +++ b/third_party/fwkacllib/inc/ops/sdca_ops.h @@ -64,7 +64,7 @@ REG_OP(SdcaOptimizerV2) .INPUT(example_weights, TensorType({DT_FLOAT})) .INPUT(example_labels, TensorType({DT_FLOAT})) .DYNAMIC_INPUT(sparse_indices, TensorType({DT_INT64})) - .DYNAMIC_INPUT(sparse_weights, TensorType({DT_INT64})) + .DYNAMIC_INPUT(sparse_weights, TensorType({DT_FLOAT})) .DYNAMIC_INPUT(dense_weights, TensorType({DT_FLOAT})) .INPUT(example_state_data, TensorType({DT_FLOAT})) .OUTPUT(out_example_state_data, TensorType({DT_FLOAT})) diff --git a/third_party/fwkacllib/inc/ops/selection_ops.h b/third_party/fwkacllib/inc/ops/selection_ops.h index dab71025..c7b59caa 100644 --- a/third_party/fwkacllib/inc/ops/selection_ops.h +++ b/third_party/fwkacllib/inc/ops/selection_ops.h @@ -240,7 +240,7 @@ REG_OP(GatherV2D) REG_OP(StridedSlice) .INPUT(x, TensorType::BasicType()) .INPUT(begin, TensorType::IndexNumberType()) - .INPUT(end, TensorType::IndexNumberTypeT()) + .INPUT(end, TensorType::IndexNumberType()) .INPUT(strides, TensorType::IndexNumberType()) .ATTR(begin_mask, Int, 0) .ATTR(end_mask, Int, 0) @@ -571,7 +571,7 @@ REG_OP(SegmentMax) *@par Outputs: *y:A Tensor with same type as "x". -*/ +*/ REG_OP(SegmentMaxD) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32})) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32})) @@ -703,6 +703,7 @@ REG_OP(SliceD) * @attention Constraints: * @li k =< 4096 * @li Size of the last dimension =< 65500 +* @li sorted = true */ REG_OP(TopKD) .INPUT(x, TensorType::RealNumberType()) @@ -1309,174 +1310,6 @@ REG_OP(UnsortedSegmentProdD) .OP_END_FACTORY_REG(UnsortedSegmentProdD) /** -*@brief Normalizes data. It is called Region on YOLO v2 and Yolo on YOLO v3. - -*@par Inputs: -*x: An NCHW tensor of type float16 or float32. The data is with shape (N, boxes*(coords+obj+classes), H, W),where, "obj" indicates the confidence of an object, and only one confidence is supported. Boxes are arranged as xx...xyy...yww...whh...hbb...bc0c0..c0c1c1...c1......cncn...cn. - -*@par Attributes: -*@li boxes: A required int32, specifying the number of anchor boxes. Defaults to "5" for V2 or "3" for V3. -*@li coords: An int32, specifying the number of parameters required for locating an object. The value is fixed at "4", corresponding to (x,y,w,h). -*@li classes: An int32, specifying the number of prediction classes. Defaults to "80". The value range is [1, 1024]. -*@li yolo_version: A string, specifying the YOLO version, either "V2" or "V3". -*@li softmax: A bool, specifying whether to perform softmax, valid only when "yolo_version = V2". -*@li background: A bool, specifying the operation types of the obj and classes, used in conjunction with "softmax" and valid only when "yolo_version = V2". -*@li background: A bool. - -*@par Outputs: -*@li coord_data: A float16 or float32 with shape [N, boxes*coords, ceilx(height*width*2+32, 32)/2], where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the coordinates of a detected box. -*@li obj_prob: A float16 or float32 with shape [N, ceilx(boxes*height*width *2+32, 32)/2], where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the confidence. 
-*@li classes_prob: A float16 or float32 with shape [N, classes, ceilx(boxes*height*width *2+32, 32)/2], where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the prediction classes. - -*@attention Constraints: -*@li This operator applies to YOLO v2 and v3 networks. -*@li The succeeding layer of the Yolo operator must be operator Yolov3DetectionOutput. -*/ -REG_OP(Yolo) - .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(obj_prob, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(classes_prob, TensorType({DT_FLOAT16,DT_FLOAT})) - .ATTR(boxes, Int, 3) - .ATTR(coords, Int, 4) - .ATTR(classes, Int, 80) - .ATTR(yolo_version, String, "V3") - .ATTR(softmax, Bool, false) - .ATTR(background, Bool, false) - .ATTR(softmaxtree, Bool, false) - .OP_END_FACTORY_REG(Yolo) - -/** -*@brief Performs YOLO V3 detection. - -*@par Inputs: -*Ten inputs, including: -*@li Operator Yolov3DetectionOutput takes the outputs of operator Yolo as its inputs. A Yolo operator has three outputs: "coords", "obj", and "class". \n -There are three Yolo operators at Yolov3DetectionOutput's preceding layer on Yolo v3. For details, see the description of operator Yolo. -*@li imginfo: A float16, describing the image information including the required image height and width \n -and the actual image height and width. -* -*@par Attributes: -*@li biases: A required float. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" -*@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. -*@li coords: Specifies the number of coordinate parameters. Must be 4. -*@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 80]. -*@li relative: An optional bool. Defaults to and must be "true". -*@li obj_threshold: A required float, specifying the confidence threshold for box filtering, which is the output "obj" of operator Yolo). The value range is [0.0, 1.0]. - -*@li post_nms_topn: An optional int32. This attribute is reserved. -*@li score_threshold: A required float, specifying the class score threshold for box filtering, which is the output "class" of operator Yolo). The value range is [0.0, 1.0]. - -*@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n - -*@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "1024". -* -*@par Outputs: -*@li boxout: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence. -*@li boxoutnum: An NCHW tensor of type int32, specifying the number of output boxes. - -*@attention Constraints:\n -*@li This operator applies only to the YOLO v3 network. -*@li The preceding layer of operator Yolov3DetectionOutput must be three Yolo operators. 
- -*@see Yolo() -*/ -REG_OP(YoloV3DetectionOutput) - .INPUT(coord_data_low, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(coord_data_mid, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(coord_data_high, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(obj_prob_low, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(obj_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(obj_prob_high, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(classes_prob_low, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(classes_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(classes_prob_high, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT})) - .REQUIRED_ATTR(biases_low, ListFloat) - .REQUIRED_ATTR(biases_mid, ListFloat) - .REQUIRED_ATTR(biases_high, ListFloat) - .ATTR(boxes, Int, 3) - .ATTR(coords, Int, 4) - .ATTR(classes, Int, 80) - .ATTR(relative, Bool, true) - .ATTR(obj_threshold, Float, 0.5) - .ATTR(post_nms_topn, Int, 1024) - .ATTR(score_threshold, Float, 0.5) - .ATTR(iou_threshold, Float, 0.45) - .ATTR(pre_nms_topn, Int, 512) - .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(box_out_num, TensorType({DT_INT32})) - .OP_END_FACTORY_REG(YoloV3DetectionOutput) - -/** -*@brief Performs YOLO V3 detection. - -*@par Inputs: -*16 Input, including: -*@li The outputs of operator Yolo at the preceding layer (that is, three Yolo operators on YOLO v3) are used as the inputs of operator Yolov3DetectionOutput. \n -A Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. -*@li imginfo: A float16, describing the image information including the required image height and width \n -and the actual image height and width. -*@li windex: A windex tensor with shape [height,weight]. Has the same type as the inputs. [[0,1,2...(weight-1)],[0,1,2...(w-1)]...[0,1,2...(weight-1)]] consisting of h groups of [0, 1, 2...(weight-1)] is formed for the three Yolo outputs, respectively. - -*@li hindex: A hindex tensor with shape [height,weight]. Has the same type as the inputs. [[0,0...0],[1,1...1],[2,2...2]...[height-1,height-1...,height-1]] is formed for the three Yolo outputs, respectively. - -* -*@par Attributes: -*@li biases: A required float32. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" -*@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. -*@li coords: Specifies the number of coordinate parameters. Must be 4. -*@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 80]. -*@li relative: An optional bool. Defaults to and must be "true". -*@li obj_threshold: A required float, specifying the confidence threshold for box filtering, which is the output "obj" of operator Yolo). The value range is [0.0, 1.0]. -*@li post_nms_topn: An optional int32. This attribute is reserved. -*@li score_threshold: A required float, specifying the class score threshold for box filtering, which is the output "class" of operator Yolo). The value range is [0.0, 1.0]. -*@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n -*@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "1024". -* -*@par Outputs: -*@li boxout: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence. 
-*@li boxoutnum: An NCHW tensor of type int32, specifying the number of output boxes. - -*@attention Constraints:\n -*@li This operator applies only to the YOLO v3 network. -*@li The preceding layer of operator Yolov3DetectionOutput must be three Yolo operators. -*@see Yolo() -*/ -REG_OP(YoloV3DetectionOutputD) - .INPUT(coord_data_low, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(coord_data_mid, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(coord_data_high, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(obj_prob_low, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(obj_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(obj_prob_high, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(classes_prob_low, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(classes_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(classes_prob_high, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(windex1, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(windex2, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(windex3, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(hindex1, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(hindex2, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(hindex3, TensorType({DT_FLOAT16,DT_FLOAT})) - .REQUIRED_ATTR(biases_low, ListFloat) - .REQUIRED_ATTR(biases_mid, ListFloat) - .REQUIRED_ATTR(biases_high, ListFloat) - .ATTR(boxes, Int, 3) - .ATTR(coords, Int, 4) - .ATTR(classes, Int, 80) - .ATTR(relative, Bool, true) - .ATTR(obj_threshold, Float, 0.5) - .ATTR(post_nms_topn, Int, 1024) - .ATTR(score_threshold, Float, 0.5) - .ATTR(iou_threshold, Float, 0.45) - .ATTR(pre_nms_topn, Int, 512) - .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(box_out_num, TensorType({DT_INT32})) - .OP_END_FACTORY_REG(YoloV3DetectionOutputD) - -/** *@brief Performs object detection. *@par Inputs: @@ -1555,116 +1388,6 @@ REG_OP(ProposalD) .OP_END_FACTORY_REG(ProposalD) /** -*@brief Performs YOLO V2 detection. - -*@par Inputs: -* Four inputs, including: -*@li The outputs of operator Yolo at the preceding layer (that is, one Yolo operator on YOLO v2) are used as the inputs of operator Yolov3DetectionOutput. \n -Each Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. -*@li imginfo: A float16, describing the image information including the required image height and width \n -and the actual image height and width. -* -*@par Attributes: -*@li biases: A required float. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" -*@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. -*@li coords: Specifies the number of coordinate parameters. Must be 4. -*@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 80]. -*@li relative: An optional bool. Defaults to and must be "true". -*@li obj_threshold: A required float, specifying the confidence threshold for box filtering, which is the output "obj" of operator Yolo). The value range is [0.0, 1.0]. - -*@li post_nms_topn: An optional int32. This attribute is reserved. -*@li score_threshold: A required float, specifying the class score threshold for box filtering, which is the output "class" of operator Yolo). The value range is [0.0, 1.0]. -*@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n -*@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). 
Defaults to "1024". -* -*@par Outputs: -*@li boxout: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence. -*@li boxoutnum: An NCHW tensor of type int32, specifying the number of output boxes. - -*@attention Constraints:\n -*@li This operator applies only to the YOLO v2 network. -*@li The preceding layer of operator Yolov2DetectionOutput must be one Yolo operator. - -*@see Yolo() -*/ -REG_OP(YoloV2DetectionOutput) - .INPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(obj_prob, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(classes_prob, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT})) - .REQUIRED_ATTR(biases, ListFloat) - .ATTR(boxes, Int, 5) - .ATTR(coords, Int, 4) - .ATTR(classes, Int, 80) - .ATTR(relative, Bool, true) - .ATTR(obj_threshold, Float, 0.5) - .ATTR(post_nms_topn, Int, 1024) - .ATTR(score_threshold, Float, 0.5) - .ATTR(iou_threshold, Float, 0.45) - .ATTR(pre_nms_topn, Int, 512) - .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(box_out_num, TensorType({DT_INT32})) - .OP_END_FACTORY_REG(YoloV2DetectionOutput) - -/** -*@brief Performs YOLO V2 detection. - -*@par Inputs: -*Six inputs, including: -*@li The outputs of operator Yolo at the preceding layer (that is, one Yolo operator on YOLO v2) are used as the inputs of operator Yolov2DetectionOutput. \n -Each Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. -*@li imginfo: A float16, describing the image information including the required image height and width \n -and the actual image height and width. -*@li windex: A windex tensor with shape [height, weight]. Has the same type as the inputs. [[0,1,2...(weight-1)],[0,1,2...(w-1)]...[0,1,2...(weight-1)]] consisting of h groups of [0, 1, 2...(weight-1)] is formed. \n - -*@li hindex: A hindex tensor with shape [height, weight]. Has the same type as the inputs. [[0,0...0],[1,1...1],[2,2...2]...[height-1,height-1...,height-1]]. \n - -* -*@par Attributes: -*@li biases: A required float. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" -*@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. -*@li coords: Specifies the number of coordinate parameters. Must be 4. -*@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 80]. -*@li relative: An optional bool. Defaults to and must be "true". -*@li obj_threshold: A required float, specifying the confidence threshold for box filtering, which is the output "obj" of operator Yolo). The value range is [0.0, 1.0]. -*@li post_nms_topn: An optional int32. This attribute is reserved. -*@li score_threshold: A required float, specifying the class score threshold for box filtering, which is the output "class" of operator Yolo). The value range is [0.0, 1.0]. - -*@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n -*@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "1024". -* -*@par Outputs: -*@li boxout: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence. -*@li boxoutnum: An NCHW tensor of type int32, specifying the number of output boxes. 
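The "windex" and "hindex" inputs described above are plain index grids over the feature map (the comments' "weight" reads as "width"). A minimal host-side sketch of how such constant tensors could be filled; the helper names BuildWindex/BuildHindex are hypothetical and not part of this patch:

```cpp
#include <cstddef>
#include <vector>

// Column-index grid of shape [height, width]:
// every row is 0, 1, ..., width - 1.
std::vector<float> BuildWindex(int height, int width) {
  std::vector<float> grid(static_cast<std::size_t>(height) * width);
  for (int h = 0; h < height; ++h)
    for (int w = 0; w < width; ++w)
      grid[static_cast<std::size_t>(h) * width + w] = static_cast<float>(w);
  return grid;
}

// Row-index grid of shape [height, width]: row h is filled with h.
std::vector<float> BuildHindex(int height, int width) {
  std::vector<float> grid(static_cast<std::size_t>(height) * width);
  for (int h = 0; h < height; ++h)
    for (int w = 0; w < width; ++w)
      grid[static_cast<std::size_t>(h) * width + w] = static_cast<float>(h);
  return grid;
}
```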
-* -*@attention Constraints:\n -*@li This operator applies only to the YOLO v2 network. -*@li The preceding layer of operator Yolov2DetectionOutput must be one Yolo operator. - -*@see Yolo() -*/ -REG_OP(YoloV2DetectionOutputD) - .INPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(obj_prob, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(classes_prob, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(windex, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(hindex, TensorType({DT_FLOAT16,DT_FLOAT})) - .REQUIRED_ATTR(biases, ListFloat) - .ATTR(boxes, Int, 5) - .ATTR(coords, Int, 4) - .ATTR(classes, Int, 80) - .ATTR(relative, Bool, true) - .ATTR(obj_threshold, Float, 0.5) - .ATTR(post_nms_topn, Int, 1024) - .ATTR(score_threshold, Float, 0.5) - .ATTR(iou_threshold, Float, 0.45) - .ATTR(pre_nms_topn, Int, 512) - .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(box_out_num, TensorType({DT_INT32})) - .OP_END_FACTORY_REG(YoloV2DetectionOutputD) - -/** *@brief Performs plane or channel conversion on YoloV2. * If reverse=true: (N, H, W, C)->(N, H*stride, W*stride, C/(stride*stride)) * If reverse=false: (N, H, W, C)->(N, H/stride, W/stride, C*(stride*stride)) diff --git a/third_party/fwkacllib/inc/ops/sparse_ops.h b/third_party/fwkacllib/inc/ops/sparse_ops.h index 87f44a54..5c50298c 100644 --- a/third_party/fwkacllib/inc/ops/sparse_ops.h +++ b/third_party/fwkacllib/inc/ops/sparse_ops.h @@ -215,7 +215,7 @@ REG_OP(SparseDenseCwiseMul) REG_OP(AddSparseToTensorsMap) .INPUT(indices, TensorType({DT_INT64})) .INPUT(values, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ - DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE \ + DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, \ DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING})) .INPUT(shape, TensorType({DT_INT64})) .OUTPUT(handle, TensorType({DT_INT64})) @@ -410,8 +410,6 @@ REG_OP(SparseToDense) * @li y_indices:A `Tensor` of type `int64`. * @li y_values:A `Tensor`. Has the same type as `values`. * @li y_shape:A `Tensor` of type `int64`. - -* Compatible SparseConcat operator in Tensorflow */ REG_OP(SparseConcat) .DYNAMIC_INPUT(indices, TensorType({DT_INT64})) @@ -452,8 +450,6 @@ REG_OP(SparseConcat) * @li sum_indices:A `Tensor` of type `int64`. * @li sum_values:A `Tensor`. Has the same type as `x1_values`. * @li sum_shape:A `Tensor` of type `int64`. - -* Compatible SparseAdd operator in Tensorflow */ REG_OP(SparseAdd) .INPUT(x1_indices, TensorType({DT_INT64})) @@ -489,8 +485,6 @@ REG_OP(SparseAdd) * @li y_values:A `Tensor`. Has the same type as `values`. * @li empty_row_indicator:A `Tensor` of type `bool`. * @li reverse_index_map:A `Tensor` of type `int64`. - -* Compatible SparseFillEmptyRows operator in Tensorflow */ REG_OP(SparseFillEmptyRows) .INPUT(indices, TensorType({DT_INT64})) @@ -529,8 +523,6 @@ REG_OP(SparseFillEmptyRows) *@par Outputs: * @li y_indices:A `Tensor` of type `int64`. * @li y_values:A `Tensor`. Has the same type as `x1_values`. - -* Compatible SparseSparseMaximum operator in Tensorflow */ REG_OP(SparseSparseMaximum) .INPUT(x1_indices, TensorType({DT_INT64})) @@ -564,8 +556,6 @@ REG_OP(SparseSparseMaximum) *@par Outputs: * @li y_indices:A `Tensor` of type `int64`. * @li y_values:A `Tensor`. Has the same type as `x1_values`. - -* Compatible SparseSparseMinimum operator in Tensorflow */ REG_OP(SparseSparseMinimum) .INPUT(x1_indices, TensorType({DT_INT64})) @@ -604,8 +594,6 @@ REG_OP(SparseSparseMinimum) *@par Outputs: * y:A `Tensor`. 
Has the same type as `input_values`. - -* Compatible SparseReduceMax operator in Tensorflow */ REG_OP(SparseReduceMax) .INPUT(x_indices, TensorType({DT_INT64})) @@ -640,8 +628,6 @@ REG_OP(SparseReduceMax) * @li y_indices:A `Tensor` of type `int64`. * @li y_values:A `Tensor`. Has the same type as `input_values`. * @li y_shape:A `Tensor` of type `int64`. - -* Compatible SparseReduceMaxSparse operator in Tensorflow */ REG_OP(SparseReduceMaxSparse) .INPUT(x_indices, TensorType({DT_INT64})) @@ -854,7 +840,7 @@ REG_OP(AddManySparseToTensorsMap) * The "N" serialized SparseTensor objects. *@par Attributes: -* @li dtype: A tf.DType. The "dtype" of the SparseTensor objects stored in the "SparseTensorsMap". +* @li dtype: A DType. The "dtype" of the SparseTensor objects stored in the "SparseTensorsMap". * @li container: An optional string. Defaults to "". \n *The container name for the "SparseTensorsMap" read by this op. * @li shared_name: An optional string. Defaults to "". \n diff --git a/third_party/fwkacllib/inc/ops/stateful_random_ops.h b/third_party/fwkacllib/inc/ops/stateful_random_ops.h index 929481d5..9ba09dd6 100644 --- a/third_party/fwkacllib/inc/ops/stateful_random_ops.h +++ b/third_party/fwkacllib/inc/ops/stateful_random_ops.h @@ -87,9 +87,9 @@ smaller than the range of the output (either `2^32` or `2^64`). REG_OP(StatefulRandomBinomial) .INPUT(x, TensorType({DT_RESOURCE})) .INPUT(algorithm, TensorType({DT_INT64})) - .INPUT(shape, TensorType({DT_INT32, DT_INT64})) - .INPUT(counts, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64})) - .INPUT(probs, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64})) + .INPUT(shape, TensorType({DT_INT32})) + .INPUT(counts, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .INPUT(probs, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64})) .REQUIRED_ATTR(dtype, Type) .OP_END_FACTORY_REG(StatefulRandomBinomial) @@ -111,7 +111,7 @@ REG_OP(StatefulRandomBinomial) REG_OP(StatefulStandardNormalV2) .INPUT(x, TensorType({DT_RESOURCE})) .INPUT(algorithm, TensorType({DT_INT64})) - .INPUT(shape, TensorType({DT_INT64})) + .INPUT(shape, TensorType({DT_INT32,DT_INT64})) .OUTPUT(y, TensorType({DT_FLOAT})) .OP_END_FACTORY_REG(StatefulStandardNormalV2) @@ -134,7 +134,7 @@ REG_OP(StatefulStandardNormalV2) REG_OP(StatefulTruncatedNormal) .INPUT(x, TensorType({DT_RESOURCE})) .INPUT(algorithm, TensorType({DT_INT64})) - .INPUT(shape, TensorType({DT_INT64})) + .INPUT(shape, TensorType({DT_INT32,DT_INT64})) .OUTPUT(y, TensorType({DT_FLOAT})) .OP_END_FACTORY_REG(StatefulTruncatedNormal) @@ -156,7 +156,7 @@ lower bound 0 is included in the range, while the upper bound 1 is excluded. \n REG_OP(StatefulUniform) .INPUT(x, TensorType({DT_RESOURCE})) .INPUT(algorithm, TensorType({DT_INT64})) - .INPUT(shape, TensorType({DT_INT64})) + .INPUT(shape, TensorType({DT_INT32,DT_INT64})) .OUTPUT(y, TensorType({DT_FLOAT})) .OP_END_FACTORY_REG(StatefulUniform) @@ -177,8 +177,8 @@ The generated values are uniform integers covering the whole range of `dtype`. REG_OP(StatefulUniformFullInt) .INPUT(x, TensorType({DT_RESOURCE})) .INPUT(algorithm, TensorType({DT_INT64})) - .INPUT(shape, TensorType({DT_INT64})) - .OUTPUT(y, TensorType({DT_INT64})) + .INPUT(shape, TensorType({DT_INT32,DT_INT64})) + .OUTPUT(y, TensorType({DT_UINT64})) .OP_END_FACTORY_REG(StatefulUniformFullInt) /** @@ -205,7 +205,7 @@ smaller than the range of the output (either `2^32` or `2^64`). 
REG_OP(StatefulUniformInt) .INPUT(x, TensorType({DT_RESOURCE})) .INPUT(algorithm, TensorType({DT_INT64})) - .INPUT(shape, TensorType({DT_INT64})) + .INPUT(shape, TensorType({DT_INT32,DT_INT64})) .INPUT(minval, TensorType({DT_INT64})) .INPUT(maxval, TensorType({DT_INT64})) .OUTPUT(y, TensorType({DT_INT64})) diff --git a/third_party/fwkacllib/inc/ops/string_ops.h b/third_party/fwkacllib/inc/ops/string_ops.h index 9b87817f..1b88fbd0 100644 --- a/third_party/fwkacllib/inc/ops/string_ops.h +++ b/third_party/fwkacllib/inc/ops/string_ops.h @@ -127,7 +127,7 @@ include: \n *inputs are trusted or unimportant. There is a risk of adversaries\n *constructing inputs that all hash to the same bucket.\n *To prevent this problem, use a strong hash function with\n -*tf.string_to_hash_bucket_strong. +*string_to_hash_bucket_strong. *@see Substr() @@ -155,7 +155,7 @@ include: \n *This function may be used when CPU time is scarce and inputs are trusted or\n *unimportant. There is a risk of adversaries constructing inputs that all hash\n *to the same bucket. To prevent this problem, use a strong hash function with\n -*tf.string_to_hash_bucket_strong. +*string_to_hash_bucket_strong. *@see StringToHashBucketFast() @@ -187,7 +187,7 @@ include: \n * hash value distribution over buckets. This requires that the hash function\ *is seeded by a high-entropy (random) "key" unknown to the adversary. *@li The additional robustness comes at a cost of roughly 4x higher\n -*compute time than tf.string_to_hash_bucket_fast. +*compute time than string_to_hash_bucket_fast. *@see StringToHashBucketStrong() diff --git a/third_party/fwkacllib/inc/ops/transformation_ops.h b/third_party/fwkacllib/inc/ops/transformation_ops.h index 689cde4e..69dd450f 100644 --- a/third_party/fwkacllib/inc/ops/transformation_ops.h +++ b/third_party/fwkacllib/inc/ops/transformation_ops.h @@ -400,8 +400,8 @@ REG_OP(Unpack) * "ksizes", "strides" and "rates" are lists of integers. */ REG_OP(ExtractImagePatches) - .INPUT(x, TensorType::REALNUMBERTYPE()) - .OUTPUT(y, TensorType::REALNUMBERTYPE()) + .INPUT(x, TensorType::RealNumberType()) + .OUTPUT(y, TensorType::RealNumberType()) .REQUIRED_ATTR(ksizes, ListInt) .REQUIRED_ATTR(strides, ListInt) .REQUIRED_ATTR(rates, ListInt) @@ -409,6 +409,37 @@ REG_OP(ExtractImagePatches) .OP_END_FACTORY_REG(ExtractImagePatches) /** +* @brief Extract "patches" from "input" and put them in the "depth" +* dimension of the output. + +* @par Inputs: +* x: A 5D Tensor with shape [batch, in_planes, in_rows, in_cols, depth]. + +* @par Attributes: +* @li ksizes: A required list or tuple. The size of the sliding window for each +* dimension of "x". +* @li strides: A required list or tuple. How far the centers of two consecutive +* patches are in "x". Must be: [1, stride_planes, stride_rows, stride_cols, 1]. +* @li padding: A required string. The type of padding algorithm to use. + +* @par Outputs: +* Output: A 5D Tensor with shape [batch, out_planes, out_rows, out_cols, ksize_planes * \n +* ksize_rows * ksize_cols * depth] containing patches with size (ksize_rows * ksize_cols\n +* * depth) vectorized in the "depth" dimension. Note "out_planes", "out_rows" and "out_cols"\n +* are the dimensions of the output patches. + +* @attention Constraints: +* "ksizes" and "strides" are lists of integers. 
+*/
+REG_OP(ExtractVolumePatches)
+    .INPUT(x, TensorType::REALNUMBERTYPE())
+    .OUTPUT(y, TensorType::REALNUMBERTYPE())
+    .REQUIRED_ATTR(ksizes, ListInt)
+    .REQUIRED_ATTR(strides, ListInt)
+    .REQUIRED_ATTR(padding, String)
+    .OP_END_FACTORY_REG(ExtractVolumePatches)
+
+/**
 *@brief Confuse reshape and transpose.

 *@par Inputs:
@@ -466,7 +497,7 @@ REG_OP(ConfusionTranspose)
 *y: The flattened ND tensor. All data types are supported.

 *@attention Constraints:
-* "axis" and "end_axis" must be within the dimension range of the input.
+* "axis" and "end_axis" must be within the dimension range of the input. This operator cannot be directly called by the aclopExecute API.
 */
 REG_OP(FlattenV2)
   .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_UINT8, DT_INT16, DT_UINT16,
diff --git a/third_party/fwkacllib/inc/register/op_kernel_registry.h b/third_party/fwkacllib/inc/register/op_kernel_registry.h
index 47bdca07..2c479e92 100644
--- a/third_party/fwkacllib/inc/register/op_kernel_registry.h
+++ b/third_party/fwkacllib/inc/register/op_kernel_registry.h
@@ -18,7 +18,8 @@
 #define INC_REGISTER_OP_KERNEL_REGISTRY_H_
 #include <memory>
 #include <string>
-#include "register/register.h"
+#include "register/register_types.h"
+#include "register.h"

 namespace ge {
 class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpKernelRegistry {
@@ -40,7 +41,6 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpKernelRegistry {
 private:
  OpKernelRegistry();
  class OpKernelRegistryImpl;
-  /*lint -e148*/
  std::unique_ptr<OpKernelRegistryImpl> impl_;
 };
}  // namespace ge
diff --git a/third_party/fwkacllib/inc/register/register.h b/third_party/fwkacllib/inc/register/register.h
new file mode 100644
index 00000000..27da0b0b
--- /dev/null
+++ b/third_party/fwkacllib/inc/register/register.h
@@ -0,0 +1,53 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef INC_REGISTER_REGISTRY_H_
+#define INC_REGISTER_REGISTRY_H_
+
+#include "external/register/register.h"
+
+namespace ge {
+class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY HostCpuOp {
+ public:
+  HostCpuOp() = default;
+  virtual ~HostCpuOp() = default;
+
+  virtual graphStatus Compute(Operator &op,
+                              const std::map<std::string, const Tensor> &inputs,
+                              std::map<std::string, Tensor> &outputs) = 0;
+};
+
+class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY HostCpuOpRegistrar {
+ public:
+  HostCpuOpRegistrar(const char *op_type, HostCpuOp *(*create_fn)());
+  ~HostCpuOpRegistrar() = default;
+};
+
+#define REGISTER_HOST_CPU_OP_BUILDER(name, op) \
+    REGISTER_HOST_CPU_OP_BUILDER_UNIQ_HELPER(__COUNTER__, name, op)
+
+#define REGISTER_HOST_CPU_OP_BUILDER_UNIQ_HELPER(ctr, name, op) \
+    REGISTER_HOST_CPU_OP_BUILDER_UNIQ(ctr, name, op)
+
+#define REGISTER_HOST_CPU_OP_BUILDER_UNIQ(ctr, name, op)        \
+  static ::ge::HostCpuOpRegistrar register_host_cpu_op##ctr     \
+      __attribute__((unused)) =                                 \
+          ::ge::HostCpuOpRegistrar(name, []()->::ge::HostCpuOp* { \
+            return new (std::nothrow) op();                     \
+          })
+}  // namespace ge
+
+#endif  // INC_REGISTER_REGISTRY_H_
diff --git a/third_party/fwkacllib/inc/runtime/kernel.h b/third_party/fwkacllib/inc/runtime/kernel.h
index 1609519f..c99eb96f 100644
--- a/third_party/fwkacllib/inc/runtime/kernel.h
+++ b/third_party/fwkacllib/inc/runtime/kernel.h
@@ -448,7 +448,7 @@ RTS_API rtError_t rtSubscribeReport(uint64_t threadId, rtStream_t stream);
 * @param [in] stream subscribed stream
 * @return RT_ERROR_NONE for ok, errno for failed
 */
-RTS_API rtError_t rtCallbackLaunch(rtCallback_t callBackFunc, void *fnData, rtStream_t stream);
+RTS_API rtError_t rtCallbackLaunch(rtCallback_t callBackFunc, void *fnData, rtStream_t stream, bool isBlock);

 /**
 * @ingroup rt_kernel
diff --git a/third_party/fwkacllib/inc/runtime/mem.h b/third_party/fwkacllib/inc/runtime/mem.h
index b55530a1..93b7585a 100644
--- a/third_party/fwkacllib/inc/runtime/mem.h
+++ b/third_party/fwkacllib/inc/runtime/mem.h
@@ -17,9 +17,7 @@
 #ifndef __CCE_RUNTIME_MEM_H__
 #define __CCE_RUNTIME_MEM_H__

-/*lint -e7*/
 #include <stddef.h>
-/*lint +e7*/
 #include "base.h"
 #include "config.h"
 #include "stream.h"
@@ -77,6 +75,8 @@ typedef enum tagRtMemcpyKind {
   RT_MEMCPY_DEVICE_TO_HOST,    // device to host
   RT_MEMCPY_DEVICE_TO_DEVICE,  // device to device, 1P && P2P
   RT_MEMCPY_MANAGED,           // managed memory
+  RT_MEMCPY_ADDR_DEVICE_TO_DEVICE,
+  RT_MEMCPY_HOST_TO_DEVICE_EX,  // host to device ex (only used for 8 bytes)
   RT_MEMCPY_RESERVED,
 } rtMemcpyKind_t;
diff --git a/third_party/fwkacllib/inc/runtime/rt_model.h b/third_party/fwkacllib/inc/runtime/rt_model.h
index 1e03e853..d4e5682b 100644
--- a/third_party/fwkacllib/inc/runtime/rt_model.h
+++ b/third_party/fwkacllib/inc/runtime/rt_model.h
@@ -45,7 +45,8 @@ typedef enum tagModelTaskType {
   RT_MODEL_TASK_EVENT_RESET = 18,
   RT_MODEL_TASK_MODEL_END_GRAPH,
   RT_MODEL_TASK_STREAM_SWITCH_N,
-  RT_MODEL_TASK_RDMA_DB_SEND
+  RT_MODEL_TASK_RDMA_DB_SEND,
+  RT_MODEL_TASK_MEMCPY_ADDR_ASYNC
 } rtModelTaskType_t;

 typedef enum tagModelStreamType {
diff --git a/third_party/fwkacllib/inc/toolchain/slog.h b/third_party/fwkacllib/inc/toolchain/slog.h
index 1fb9aff2..2728c812 100644
--- a/third_party/fwkacllib/inc/toolchain/slog.h
+++ b/third_party/fwkacllib/inc/toolchain/slog.h
@@ -168,6 +168,7 @@ enum {
   DSS,
   PROCMGR,  // Process Manager, Base Platform
   BBOX,
+  AIVECTOR,
   INVLID_MOUDLE_ID
 };

@@ -241,6 +242,7 @@ static DCODE g_moduleIdName[] = {SET_MOUDLE_ID_MAP_NAME(SLOG),
                                  SET_MOUDLE_ID_MAP_NAME(DSS),
                                  SET_MOUDLE_ID_MAP_NAME(PROCMGR),
                                  SET_MOUDLE_ID_MAP_NAME(BBOX),
+
SET_MOUDLE_ID_MAP_NAME(AIVECTOR), { NULL, -1 }}; #endif // MODULE_ID_NAME diff --git a/third_party/fwkacllib/version.info b/third_party/fwkacllib/version.info index 0e65dd04..8bc7f6e0 100644 --- a/third_party/fwkacllib/version.info +++ b/third_party/fwkacllib/version.info @@ -1 +1 @@ -Version=1.60.T49.0.B201 +Version=1.71.T6.0.B070
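For the HostCpuOp plug-in interface added in third_party/fwkacllib/inc/register/register.h above, a minimal usage sketch. The kernel class, the op type string "PassThrough", and the tensor names "x"/"y" are hypothetical; only HostCpuOp, Compute, and REGISTER_HOST_CPU_OP_BUILDER come from the header:

```cpp
#include <map>
#include <string>

#include "register/register.h"  // declares HostCpuOp and the builder macro

namespace {
// A trivial pass-through kernel: copies input "x" to output "y".
class PassThroughKernel : public ge::HostCpuOp {
 public:
  ge::graphStatus Compute(ge::Operator &op,
                          const std::map<std::string, const ge::Tensor> &inputs,
                          std::map<std::string, ge::Tensor> &outputs) override {
    (void)op;  // the Operator handle is unused in this sketch
    auto iter = inputs.find("x");
    if (iter == inputs.end()) {
      return ge::GRAPH_FAILED;
    }
    outputs["y"] = iter->second;
    return ge::GRAPH_SUCCESS;
  }
};
}  // namespace

// Expands to a file-local ge::HostCpuOpRegistrar whose factory lambda
// returns "new (std::nothrow) PassThroughKernel()".
REGISTER_HOST_CPU_OP_BUILDER("PassThrough", PassThroughKernel);
```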