diff --git a/build.sh b/build.sh index 7b1c0792..5931bbaa 100644 --- a/build.sh +++ b/build.sh @@ -173,7 +173,7 @@ build_graphengine() TARGET="ge_runner fwk_atc.bin ${TARGET}" elif [ "x${PLATFORM}" = "xinference" ] then - TARGET="ge_compiler atc_atc.bin opensrc_ascendcl ${TARGET}" + TARGET="ge_compiler atc_atc.bin ge_executor_shared ${TARGET}" elif [ "X$ENABLE_GE_UT" = "Xon" ] then TARGET="ut_libgraph ut_libge_multiparts_utest ut_libge_others_utest ut_libge_kernel_utest ut_libge_distinct_load_utest" @@ -183,7 +183,7 @@ build_graphengine() elif [ "x${PLATFORM}" = "xall" ] then # build all the target - TARGET="ge_runner ge_compiler fwk_atc.bin atc_atc.bin opensrc_ascendcl ${TARGET}" + TARGET="ge_runner ge_compiler fwk_atc.bin atc_atc.bin ge_executor_shared ${TARGET}" fi make ${VERBOSE} ${TARGET} -j${THREAD_NUM} && make install @@ -250,6 +250,7 @@ generate_package() NNENGINE_PATH="plugin/nnengine/ge_config" OPSKERNEL_PATH="plugin/opskernel" + ACL_LIB=("libge_common.so" "libgraph.so" "libregister.so" "liberror_manager.so" "libge_executor.so") ATC_LIB=("libc_sec.so" "libge_common.so" "libge_compiler.so" "libgraph.so" "libregister.so" "liberror_manager.so") FWK_LIB=("libge_common.so" "libge_runner.so" "libgraph.so" "libregister.so" "liberror_manager.so") PLUGIN_OPSKERNEL=("libge_local_engine.so" "libge_local_opskernel_builder.so" "libhost_cpu_engine.so" "libhost_cpu_opskernel_builder.so" "optimizer_priority.pbtxt") @@ -303,6 +304,11 @@ generate_package() find ${OUTPUT_PATH}/${GRAPHENGINE_LIB_PATH} -maxdepth 1 -name "$lib" -exec cp -f {} ${OUTPUT_PATH}/${FWK_PATH} \; done + for lib in "${ACL_LIB[@]}"; + do + find ${OUTPUT_PATH}/${GRAPHENGINE_LIB_PATH} -maxdepth 1 -name "$lib" -exec cp -f {} ${OUTPUT_PATH}/${ACL_PATH} \; + done + for lib in "${ATC_LIB[@]}"; do find ${OUTPUT_PATH}/${GRAPHENGINE_LIB_PATH} -maxdepth 1 -name "$lib" -exec cp -f {} ${OUTPUT_PATH}/${ATC_PATH} \; @@ -310,7 +316,6 @@ generate_package() find ./lib/atclib -name atc.bin -exec cp {} "${OUTPUT_PATH}/${ATC_BIN_PATH}" \; find ./lib/fwkacl -name atc.bin -exec cp {} "${OUTPUT_PATH}/${FWK_BIN_PATH}" \; - find ${OUTPUT_PATH}/${GRAPHENGINE_LIB_PATH} -maxdepth 1 -name "libascendcl.so" -exec cp -f {} ${OUTPUT_PATH}/${ACL_PATH} \; cp -r ${OUTPUT_PATH}/../metadef/inc/external/* ${ATC_INCLUDE_PATH} cp -r ${OUTPUT_PATH}/../parser/inc/external/* ${ATC_INCLUDE_PATH} diff --git a/ge/CMakeLists.txt b/ge/CMakeLists.txt index d84bb89a..1698710e 100755 --- a/ge/CMakeLists.txt +++ b/ge/CMakeLists.txt @@ -25,7 +25,7 @@ set(PROTO_HEADER_LIST "${METADEF_DIR}/proto/insert_op.proto" "${METADEF_DIR}/proto/ge_ir.proto" "${METADEF_DIR}/proto/fwk_adapter.proto" - "${METADEF_DIR}/proto/op_mapping_info.proto" + "${METADEF_DIR}/proto/op_mapping.proto" ) protobuf_generate(ge PROTO_SRCS PROTO_HDRS ${PROTO_LIST}) @@ -108,6 +108,7 @@ set(TRAIN_SRC_LIST "common/helper/model_cache_helper.cc" "common/profiling/profiling_manager.cc" "common/dump/dump_manager.cc" + "common/dump/exception_dumper.cc" "common/dump/dump_properties.cc" "common/dump/opdebug_register.cc" "common/dump/dump_op.cc" @@ -339,7 +340,6 @@ set(TRAIN_SRC_LIST "init/gelib.cc" "model/ge_model.cc" "model/ge_root_model.cc" - "omm/csa_interact.cc" "opskernel_manager/ops_kernel_manager.cc" "opskernel_manager/ops_kernel_builder_manager.cc" "session/inner_session.cc" @@ -401,7 +401,7 @@ set(TRAIN_SRC_LIST "ir_build/attr_options/utils.cc" "ir_build/attr_options/keep_dtype_option.cc" "ir_build/attr_options/weight_compress_option.cc" - "ir_build/atc_ir_common.cc" + "ir_build/option_utils.cc" 
"graph/build/memory/memory_assigner.cc" "graph/build/memory/graph_mem_assigner.cc" "graph/build/memory/binary_block_mem_assigner.cc" @@ -414,7 +414,6 @@ set(TRAIN_SRC_LIST set(INFER_SRC_LIST "graph/manager/trans_var_data_utils.cc" - "omm/csa_interact.cc" "common/fp16_t.cc" "common/formats/utils/formats_trans_utils.cc" "common/formats/format_transfers/datatype_transfer.cc" @@ -437,6 +436,7 @@ set(INFER_SRC_LIST "common/formats/formats.cc" "common/profiling/profiling_manager.cc" "common/dump/dump_properties.cc" + "common/dump/exception_dumper.cc" "common/dump/dump_manager.cc" "common/dump/dump_op.cc" "common/dump/opdebug_register.cc" @@ -661,7 +661,7 @@ set(INFER_SRC_LIST "ir_build/attr_options/utils.cc" "ir_build/attr_options/keep_dtype_option.cc" "ir_build/attr_options/weight_compress_option.cc" - "ir_build/atc_ir_common.cc" + "ir_build/option_utils.cc" "graph/preprocess/insert_op/ge_aipp_op.cc" "graph/preprocess/insert_op/util_insert_aipp_op.cc" "hybrid/node_executor/aicpu/aicpu_ext_info.cc" diff --git a/ge/client/ge_api.cc b/ge/client/ge_api.cc index 2af82e4d..8f6fba95 100644 --- a/ge/client/ge_api.cc +++ b/ge/client/ge_api.cc @@ -100,7 +100,7 @@ Status GEInitializeImpl(const std::map &options) { GELOGW("GEInitialize is called more than once"); return SUCCESS; } - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kOpsProtoInit); + ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOpsProtoInit); // Load OpsProto lib plugin std::string opsproto_path; GetOpsProtoPath(opsproto_path); @@ -119,7 +119,7 @@ Status GEInitializeImpl(const std::map &options) { return FAILED; } - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOther); // check options is valid GE_TIMESTAMP_START(CheckOptionsValid); if (CheckOptionsValid(options) != SUCCESS) { @@ -127,13 +127,13 @@ Status GEInitializeImpl(const std::map &options) { } GE_TIMESTAMP_END(CheckOptionsValid, "GEInitialize::CheckOptionsValid"); - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kOpsProtoInit); + ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOpsProtoInit); GE_TIMESTAMP_START(InitPreparation); TBEPluginManager::Instance().InitPreparation(options); GE_TIMESTAMP_END(InitPreparation, "GEInitialize::InitPreparation"); // call Initialize GELOGT(TRACE_RUNNING, "Initializing environment"); - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOther); GE_TIMESTAMP_START(GELibInitialize); ret = ge::GELib::Initialize(options); GE_TIMESTAMP_END(GELibInitialize, "GEInitialize::GELibInitialize"); @@ -154,7 +154,7 @@ Status GEInitializeImpl(const std::map &options) { // Initialize GE, prepare for execution, call GELib::Initialize Status GEInitialize(const std::map &options) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOther); if (DlogReportInitialize() != SUCCESS) { GELOGW("Dlog report device log initialize failed."); } @@ -162,7 +162,7 @@ Status GEInitialize(const std::map &options) { } Status GEInitialize(const std::map &options) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kOther); + 
ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOther); std::map str_options; for (auto &option : options) { if (option.first.GetString() == nullptr || option.second.GetString() == nullptr) { @@ -191,7 +191,7 @@ Status GEFinalize() { return SUCCESS; } - ErrorManager::GetInstance().SetStage(ErrorMessage::kFinalize, ErrorMessage::kFinalize); + ErrorManager::GetInstance().SetStage(error_message::kFinalize, error_message::kFinalize); ErrorManager::GetInstance().GenWorkStreamIdDefault(); GELOGT(TRACE_INIT, "GEFinalize start"); @@ -243,7 +243,7 @@ std::string GEGetWarningMsg() { // Initialize session,which calls innerSession Session::Session(const std::map &options) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOther); GELOGT(TRACE_INIT, "Session Constructor start"); ErrorManager::GetInstance().GenWorkStreamIdDefault(); @@ -259,7 +259,7 @@ Session::Session(const std::map &options) { // call Initialize std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, + GELOGE(GE_CLI_GE_NOT_INITIALIZED, "[Construct][Session]Failed, GELib instance is nullptr or it is not InitFlag"); return; } @@ -280,7 +280,7 @@ Session::Session(const std::map &options) { } Session::Session(const std::map &options) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOther); GELOGT(TRACE_INIT, "Session Constructor start"); ErrorManager::GetInstance().GenWorkStreamIdDefault(); @@ -331,7 +331,7 @@ Session::Session(const std::map &options) { // session destructor Session::~Session() { - ErrorManager::GetInstance().SetStage(ErrorMessage::kFinalize, ErrorMessage::kFinalize); + ErrorManager::GetInstance().SetStage(error_message::kFinalize, error_message::kFinalize); GELOGT(TRACE_INIT, "Session Destructor start"); // 0.check init status if (!g_ge_initialized) { @@ -371,7 +371,7 @@ Session::~Session() { // Add Graph Status Session::AddGraph(uint32_t graph_id, const Graph &graph) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); std::map options; ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); return AddGraph(graph_id, graph, options); @@ -379,7 +379,7 @@ Status Session::AddGraph(uint32_t graph_id, const Graph &graph) { // Add Graph Status Session::AddGraph(uint32_t graph_id, const Graph &graph, const std::map &options) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); GELOGT(TRACE_INIT, "Start to add graph in Session. 
graph_id: %u, session_id: %lu.", graph_id, sessionId_); ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); std::shared_ptr instance_ptr = ge::GELib::GetInstance(); @@ -393,7 +393,7 @@ Status Session::AddGraph(uint32_t graph_id, const Graph &graph, const std::mapSessionManagerObj().AddGraph(sessionId_, graph_id, graph, options); if (ret != SUCCESS) { - GELOGE(ret, + GELOGE(ret, "[Add][Graph]Failed, error code:%u, session_id:%lu, graph_id:%u.", ret, sessionId_, graph_id); return FAILED; @@ -405,7 +405,7 @@ Status Session::AddGraph(uint32_t graph_id, const Graph &graph, const std::map &options) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); GELOGT(TRACE_INIT, "Start to add graph in Session. graph_id: %u, session_id: %lu.", graph_id, sessionId_); ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); std::shared_ptr instance_ptr = ge::GELib::GetInstance(); @@ -441,7 +441,7 @@ Status Session::AddGraph(uint32_t graph_id, const Graph &graph, } Status Session::AddGraphWithCopy(uint32_t graph_id, const Graph &graph) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); std::map options; return AddGraphWithCopy(graph_id, graph, options); @@ -450,7 +450,7 @@ Status Session::AddGraphWithCopy(uint32_t graph_id, const Graph &graph) { // Add Graph With Copy Status Session::AddGraphWithCopy(uint32_t graph_id, const Graph &graph, const std::map &options) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); GELOGT(TRACE_INIT, "Start to add graph in Session. 
graph_id: %u, session_id: %lu.", graph_id, sessionId_); ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); std::shared_ptr instance_ptr = ge::GELib::GetInstance(); @@ -479,7 +479,7 @@ Status Session::AddGraphWithCopy(uint32_t graph_id, const Graph &graph, // Remove Graph Status Session::RemoveGraph(uint32_t graph_id) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); GELOGT(TRACE_INIT, "Session RemoveGraph start"); ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); @@ -560,7 +560,7 @@ void PrintOutputResult(std::vector &outputs) { // Run Graph Status Session::RunGraph(uint32_t graph_id, const std::vector &inputs, std::vector &outputs) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); GELOGT(TRACE_INIT, "Session RunGraph start"); ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); @@ -615,7 +615,7 @@ Status Session::RegisterCallBackFunc(const char *key, const session::pCallBackFu // Build Graph Status Session::BuildGraph(uint32_t graph_id, const std::vector &inputs) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { @@ -643,7 +643,7 @@ Status Session::BuildGraph(uint32_t graph_id, const std::vector // Run Graph Asynchronously Status Session::RunGraphAsync(uint32_t graph_id, const std::vector &inputs, RunAsyncCallback callback) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelExecute, ErrorMessage::kModelExecute); + ErrorManager::GetInstance().SetStage(error_message::kModelExecute, error_message::kModelExecute); ErrorManager::GetInstance().GenWorkStreamIdBySessionGraph(sessionId_, graph_id); std::shared_ptr instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { @@ -672,11 +672,11 @@ Status Session::RunGraphAsync(uint32_t graph_id, const std::vector &var_names, std::vector &var_values) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelExecute, ErrorMessage::kModelExecute); + ErrorManager::GetInstance().SetStage(error_message::kModelExecute, error_message::kModelExecute); ErrorManager::GetInstance().GenWorkStreamIdDefault(); auto instance_ptr = ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { - GELOGE(GE_CLI_GE_NOT_INITIALIZED, + GELOGE(GE_CLI_GE_NOT_INITIALIZED, "[Get][Variables]Failed, the GELib instance is nullptr or is not InitFlag."); REPORT_INNER_ERROR("E19999", "GetVariables failed, the GELib instance is nullptr or is not InitFlag."); @@ -693,7 +693,7 @@ Status Session::GetVariables(const std::vector &var_names, std::vec // Get Variables Status Session::GetVariables(const std::vector &var_names, std::vector &var_values) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelExecute, ErrorMessage::kModelExecute); + ErrorManager::GetInstance().SetStage(error_message::kModelExecute, error_message::kModelExecute); ErrorManager::GetInstance().GenWorkStreamIdDefault(); auto instance_ptr = 
ge::GELib::GetInstance(); if (instance_ptr == nullptr || !instance_ptr->InitFlag()) { diff --git a/ge/client/proto/ge_ir.proto b/ge/client/proto/ge_ir.proto index 12989a54..c0ef3071 100644 --- a/ge/client/proto/ge_ir.proto +++ b/ge/client/proto/ge_ir.proto @@ -31,6 +31,8 @@ enum DataType DT_STRING_REF = 24; // string_ref type DT_DUAL = 25; /**< dual output type */ DT_VARIANT = 26; // variant type + DT_BF16 = 27; // bf16 type + DT_INT4 = 28; // int4 type } message AttrDef diff --git a/ge/common/auth/file_saver.cc b/ge/common/auth/file_saver.cc index cb297d88..50dcf776 100755 --- a/ge/common/auth/file_saver.cc +++ b/ge/common/auth/file_saver.cc @@ -48,8 +48,8 @@ Status FileSaver::OpenFile(int32_t &fd, const std::string &file_path) { fd = mmOpen2(real_path, M_RDWR | M_CREAT | O_TRUNC, mode); if (fd == EN_INVALID_PARAM || fd == EN_ERROR) { // -1: Failed to open file; - 2: Illegal parameter - GELOGE(FAILED, "[Open][File]Failed. mmpa_errno = %d, %s", fd, strerror(errno)); - REPORT_INNER_ERROR("E19999", "Open file failed, mmpa_errno = %d, error:%s.", + GELOGE(FAILED, "[Open][File]Failed. errno:%d, errmsg:%s", fd, strerror(errno)); + REPORT_INNER_ERROR("E19999", "Open file failed, errno:%d, errmsg:%s.", fd, strerror(errno)); return FAILED; } @@ -67,9 +67,9 @@ Status FileSaver::WriteData(const void *data, uint32_t size, int32_t fd) { while (size > size_1g) { write_count = mmWrite(fd, reinterpret_cast(seek), size_1g); if (write_count == EN_INVALID_PARAM || write_count == EN_ERROR) { - GELOGE(FAILED, "[Write][Data]Failed, mmpa_errorno = %ld, error:%s", + GELOGE(FAILED, "[Write][Data]Failed, errno:%ld, errmsg:%s", write_count, strerror(errno)); - REPORT_INNER_ERROR("E19999", "Write data failed, mmpa_errorno = %ld, error:%s.", + REPORT_INNER_ERROR("E19999", "Write data failed, errno:%ld, errmsg:%s.", write_count, strerror(errno)); return FAILED; } diff --git a/ge/common/debug/memory_dumper.cc b/ge/common/debug/memory_dumper.cc index f500e4cc..78ef2daa 100644 --- a/ge/common/debug/memory_dumper.cc +++ b/ge/common/debug/memory_dumper.cc @@ -59,17 +59,17 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status MemoryDumper::DumpToFile int32_t mmpa_ret = mmWrite(fd, data, len); // mmWrite return -1:Failed to write data to file;return -2:Invalid parameter if (mmpa_ret == EN_ERROR || mmpa_ret == EN_INVALID_PARAM) { - GELOGE(FAILED, "[Write][Data]Failed, errno = %d, error:%s", mmpa_ret, strerror(errno)); - REPORT_INNER_ERROR("E19999", "Write data failed, errno = %d, error:%s.", + GELOGE(FAILED, "[Write][Data]Failed, errno:%d, errmsg:%s", mmpa_ret, strerror(errno)); + REPORT_INNER_ERROR("E19999", "Write data failed, errno:%d, errmsg:%s.", mmpa_ret, strerror(errno)); ret = FAILED; } // Close the file if (mmClose(fd) != EN_OK) { // mmClose return 0: success - GELOGE(FAILED, "[Close][File]Failed, error_code:%u, filename:%s.", ret, filename); - REPORT_INNER_ERROR("E19999", "Close file failed, error_code:%u, filename:%s.", - ret, filename); + GELOGE(FAILED, "[Close][File]Failed, error_code:%u, filename:%s errmsg:%s.", ret, filename, strerror(errno)); + REPORT_INNER_ERROR("E19999", "Close file failed, error_code:%u, filename:%s errmsg:%s.", + ret, filename, strerror(errno)); ret = FAILED; } @@ -111,8 +111,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status MemoryDumper::Dump(void int32_t mmpa_ret = mmWrite(fd_, data, len); // mmWrite return -1:failed to write data to file;return -2:invalid parameter if (mmpa_ret == EN_ERROR || mmpa_ret == EN_INVALID_PARAM) { - GELOGE(FAILED, "[Write][Data]Failed, errno = 
%d, error:%s", mmpa_ret, strerror(errno)); - REPORT_INNER_ERROR("E19999", "Write data to file failed, errno = %d, error:%s.", + GELOGE(FAILED, "[Write][Data]Failed, errno:%d, errmsg:%s", mmpa_ret, strerror(errno)); + REPORT_INNER_ERROR("E19999", "Write data to file failed, errno:%d, errmsg:%s.", mmpa_ret, strerror(errno)); return FAILED; } @@ -128,7 +128,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status MemoryDumper::Dump(void void MemoryDumper::Close() noexcept { // Close file if (fd_ != kInvalidFd && mmClose(fd_) != EN_OK) { - GELOGW("Close file failed."); + GELOGW("Close file failed, errmsg:%s.", strerror(errno)); } fd_ = kInvalidFd; } @@ -151,7 +151,7 @@ int MemoryDumper::OpenFile(const char *filename) { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(prefix_path.length() >= MMPA_MAX_PATH, return kInvalidFd, "Prefix path is too long!"); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(mmRealPath(prefix_path.c_str(), tmp_path, MMPA_MAX_PATH) != EN_OK, return kInvalidFd, - "Dir %s does not exit.", prefix_path.c_str()); + "Dir %s does not exit, errmsg:%s.", prefix_path.c_str(), strerror(errno)); real_path = std::string(tmp_path) + last_path;) GE_IF_BOOL_EXEC( path_split_pos == -1 || path_split_pos == 0, @@ -164,9 +164,9 @@ int MemoryDumper::OpenFile(const char *filename) { // Using the O_EXCL, if the file already exists,return failed to avoid privilege escalation vulnerability. mmMode_t mode = M_IRUSR | M_IWUSR; - int32_t fd = mmOpen2(real_path.c_str(), M_RDWR | M_CREAT | O_TRUNC, mode); + int32_t fd = mmOpen2(real_path.c_str(), M_RDWR | M_CREAT | M_APPEND, mode); if (fd == EN_ERROR || fd == EN_INVALID_PARAM) { - GELOGE(kInvalidFd, "[Open][File]Failed. errno = %d, error:%s, filename:%s.", + GELOGE(kInvalidFd, "[Open][File]Failed. errno:%d, errmsg:%s, filename:%s.", fd, strerror(errno), filename); return kInvalidFd; } diff --git a/ge/common/dump/dump_op.cc b/ge/common/dump/dump_op.cc index ae29b2a8..e9414b2f 100755 --- a/ge/common/dump/dump_op.cc +++ b/ge/common/dump/dump_op.cc @@ -26,7 +26,7 @@ #include "graph/op_desc.h" #include "graph/utils/tensor_utils.h" #include "proto/ge_ir.pb.h" -#include "proto/op_mapping_info.pb.h" +#include "proto/op_mapping.pb.h" #include "runtime/mem.h" #include "aicpu/common/aicpu_task_struct.h" @@ -64,7 +64,7 @@ void DumpOp::SetDynamicModelInfo(const string &dynamic_model_name, const string } static void SetOpMappingLoopAddr(uintptr_t step_id, uintptr_t loop_per_iter, uintptr_t loop_cond, - aicpu::dump::OpMappingInfo &op_mapping_info) { + toolkit::aicpu::dump::OpMappingInfo &op_mapping_info) { if (step_id != 0) { GELOGI("step_id exists."); op_mapping_info.set_step_id_addr(static_cast(step_id)); @@ -87,11 +87,11 @@ static void SetOpMappingLoopAddr(uintptr_t step_id, uintptr_t loop_per_iter, uin } } -Status DumpOp::DumpOutput(aicpu::dump::Task &task) { +Status DumpOp::DumpOutput(toolkit::aicpu::dump::Task &task) { GELOGI("Start dump output in Launch dump op"); const auto &output_descs = op_desc_->GetAllOutputsDesc(); for (size_t i = 0; i < output_descs.size(); ++i) { - aicpu::dump::Output output; + toolkit::aicpu::dump::Output output; output.set_data_type(static_cast(DataTypeUtil::GetIrDataType(output_descs.at(i).GetDataType()))); output.set_format(static_cast(output_descs.at(i).GetFormat())); for (auto dim : output_descs.at(i).GetShape().GetDims()) { @@ -116,11 +116,11 @@ Status DumpOp::DumpOutput(aicpu::dump::Task &task) { return SUCCESS; } -Status DumpOp::DumpInput(aicpu::dump::Task &task) { +Status DumpOp::DumpInput(toolkit::aicpu::dump::Task &task) { GELOGI("Start dump input in 
Launch dump op"); const auto &input_descs = op_desc_->GetAllInputsDesc(); for (size_t i = 0; i < input_descs.size(); ++i) { - aicpu::dump::Input input; + toolkit::aicpu::dump::Input input; input.set_data_type(static_cast(DataTypeUtil::GetIrDataType(input_descs.at(i).GetDataType()))); input.set_format(static_cast(input_descs.at(i).GetFormat())); @@ -155,7 +155,7 @@ void DumpOp::SetDumpInfo(const DumpProperties &dump_properties, const OpDescPtr stream_ = stream; } -Status DumpOp::ExecutorDumpOp(aicpu::dump::OpMappingInfo &op_mapping_info) { +Status DumpOp::ExecutorDumpOp(toolkit::aicpu::dump::OpMappingInfo &op_mapping_info) { std::string proto_msg; size_t proto_size = op_mapping_info.ByteSizeLong(); bool ret = op_mapping_info.SerializeToString(&proto_msg); @@ -216,7 +216,11 @@ Status DumpOp::ExecutorDumpOp(aicpu::dump::OpMappingInfo &op_mapping_info) { return SUCCESS; } -Status DumpOp::SetDumpModelName(aicpu::dump::OpMappingInfo &op_mapping_info) { +Status DumpOp::SetDumpModelName(toolkit::aicpu::dump::OpMappingInfo &op_mapping_info) { + if (dynamic_model_name_.empty() && dynamic_om_name_.empty()) { + GELOGI("Single op dump, no need set model name"); + return SUCCESS; + } std::set model_list = dump_properties_.GetAllDumpModel(); bool not_find_by_omname = model_list.find(dynamic_om_name_) == model_list.end(); bool not_find_by_modelname = model_list.find(dynamic_model_name_) == model_list.end(); @@ -232,7 +236,7 @@ Status DumpOp::SetDumpModelName(aicpu::dump::OpMappingInfo &op_mapping_info) { } } if (!dump_model_name.empty() && dump_properties_.IsDumpOpen()) { - GELOGD("Dump model name is %s", dump_model_name.c_str()); + GELOGI("Dump model name is %s", dump_model_name.c_str()); op_mapping_info.set_model_name(dump_model_name); } return SUCCESS; @@ -252,7 +256,7 @@ Status DumpOp::LaunchDumpOp() { REPORT_INNER_ERROR("E19999","Check device_id %d failed", device_id); return ACL_ERROR_GE_INTERNAL_ERROR; } - aicpu::dump::OpMappingInfo op_mapping_info; + toolkit::aicpu::dump::OpMappingInfo op_mapping_info; auto dump_path = dump_properties_.GetDumpPath() + std::to_string(device_id) + "/"; op_mapping_info.set_dump_path(dump_path); op_mapping_info.set_flag(kAicpuLoadFlag); @@ -271,7 +275,7 @@ Status DumpOp::LaunchDumpOp() { if (rt_ret != RT_ERROR_NONE) { GELOGW("call rtGetTaskIdAndStreamID failed, ret = 0x%X", rt_ret); } - aicpu::dump::Task task; + toolkit::aicpu::dump::Task task; task.set_task_id(task_id); task.set_stream_id(stream_id); task.mutable_op()->set_op_name(op_desc_->GetName()); diff --git a/ge/common/dump/dump_op.h b/ge/common/dump/dump_op.h index 4d322bee..b664495a 100755 --- a/ge/common/dump/dump_op.h +++ b/ge/common/dump/dump_op.h @@ -21,7 +21,7 @@ #include "common/ge_inner_error_codes.h" #include "common/properties_manager.h" -#include "proto/op_mapping_info.pb.h" +#include "proto/op_mapping.pb.h" #include "runtime/stream.h" namespace ge { @@ -37,10 +37,10 @@ class DumpOp { void SetDynamicModelInfo(const string &dynamic_model_name, const string &dynamic_om_name, uint32_t dynamic_model_id); private: - Status ExecutorDumpOp(aicpu::dump::OpMappingInfo &op_mapping_info); - Status DumpOutput(aicpu::dump::Task &task); - Status DumpInput(aicpu::dump::Task &task); - Status SetDumpModelName(aicpu::dump::OpMappingInfo &op_mapping_info); + Status ExecutorDumpOp(toolkit::aicpu::dump::OpMappingInfo &op_mapping_info); + Status DumpOutput(toolkit::aicpu::dump::Task &task); + Status DumpInput(toolkit::aicpu::dump::Task &task); + Status SetDumpModelName(toolkit::aicpu::dump::OpMappingInfo 
&op_mapping_info); DumpProperties dump_properties_; OpDescPtr op_desc_; diff --git a/ge/common/dump/exception_dumper.cc b/ge/common/dump/exception_dumper.cc new file mode 100644 index 00000000..c8ec3d35 --- /dev/null +++ b/ge/common/dump/exception_dumper.cc @@ -0,0 +1,241 @@ +/** + * Copyright 2019-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common/dump/exception_dumper.h" + +#include "common/ge/datatype_util.h" +#include "common/debug/memory_dumper.h" +#include "framework/common/debug/log.h" +#include "graph/manager/util/debug.h" +#include "graph/utils/tensor_utils.h" +#include "graph/load/model_manager/model_utils.h" +#include "proto/dump_task.pb.h" + +namespace { +static uint64_t GetNowTime() { + uint64_t ret = 0; + mmTimeval tv; + if (mmGetTimeOfDay(&tv, nullptr) == 0) { + ret = tv.tv_sec * 1000000ULL + tv.tv_usec; + } + + return ret; +} + +static void ReplaceStringElem(std::string &str) { + for_each(str.begin(), str.end(), [](char &ch) { + if ((ch == ' ') || (ch == '.') || (ch == '/') || (ch == '\\')) { + ch = '_'; + } + }); +} + +static void SetDumpData(const ge::OpDescInfo &op_desc_info, toolkit::dump::DumpData &dump_data) { + dump_data.set_version("2.0"); + dump_data.set_dump_time(GetNowTime()); + dump_data.set_op_name(op_desc_info.op_name); + for (size_t i = 0; i < op_desc_info.input_format.size(); ++i) { + toolkit::dump::OpInput input; + input.set_data_type(toolkit::dump::OutputDataType( + ge::DataTypeUtil::GetIrDataType(op_desc_info.input_data_type[i]))); + input.set_format(toolkit::dump::OutputFormat(op_desc_info.input_format[i])); + for (auto dim : op_desc_info.input_shape[i]) { + input.mutable_shape()->add_dim(dim); + } + input.set_size(op_desc_info.input_size[i]); + GELOGI("[Set][DumpData] The input size int exception is %ld", op_desc_info.input_size[i]); + dump_data.mutable_input()->Add(std::move(input)); + } + + for (size_t j = 0; j < op_desc_info.output_format.size(); ++j) { + toolkit::dump::OpOutput output; + output.set_data_type(toolkit::dump::OutputDataType( + ge::DataTypeUtil::GetIrDataType(op_desc_info.output_data_type[j]))); + output.set_format(toolkit::dump::OutputFormat(op_desc_info.output_format[j])); + for (auto dim : op_desc_info.output_shape[j]) { + output.mutable_shape()->add_dim(dim); + } + output.set_size(op_desc_info.output_size[j]); + GELOGI("[Set][DumpData] The output size int exception is %ld", op_desc_info.output_size[j]); + dump_data.mutable_output()->Add(std::move(output)); + } +} +} // namespace + +namespace ge { +ExceptionDumper::~ExceptionDumper() {} + +void ExceptionDumper::SaveDumpOpInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id, + vector &input_addrs, vector &output_addrs) { + OpDescInfo op_desc_info; + SaveOpDescInfo(op, task_id, stream_id, op_desc_info); + op_desc_info.input_addrs = input_addrs; + op_desc_info.output_addrs = output_addrs; + op_desc_info_.emplace_back(std::move(op_desc_info)); +} + +void ExceptionDumper::SaveDumpOpInfo(const RuntimeParam 
&model_param, const OpDescPtr &op, + uint32_t task_id, uint32_t stream_id) { + OpDescInfo op_desc_info; + SaveOpDescInfo(op, task_id, stream_id, op_desc_info); + op_desc_info.input_addrs = ModelUtils::GetInputDataAddrs(model_param, op); + op_desc_info.output_addrs = ModelUtils::GetOutputDataAddrs(model_param, op); + op_desc_info_.emplace_back(std::move(op_desc_info)); +} + +void ExceptionDumper::SaveOpDescInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id, + OpDescInfo &op_desc_info) { + if (op == nullptr) { + GELOGW("[Save][OpExceptionInfo] op desc ptr is null."); + return; + } + GELOGD("[Save][OpExceptionInfo] Start to save dump op [%s] info of task_id: %u, stream_id: %u", + op->GetName().c_str(), task_id, stream_id); + op_desc_info.op_name = op->GetName(); + op_desc_info.op_type = op->GetType(); + op_desc_info.task_id = task_id; + op_desc_info.stream_id = stream_id; + for (size_t i = 0; i < op->GetAllInputsSize(); ++i) { + GeTensorDescPtr input_tensor_desc = op->MutableInputDesc(i); + if (input_tensor_desc == nullptr) { + continue; + } + op_desc_info.input_format.emplace_back(input_tensor_desc->GetFormat()); + op_desc_info.input_shape.emplace_back(input_tensor_desc->GetShape().GetDims()); + op_desc_info.input_data_type.emplace_back(input_tensor_desc->GetDataType()); + int64_t input_size = 0; + + if (TensorUtils::GetTensorSizeInBytes(*input_tensor_desc, input_size) != SUCCESS) { + GELOGW("[Save][OpExceptionInfo] Op [%s] get input size failed.", op->GetName().c_str()); + return; + } + GELOGD("[Save][OpExceptionInfo] Save dump op info, the input size is %ld", input_size); + op_desc_info.input_size.emplace_back(input_size); + } + for (size_t j = 0; j < op->GetOutputsSize(); ++j) { + GeTensorDescPtr output_tensor_desc = op->MutableOutputDesc(j); + if (output_tensor_desc == nullptr) { + continue; + } + op_desc_info.output_format.emplace_back(output_tensor_desc->GetFormat()); + op_desc_info.output_shape.emplace_back(output_tensor_desc->GetShape().GetDims()); + op_desc_info.output_data_type.emplace_back(output_tensor_desc->GetDataType()); + int64_t output_size = 0; + if (TensorUtils::GetTensorSizeInBytes(*output_tensor_desc, output_size) != SUCCESS) { + GELOGW("[Save][OpExceptionInfo] Op [%s] get output size failed.", op->GetName().c_str()); + return; + } + GELOGD("[Save][OpExceptionInfo] Save dump op info, the output size is %ld.", output_size); + op_desc_info.output_size.emplace_back(output_size); + } +} + +Status ExceptionDumper::DumpExceptionInfo(const std::vector &exception_infos) const { + GELOGI("[Dump][Exception] Start to dump exception info"); + for (const rtExceptionInfo &iter : exception_infos) { + OpDescInfo op_desc_info; + if (GetOpDescInfo(iter.streamid, iter.taskid, op_desc_info)) { + toolkit::dump::DumpData dump_data; + SetDumpData(op_desc_info, dump_data); + uint64_t now_time = GetNowTime(); + std::string op_name = op_desc_info.op_name; + std::string op_type = op_desc_info.op_type; + ReplaceStringElem(op_name); + ReplaceStringElem(op_type); + string dump_file_path = + "./" + op_type + "." + op_name + "." + std::to_string(op_desc_info.task_id) + "." 
+ std::to_string(now_time); + GELOGI("[Dump][Exception] The exception dump file path is %s", dump_file_path.c_str()); + + uint64_t proto_size = dump_data.ByteSizeLong(); + std::unique_ptr proto_msg(new (std::nothrow) char[proto_size]); + bool ret = dump_data.SerializeToArray(proto_msg.get(), proto_size); + if (!ret || proto_size == 0) { + REPORT_INNER_ERROR("E19999", "Serialize proto to string fail"); + GELOGE(PARAM_INVALID, "[Dump][Exception] Dump data proto serialize failed"); + return PARAM_INVALID; + } + + GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), &proto_size, sizeof(uint64_t)), + "Failed to dump proto size"); + GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), proto_msg.get(), proto_size), + "Failed to dump proto msg"); + if (DumpExceptionInput(op_desc_info, dump_file_path) != SUCCESS) { + GELOGE(PARAM_INVALID, "[Dump][Exception] Dump exception input failed"); + return PARAM_INVALID; + } + + if (DumpExceptionOutput(op_desc_info, dump_file_path) != SUCCESS) { + GELOGE(PARAM_INVALID, "[Dump][Exception] Dump exception output failed"); + return PARAM_INVALID; + } + GELOGI("[Dump][Exception] Dump exception info SUCCESS"); + } else { + GELOGE(PARAM_INVALID, "[Dump][Exception] Get op desc info failed,task id:%u,stream id:%u", + iter.taskid, iter.streamid); + return PARAM_INVALID; + } + } + return SUCCESS; +} + +bool ExceptionDumper::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const { + GELOGI("[Get][OpDescInfo] There are %zu op need to dump.", op_desc_info_.size()); + for (size_t index = 0; index < op_desc_info_.size(); ++index) { + OpDescInfo dump_op_info = op_desc_info_.at(index); + if (dump_op_info.task_id == task_id && dump_op_info.stream_id == stream_id) { + GELOGI("[Get][OpDescInfo] Find exception op [%s] of task_id: %u, stream_id: %u.", + dump_op_info.op_name.c_str(), task_id, stream_id); + op_desc_info = dump_op_info; + return true; + } + } + return false; +} + +Status ExceptionDumper::DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file) const { + GELOGI("[Dump][ExceptionInput] Start to dump exception input"); + for (size_t i = 0; i < op_desc_info.input_addrs.size(); i++) { + if (Debug::DumpDevMem(dump_file.data(), op_desc_info.input_addrs.at(i), op_desc_info.input_size.at(i)) != SUCCESS) { + GELOGE(PARAM_INVALID, "[Dump][ExceptionInput] Dump the %zu input data of op [%s] failed", + i, op_desc_info.op_name.c_str()); + return PARAM_INVALID; + } + } + return SUCCESS; +} + +Status ExceptionDumper::DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file) const { + GELOGI("[Dump][ExceptionOutput] Start to dump exception output"); + for (size_t i = 0; i < op_desc_info.output_addrs.size(); i++) { + if (Debug::DumpDevMem(dump_file.data(), op_desc_info.output_addrs.at(i), op_desc_info.output_size.at(i)) != + SUCCESS) { + GELOGE(PARAM_INVALID, "[Dump][ExceptionOutput] Dump the %zu output data of op [%s] failed", + i, op_desc_info.op_name.c_str()); + return PARAM_INVALID; + } + } + return SUCCESS; +} + +OpDescInfo *ExceptionDumper::MutableOpDescInfo(uint32_t task_id, uint32_t stream_id) { + for (OpDescInfo &op_desc_info : op_desc_info_) { + if (op_desc_info.task_id == task_id && op_desc_info.stream_id == stream_id) { + return &op_desc_info; + } + } + return nullptr; +} +} // namespace ge \ No newline at end of file diff --git a/ge/common/dump/exception_dumper.h b/ge/common/dump/exception_dumper.h new file mode 100644 index 00000000..38a3f26e --- /dev/null +++ 
b/ge/common/dump/exception_dumper.h @@ -0,0 +1,48 @@ +/** + * Copyright 2019-2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_COMMON_DUMP_EXCEPTION_DUMPER_H_ +#define GE_COMMON_DUMP_EXCEPTION_DUMPER_H_ + +#include + +#include "graph/op_desc.h" +#include "framework/common/ge_types.h" +#include "graph/load/model_manager/task_info/task_info.h" + +namespace ge { +class ExceptionDumper { + public: + ExceptionDumper() = default; + ~ExceptionDumper(); + + void SaveDumpOpInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id, + std::vector &input_addrs, std::vector &output_addrs); + void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id); + Status DumpExceptionInfo(const std::vector &exception_infos) const; + bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const; + OpDescInfo *MutableOpDescInfo(uint32_t task_id, uint32_t stream_id); + + private: + void SaveOpDescInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id, OpDescInfo &op_desc_info); + Status DumpExceptionInput(const OpDescInfo &op_desc_info, const std::string &dump_file) const; + Status DumpExceptionOutput(const OpDescInfo &op_desc_info, const std::string &dump_file) const; + + std::vector op_desc_info_; +}; +} // namespace ge + +#endif // GE_COMMON_DUMP_EXCEPTION_DUMPER_H_ diff --git a/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nhwc.cc b/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nhwc.cc index 6066c250..e9e41cd1 100755 --- a/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nhwc.cc +++ b/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nhwc.cc @@ -66,7 +66,7 @@ Status CheckArgsForNc1hwc0ToNhwc(const TransArgs &args) { if (c0 <= 0) { GELOGE(ACL_ERROR_GE_DATATYPE_INVALID, "[Get][Cube]Failed, the data type %s is invalid", TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); - REPORT_CALL_ERROR("E19999", "Failed to get cube size, the data type %s is invalid", + REPORT_CALL_ERROR("E19999", "Failed to get cube size, the data type %s is invalid", TypeUtils::DataTypeToSerialString(args.src_data_type).c_str()); return ACL_ERROR_GE_DATATYPE_INVALID; } @@ -175,7 +175,8 @@ Status FormatTransferNc1hwc0Nhwc::TransFormat(const TransArgs &args, TransResult ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str()); return ACL_ERROR_GE_SHAPE_INVALID; } - GELOGD("Begin to trans format from NC1HWC0 to NCHW, src shape %s, data type %s, dst shape %s, memory size %ld", + GELOGD("[Trans][Format]Begin to trans format from NC1HWC0 to NCHW, " + "src shape %s, data type %s, dst shape %s, memory size %ld", ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(), ShapeToString(args.dst_shape).c_str(), total_size); diff --git a/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc b/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc index 
aa3b4c7b..5efe486c 100644 --- a/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc +++ b/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc @@ -189,9 +189,9 @@ Status TransFormatFromNchwToFzC04(const TransArgs &args, TransResult &result) { ret = memcpy_s(p_d + k * stride, protectSize, p_s + k * block, block); if (ret != EOK) { GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Set][Memcpy]Failed, block %zu, stride %zu, " - "protect_size %ld, error_code %d", block, stride, protectSize, ret); + "protect_size %ld, error_code %d", block, stride, protectSize, ret); REPORT_CALL_ERROR("E19999", "[Set][Memcpy]Failed, block %zu, stride %zu, " - "protect_size %ld, error_code %d", block, stride, protectSize, ret); + "protect_size %ld, error_code %d", block, stride, protectSize, ret); return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; } protectSize = protectSize - block; @@ -304,8 +304,8 @@ Status PaddingNC(const TransArgs &args, TransArgs &args_tmp, std::shared_ptr(&PluginManager::GetPath), &dl_info) != EN_OK) { - GELOGW("Failed to read the shared library file path!"); + const char *error = mmDlerror(); + GE_IF_BOOL_EXEC(error == nullptr, error = ""); + GELOGW("Failed to read the shared library file path! errmsg:%s", error); return string(); } else { GE_IF_BOOL_EXEC(dl_info.dli_fname == nullptr, return string()); @@ -61,7 +63,7 @@ string PluginManager::GetPath() { return string(); } if (mmRealPath(so_path.c_str(), path, MMPA_MAX_PATH) != EN_OK) { - GELOGW("Failed to get realpath of %s", so_path.c_str()); + GELOGW("Failed to get realpath of %s, errmsg:%s", so_path.c_str(), strerror(errno)); return string(); } @@ -137,18 +139,24 @@ Status PluginManager::LoadSo(const string &path, const vector &func_chec for (const auto &func_name : func_check_list) { auto real_fn = (void (*)())mmDlsym(handle, const_cast(func_name.c_str())); if (real_fn == nullptr) { + const char *error = mmDlerror(); + GE_IF_BOOL_EXEC(error == nullptr, error = ""); ErrorManager::GetInstance().ATCReportErrMessage("E19012", {"function", "reason"}, {"mmDlsym", FmtToStr(func_name) + " is skipped since function" + FmtToStr(func_name) + " is not existed!"}); GELOGE(ACL_ERROR_GE_PLGMGR_PATH_INVALID, - "[Check][So]%s is skipped since function %s is not existed!", - func_name.c_str(), func_name.c_str()); + "[Check][So]%s is skipped since function %s is not existed! errmsg:%s", + func_name.c_str(), func_name.c_str(), error); is_valid = false; break; } } if (!is_valid) { - GE_LOGE_IF(mmDlclose(handle), "[DLClose][Handle]Failed."); + if (mmDlclose(handle) != 0) { + const char *error = mmDlerror(); + GE_IF_BOOL_EXEC(error == nullptr, error = ""); + GELOGE(FAILED, "[DLClose][Handle]Failed. 
errmsg:%s", error); + } continue; } @@ -212,21 +220,21 @@ Status PluginManager::Load(const string &path, const vector &func_check_ GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(path.length() >= MMPA_MAX_PATH, GELOGW("File path is too long!"); return FAILED, "File path is too long!"); if (mmRealPath(path.c_str(), canonical_path, MMPA_MAX_PATH) != EN_OK) { - GELOGW("Failed to get realpath of %s", path.c_str()); + GELOGW("Failed to get realpath of %s, errmsg:%s", path.c_str(), strerror(errno)); return SUCCESS; } INT32 is_dir = mmIsDir(canonical_path); // Lib plugin path not exist if (is_dir != EN_OK) { - GELOGW("Invalid path for load: %s", path.c_str()); + GELOGW("Invalid path for load: %s, errmsg:%s", path.c_str(), strerror(errno)); return SUCCESS; } mmDirent **entries = nullptr; auto ret = mmScandir(canonical_path, &entries, nullptr, nullptr); if (ret < EN_OK) { - GELOGW("scan dir failed. path = %s, ret = %d", canonical_path, ret); + GELOGW("scan dir failed. path = %s, ret = %d, errmsg = %s", canonical_path, ret, strerror(errno)); return FAILED; } for (int i = 0; i < ret; ++i) { @@ -283,13 +291,20 @@ Status PluginManager::Load(const string &path, const vector &func_check_ for (const auto &func_name : func_check_list) { auto real_fn = (void (*)())mmDlsym(handle, const_cast(func_name.c_str())); if (real_fn == nullptr) { - GELOGW("The %s is skipped since function %s is not existed!", file_name.c_str(), func_name.c_str()); + const char *error = mmDlerror(); + GE_IF_BOOL_EXEC(error == nullptr, error = ""); + GELOGW("The %s is skipped since function %s is not existed! errmsg:%s", + file_name.c_str(), func_name.c_str(), error); is_valid = false; break; } } if (!is_valid) { - GE_LOGE_IF(mmDlclose(handle), "Failed to dlclose."); + if (mmDlclose(handle) != 0) { + const char *error = mmDlerror(); + GE_IF_BOOL_EXEC(error == nullptr, error = ""); + GELOGE(FAILED, "[DLClose][Handle]Failed. errmsg:%s", error); + } continue; } diff --git a/ge/common/ge/plugin_manager.h b/ge/common/ge/plugin_manager.h index 7ef0f81a..8c351e62 100755 --- a/ge/common/ge/plugin_manager.h +++ b/ge/common/ge/plugin_manager.h @@ -59,7 +59,11 @@ class PluginManager { for (const auto &handle : handles_) { auto real_fn = (R(*)(Types...))mmDlsym(handle.second, const_cast(func_name.c_str())); if (real_fn == nullptr) { - GELOGW("Failed to get function %s in %s!", func_name.c_str(), handle.first.c_str()); + const char *error = mmDlerror(); + if (error == nullptr) { + error = ""; + } + GELOGW("Failed to get function %s in %s! errmsg:%s", func_name.c_str(), handle.first.c_str(), error); return GE_PLGMGR_FUNC_NOT_EXIST; } else { funcs[handle.first] = real_fn; @@ -74,7 +78,11 @@ class PluginManager { // If the funcName is existed, signature of realFn can be casted to any type auto real_fn = (void (*)(Types...))mmDlsym(handle.second, const_cast(func_name.c_str())); if (real_fn == nullptr) { - GELOGW("Failed to invoke function %s in %s!", func_name.c_str(), handle.first.c_str()); + const char *error = mmDlerror(); + if (error == nullptr) { + error = ""; + } + GELOGW("Failed to invoke function %s in %s! 
errmsg:%s", func_name.c_str(), handle.first.c_str(), error); return GE_PLGMGR_INVOKE_FAILED; } else { real_fn(args...); @@ -89,7 +97,11 @@ class PluginManager { // If the funcName is existed, signature of realFn can be casted to any type auto real_fn = (void (*)(T))mmDlsym(handle.second, const_cast(func_name.c_str())); if (real_fn == nullptr) { - GELOGW("Failed to invoke function %s in %s!", func_name.c_str(), handle.first.c_str()); + const char *error = mmDlerror(); + if (error == nullptr) { + error = ""; + } + GELOGW("Failed to invoke function %s in %s! errmsg:%s", func_name.c_str(), handle.first.c_str(), error); return GE_PLGMGR_INVOKE_FAILED; } typename std::remove_reference::type arg_temp; @@ -114,7 +126,11 @@ class PluginManager { // If the funcName is existed, signature of realFn can be casted to any type auto real_fn = (T2(*)(T1))mmDlsym(handle.second, const_cast(func_name.c_str())); if (real_fn == nullptr) { - GELOGW("Failed to invoke function %s in %s!", func_name.c_str(), handle.first.c_str()); + const char *error = mmDlerror(); + if (error == nullptr) { + error = ""; + } + GELOGW("Failed to invoke function %s in %s! errmsg:%s", func_name.c_str(), handle.first.c_str(), error); return GE_PLGMGR_INVOKE_FAILED; } else { T2 res = real_fn(arg); @@ -132,7 +148,11 @@ class PluginManager { // If the funcName is existed, signature of realFn can be casted to any type auto real_fn = (T(*)())mmDlsym(handle.second, const_cast(func_name.c_str())); if (real_fn == nullptr) { - GELOGW("Failed to invoke function %s in %s!", func_name.c_str(), handle.first.c_str()); + const char *error = mmDlerror(); + if (error == nullptr) { + error = ""; + } + GELOGW("Failed to invoke function %s in %s! errmsg:%s", func_name.c_str(), handle.first.c_str(), error); return GE_PLGMGR_INVOKE_FAILED; } else { T res = real_fn(); diff --git a/ge/common/ge/tbe_plugin_manager.cc b/ge/common/ge/tbe_plugin_manager.cc index 0cc7d553..94ba8a9a 100755 --- a/ge/common/ge/tbe_plugin_manager.cc +++ b/ge/common/ge/tbe_plugin_manager.cc @@ -69,7 +69,9 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status TBEPluginManager::Finali string TBEPluginManager::GetPath() { mmDlInfo dl_info; if (mmDladdr(reinterpret_cast(&TBEPluginManager::GetPath), &dl_info) != EN_OK) { - GELOGW("Failed to read so path!"); + const char *error = mmDlerror(); + GE_IF_BOOL_EXEC(error == nullptr, error = ""); + GELOGW("Failed to read so path! errmsg:%s", error); return string(); } else { string so_path = dl_info.dli_fname; @@ -79,7 +81,7 @@ string TBEPluginManager::GetPath() { return string(); } if (mmRealPath(so_path.c_str(), path, MMPA_MAX_PATH) != EN_OK) { - GELOGW("Failed to get realpath of %s", so_path.c_str()); + GELOGW("Failed to get realpath of %s, errmsg:%s", so_path.c_str(), strerror(errno)); return string(); } @@ -113,14 +115,14 @@ void TBEPluginManager::FindParserSo(const string &path, vector &file_lis INT32 is_dir = mmIsDir(real_path.c_str()); // Lib plugin path not exist if (is_dir != EN_OK) { - GELOGW("%s is not a dir.", real_path.c_str()); + GELOGW("%s is not a dir. errmsg:%s", real_path.c_str(), strerror(errno)); return; } mmDirent **entries = nullptr; auto ret = mmScandir(real_path.c_str(), &entries, nullptr, nullptr); if (ret < EN_OK) { - GELOGW("scan dir failed. path = %s, ret = %d", real_path.c_str(), ret); + GELOGW("scan dir failed. 
path = %s, ret = %d, errmsg = %s", real_path.c_str(), ret, strerror(errno)); return; } for (int i = 0; i < ret; ++i) { diff --git a/ge/common/helper/model_cache_helper.cc b/ge/common/helper/model_cache_helper.cc index 41ad6d59..78ca697f 100755 --- a/ge/common/helper/model_cache_helper.cc +++ b/ge/common/helper/model_cache_helper.cc @@ -441,11 +441,11 @@ Status ModelCacheHelper::SaveJsonToFile(const string &file_name, const Json &jso const int FILE_AUTHORITY = 0600; int fd = mmOpen2(path.c_str(), M_WRONLY | M_CREAT | O_TRUNC, FILE_AUTHORITY); if (fd < 0) { - GELOGW("Fail to open the file: %s.", path.c_str()); + GELOGW("Fail to open the file:%s. errmsg:%s", path.c_str(), strerror(errno)); return INTERNAL_ERROR; } if (mmClose(fd) != 0) { - GELOGW("Fail to close the file: %s.", path.c_str()); + GELOGW("Fail to close the file:%s. errmsg:%s", path.c_str(), strerror(errno)); return INTERNAL_ERROR; } diff --git a/ge/common/helper/model_helper.cc b/ge/common/helper/model_helper.cc index b62462b2..ac3343ed 100644 --- a/ge/common/helper/model_helper.cc +++ b/ge/common/helper/model_helper.cc @@ -697,6 +697,7 @@ Status ModelHelper::GenerateGeRootModel(OmFileLoadHelper &om_load_helper) { is_first_model = false; root_model_->SetRootGraph(GraphUtils::GetComputeGraph(cur_model->GetGraph())); root_model_->SetModelId(cur_model->GetModelId()); + root_model_->SetModelName(cur_model->GetName()); model_ = cur_model; continue; } diff --git a/ge/common/model_saver.cc b/ge/common/model_saver.cc index a17977b2..ff601c7a 100755 --- a/ge/common/model_saver.cc +++ b/ge/common/model_saver.cc @@ -61,7 +61,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelSaver::SaveJsonToFi int32_t fd = mmOpen2(real_path, M_RDWR | M_CREAT | O_TRUNC, mode); if (fd == EN_ERROR || fd == EN_INVALID_PARAM) { ErrorManager::GetInstance().ATCReportErrMessage("E19001", {"file", "errmsg"}, {file_path, strerror(errno)}); - GELOGE(FAILED, "[Open][File]Failed, file %s, error %s", file_path, strerror(errno)); + GELOGE(FAILED, "[Open][File]Failed, file %s, errmsg:%s", file_path, strerror(errno)); return FAILED; } const char *model_char = model_str.c_str(); @@ -72,13 +72,13 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ModelSaver::SaveJsonToFi ErrorManager::GetInstance().ATCReportErrMessage( "E19004", {"file", "errmsg"}, {file_path, strerror(errno)}); // Need to both print the error info of mmWrite and mmClose, so return ret after mmClose - GELOGE(FAILED, "[Write][Data]Write to file failed. errno = %ld, %s", mmpa_ret, strerror(errno)); + GELOGE(FAILED, "[Write][Data]To file %s failed. 
errno %ld, errmsg %s", ,file_path, mmpa_ret, strerror(errno)); ret = FAILED; } // Close file if (mmClose(fd) != EN_OK) { - GELOGE(FAILED, "[Close][File]Failed, file %s", file_path); - REPORT_CALL_ERROR("E19999", "Close file %s failed", file_path); + REPORT_CALL_ERROR("E19999", "Close file %s failed, errmsg %s", file_path, strerror(errno)); + GELOGE(FAILED, "Close file failed, file %s, errmsg %s", file_path, strerror(errno)); ret = FAILED; } return ret; diff --git a/ge/common/profiling/profiling_manager.cc b/ge/common/profiling/profiling_manager.cc index 51df1a43..521f8cc8 100644 --- a/ge/common/profiling/profiling_manager.cc +++ b/ge/common/profiling/profiling_manager.cc @@ -383,6 +383,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::ReportDa reporter_data.deviceId = device_id; ret = memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN + 1, tag_name.c_str(), tag_name.size()); GE_IF_BOOL_EXEC(ret != EOK, GELOGE(ret, "Report data tag [%s] memcpy error!", tag_name.c_str()); return;); + std::lock_guard lock(mutex_report_); for (size_t i = 0; i < index; ++i) { reporter_data.data = (unsigned char *)data.c_str() + report_max_len * i; reporter_data.dataLen = report_max_len; @@ -403,7 +404,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::ReportDa reporter_data.dataLen = data.size(); ret = memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN + 1, tag_name.c_str(), tag_name.size()); GE_IF_BOOL_EXEC(ret != EOK, GELOGE(ret, "Report data tag [%s] memcpy error!", tag_name.c_str()); return;); - + std::lock_guard lock(mutex_report_); cb_ret = CallMsprofReport(reporter_data); GE_IF_BOOL_EXEC(cb_ret != 0, GELOGE(cb_ret, "Reporter data [%s] failed, ret:%d", tag_name.c_str(), cb_ret); return;); diff --git a/ge/common/profiling/profiling_manager.h b/ge/common/profiling/profiling_manager.h index ab344204..af9fce06 100755 --- a/ge/common/profiling/profiling_manager.h +++ b/ge/common/profiling/profiling_manager.h @@ -118,6 +118,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager { map subs_dev_module_; // key: device_id, value: profiling on module uint32_t subscribe_count_; std::mutex mutex_; + std::mutex mutex_report_; MsprofCallback prof_cb_; std::string fp_point_; std::string bp_point_; diff --git a/ge/common/proto/ge_ir.proto b/ge/common/proto/ge_ir.proto index 12989a54..c0ef3071 100644 --- a/ge/common/proto/ge_ir.proto +++ b/ge/common/proto/ge_ir.proto @@ -31,6 +31,8 @@ enum DataType DT_STRING_REF = 24; // string_ref type DT_DUAL = 25; /**< dual output type */ DT_VARIANT = 26; // variant type + DT_BF16 = 27; // bf16 type + DT_INT4 = 28; // int4 type } message AttrDef diff --git a/ge/common/proto/op_mapping_info.proto b/ge/common/proto/op_mapping.proto similarity index 97% rename from ge/common/proto/op_mapping_info.proto rename to ge/common/proto/op_mapping.proto index 7fb6f84b..d626eb49 100644 --- a/ge/common/proto/op_mapping_info.proto +++ b/ge/common/proto/op_mapping.proto @@ -1,5 +1,5 @@ syntax = "proto3"; -package aicpu.dump; +package toolkit.aicpu.dump; message Shape { repeated uint64 dim = 1; diff --git a/ge/common/util.cc b/ge/common/util.cc index 0a343a83..63d75de1 100644 --- a/ge/common/util.cc +++ b/ge/common/util.cc @@ -122,7 +122,7 @@ long GetFileLength(const std::string &input_file) { GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( mmGetFileSize(input_file.c_str(), &file_length) != EN_OK, ErrorManager::GetInstance().ATCReportErrMessage("E19001", {"file", "errmsg"}, {input_file, strerror(errno)}); - return 
kFileSizeOutLimitedOrOpenFailed, "Open file[%s] failed. %s", input_file.c_str(), strerror(errno)); + return kFileSizeOutLimitedOrOpenFailed, "Open file[%s] failed. errmsg:%s", input_file.c_str(), strerror(errno)); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG((file_length == 0), ErrorManager::GetInstance().ATCReportErrMessage("E19015", {"filepath"}, {input_file}); @@ -226,7 +226,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int CreateDirectory(const std:: if (ret != 0) { if (errno != EEXIST) { ErrorManager::GetInstance().ATCReportErrMessage("E19006", {"path"}, {directory_path}); - GELOGW("Can not create directory %s. Make sure the directory exists and writable.", directory_path.c_str()); + GELOGW("Can not create directory %s. Make sure the directory exists and writable. errmsg:%s", + directory_path.c_str(), strerror(errno)); return ret; } } @@ -237,7 +238,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int CreateDirectory(const std:: if (ret != 0) { if (errno != EEXIST) { ErrorManager::GetInstance().ATCReportErrMessage("E19006", {"path"}, {directory_path}); - GELOGW("Can not create directory %s. Make sure the directory exists and writable.", directory_path.c_str()); + GELOGW("Can not create directory %s. Make sure the directory exists and writable. errmsg:%s", + directory_path.c_str(), strerror(errno)); return ret; } } @@ -310,7 +312,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadProtoFromMem(const cha FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint64_t GetCurrentTimestamp() { mmTimeval tv{}; int ret = mmGetTimeOfDay(&tv, nullptr); - GE_LOGE_IF(ret != EN_OK, "Func gettimeofday may failed: ret=%d", ret); + GE_LOGE_IF(ret != EN_OK, "Func gettimeofday may failed, ret:%d, errmsg:%s", ret, strerror(errno)); auto total_use_time = tv.tv_usec + tv.tv_sec * 1000000; // 1000000: seconds to microseconds return static_cast(total_use_time); } @@ -318,7 +320,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint64_t GetCurrentTimestamp() FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint32_t GetCurrentSecondTimestap() { mmTimeval tv{}; int ret = mmGetTimeOfDay(&tv, nullptr); - GE_LOGE_IF(ret != EN_OK, "Func gettimeofday may failed: ret=%d", ret); + GE_LOGE_IF(ret != EN_OK, "Func gettimeofday may failed, ret:%d, errmsg:%s", ret, strerror(errno)); auto total_use_time = tv.tv_sec; // seconds return static_cast(total_use_time); } @@ -568,7 +570,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status CheckPath(const char *pa INT32 is_dir = mmIsDir(path); if (is_dir != EN_OK) { - GELOGE(PATH_INVALID, "Open directory %s failed, maybe it is not exit or not a dir", path); + GELOGE(PATH_INVALID, "Open directory %s failed, maybe it is not exit or not a dir. 
errmsg:%s", + path, strerror(errno)); return PATH_INVALID; } diff --git a/ge/engine_manager/dnnengine_manager.cc b/ge/engine_manager/dnnengine_manager.cc index 7ff5ed42..2bd9b3e5 100644 --- a/ge/engine_manager/dnnengine_manager.cc +++ b/ge/engine_manager/dnnengine_manager.cc @@ -435,7 +435,7 @@ Status DNNEngineManager::ReadJsonFile(const std::string &file_path, JsonHandle h const char *file = file_path.data(); if ((mmAccess2(file, M_F_OK)) != EN_OK) { if (engines_map_.size() != 0) { - GELOGE(FAILED, "The json file %s is not exist, %s", file_path.c_str(), strerror(errno)); + GELOGE(FAILED, "The json file %s is not exist, errmsg:%s", file_path.c_str(), strerror(errno)); return FAILED; } else { GELOGW("The json file %s is not needed.", file_path.c_str()); diff --git a/ge/executor/CMakeLists.txt b/ge/executor/CMakeLists.txt index 89fce8a0..1782d497 100644 --- a/ge/executor/CMakeLists.txt +++ b/ge/executor/CMakeLists.txt @@ -3,7 +3,7 @@ set(PROTO_LIST "${METADEF_DIR}/proto/ge_ir.proto" "${METADEF_DIR}/proto/insert_op.proto" "${METADEF_DIR}/proto/task.proto" - "${METADEF_DIR}/proto/op_mapping_info.proto" + "${METADEF_DIR}/proto/op_mapping.proto" "${METADEF_DIR}/proto/dump_task.proto" ) @@ -16,13 +16,13 @@ set(SRC_LIST "../common/ge/plugin_manager.cc" "../common/ge/op_tiling_manager.cc" "../common/dump/dump_properties.cc" + "../common/dump/exception_dumper.cc" "../common/dump/dump_manager.cc" "../common/dump/dump_op.cc" "../common/dump/opdebug_register.cc" "../common/profiling/ge_profiling.cc" "../graph/load/graph_loader.cc" "../graph/execute/graph_execute.cc" - "../omm/csa_interact.cc" "../graph/manager/graph_manager_utils.cc" "../graph/manager/graph_var_manager.cc" "../graph/manager/graph_mem_allocator.cc" @@ -249,12 +249,11 @@ target_include_directories(ge_executor_shared PRIVATE target_link_options(ge_executor_shared PRIVATE -Wl,-Bsymbolic + -Wl,--exclude-libs,ALL ) target_link_libraries(ge_executor_shared PRIVATE $ - msprofiler - static_mmpa -Wl,--no-as-needed ge_common runtime diff --git a/ge/executor/module.mk b/ge/executor/module.mk index 4966eeb5..7a7e2b51 100644 --- a/ge/executor/module.mk +++ b/ge/executor/module.mk @@ -11,7 +11,6 @@ local_ge_executor_src_files := \ ../common/profiling/ge_profiling.cc \ ../graph/load/graph_loader.cc \ ../graph/execute/graph_execute.cc \ - ../omm/csa_interact.cc \ ../graph/manager/graph_manager_utils.cc \ ../graph/manager/graph_var_manager.cc \ ../graph/manager/rdma_pool_allocator.cc \ diff --git a/ge/executor/proto/dump_task.proto b/ge/executor/proto/dump_task.proto index ee1c6f47..a2411ddb 100644 --- a/ge/executor/proto/dump_task.proto +++ b/ge/executor/proto/dump_task.proto @@ -1,5 +1,5 @@ syntax = "proto3"; -package toolkit.dumpdata; +package toolkit.dump; enum OutputDataType { DT_UNDEFINED = 0; diff --git a/ge/executor/proto/ge_ir.proto b/ge/executor/proto/ge_ir.proto index 12989a54..c0ef3071 100644 --- a/ge/executor/proto/ge_ir.proto +++ b/ge/executor/proto/ge_ir.proto @@ -31,6 +31,8 @@ enum DataType DT_STRING_REF = 24; // string_ref type DT_DUAL = 25; /**< dual output type */ DT_VARIANT = 26; // variant type + DT_BF16 = 27; // bf16 type + DT_INT4 = 28; // int4 type } message AttrDef diff --git a/ge/executor/proto/op_mapping_info.proto b/ge/executor/proto/op_mapping.proto similarity index 97% rename from ge/executor/proto/op_mapping_info.proto rename to ge/executor/proto/op_mapping.proto index 7fb6f84b..d626eb49 100644 --- a/ge/executor/proto/op_mapping_info.proto +++ b/ge/executor/proto/op_mapping.proto @@ -1,5 +1,5 @@ syntax = "proto3"; -package 
aicpu.dump; +package toolkit.aicpu.dump; message Shape { repeated uint64 dim = 1; diff --git a/ge/ge_inference.mk b/ge/ge_inference.mk index 32fc206d..ae1288f5 100755 --- a/ge/ge_inference.mk +++ b/ge/ge_inference.mk @@ -4,7 +4,6 @@ COMMON_LOCAL_SRC_FILES := \ proto/fusion_model.proto \ proto/optimizer_priority.proto \ graph/manager/trans_var_data_utils.cc \ - omm/csa_interact.cc \ common/fp16_t.cc \ common/formats/utils/formats_trans_utils.cc \ common/formats/format_transfers/datatype_transfer.cc \ @@ -73,7 +72,7 @@ BUILER_SRC_FILES := \ ir_build/attr_options/utils.cc \ ir_build/attr_options/keep_dtype_option.cc \ ir_build/attr_options/weight_compress_option.cc \ - ir_build/atc_ir_common.cc \ + ir_build/option_utils.cc \ ANALYZER_SRC_FILES:= \ analyzer/analyzer.cc \ diff --git a/ge/ge_local_engine/engine/host_cpu_engine.cc b/ge/ge_local_engine/engine/host_cpu_engine.cc index 4aebffb4..8bc159dc 100755 --- a/ge/ge_local_engine/engine/host_cpu_engine.cc +++ b/ge/ge_local_engine/engine/host_cpu_engine.cc @@ -82,7 +82,9 @@ Status GetDataNumber(const GeTensorDesc &out_desc, uint64_t &data_num) { void HostCpuEngine::CloseSo() { for (auto handle : lib_handles_) { if (mmDlclose(handle) != 0) { - GELOGW("failed to close handle, message: %s", mmDlerror()); + const char *error = mmDlerror(); + error = (error == nullptr) ? "" : error; + GELOGW("failed to close handle, message: %s", error); } } lib_handles_.clear(); @@ -284,7 +286,7 @@ Status HostCpuEngine::ListSoFiles(const std::string &base_dir, std::vector &options, OmgContext &o return MEMALLOC_FAILED; } - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kOpsProtoInit); + ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOpsProtoInit); string opsproto_path; GetOpsProtoPath(opsproto_path); GELOGI("Get opsproto path is %s", opsproto_path.c_str()); @@ -426,7 +426,7 @@ Status GeGenerator::Initialize(const map &options, OmgContext &o } Status GeGenerator::Finalize() { - ErrorManager::GetInstance().SetStage(ErrorMessage::kFinalize, ErrorMessage::kFinalize); + ErrorManager::GetInstance().SetStage(error_message::kFinalize, error_message::kFinalize); GE_CHECK_NOTNULL_EXEC(impl_, return PARAM_INVALID); Status ret = impl_->graph_manager_.Finalize(); if (ret != SUCCESS) { @@ -438,14 +438,14 @@ Status GeGenerator::Finalize() { Status GeGenerator::GenerateOfflineModel(const Graph &graph, const string &file_name_prefix, const vector &inputs) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); GELOGI("Start to generate offline model."); ModelBufferData model; return GenerateModel(graph, file_name_prefix, inputs, model, true); } Status GeGenerator::GenerateOnlineModel(const Graph &graph, const vector &inputs, ModelBufferData &model) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); return GenerateModel(graph, "online", inputs, model, false); } @@ -783,9 +783,7 @@ Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &in GELOGD("Inputs size is %zu, outputs size is %zu.", inputs.size(), outputs.size()); GE_CHECK_NOTNULL_EXEC(impl_, return PARAM_INVALID); impl_->is_offline_ = is_offline; - if (!is_offline) { - (void)AttrUtils::SetBool(op_desc, ATTR_SINGLE_OP_SCENE, true); - } + (void)AttrUtils::SetBool(op_desc, 
ATTR_SINGLE_OP_SCENE, true); if (CheckForSingleOp(op_desc, inputs, outputs) != SUCCESS) { GELOGE(PARAM_INVALID, "input param is invalid when build single op!"); @@ -824,7 +822,7 @@ Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &in auto node = comp_graph->FindNode(op_desc->GetName()); Status ret = CheckEngineTypeSupport(node, engine_type); if (ret != SUCCESS) { - GELOGE(ret, "[Check][EngineType]value:%d for node:%s not support", engine_type, node->GetName().c_str()); + GELOGE(ret, "[Check][EngineType]not support node:%s with engine of %d.", node->GetName().c_str(), engine_type); return ret; } } @@ -850,6 +848,11 @@ Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &in bool all_shape = false; (void)AttrUtils::GetBool(op_desc, kAicpuAllshape, all_shape); + GELOGD("Node: %s, all_shape is %d, compile_flag is %d.", op_desc->GetName().c_str(), all_shape, compile_flag); + (void)AttrUtils::SetInt(ge_model, ATTR_NAME_BUILD_MODE, fuzz_compile_flag); + if (all_shape) { + (void)AttrUtils::SetBool(ge_model, kAicpuAllshape, all_shape); + } if (all_shape && CheckNoAicore(root_graph)) { GELOGD("Get aicpu all_shape kernel!"); vector inputs_dynamic; @@ -859,8 +862,6 @@ Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &in GE_CHK_STATUS_RET_NOLOG( impl_->SaveParams(ge_model, op_desc_tmp->GetType(), op_attrs, inputs_dynamic, outputs_dynamic)); } else if (fuzz_compile_flag) { - GELOGD("Get fuzz build result of %s.", op_desc->GetName().c_str()); - (void)AttrUtils::SetInt(ge_model, ATTR_NAME_BUILD_MODE, fuzz_compile_flag); GeAttrValue::LIST_NAMED_ATTRS fuzz_build_attrs; if (GetFuzzBuildAttrs(op_desc, ge_root_model, fuzz_build_attrs) != SUCCESS) { GELOGE(FAILED, "[Get][FuzzRet]Failed to get fuzz build result of %s.", op_desc->GetName().c_str()); @@ -892,7 +893,7 @@ Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector &in Status GeGenerator::BuildSingleOpModel(OpDescPtr &op_desc, const vector &inputs, const vector &outputs, const string &model_file_name, int32_t compile_flag) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); GELOGI("Start to build single op offline model, input size: %zu, output size: %zu", inputs.size(), outputs.size()); ModelBufferData model_buff; OpEngineType engine_type = ENGINE_SYS; @@ -916,7 +917,7 @@ Status GeGenerator::BuildSingleOpModel(OpDescPtr &op_desc, const vector &inputs, const vector &outputs, OpEngineType engine_type, ModelBufferData &model_buff) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); GELOGI("Start to build single op online, input size: %zu, output size: %zu", inputs.size(), outputs.size()); Status status = BuildSingleOp(op_desc, inputs, outputs, kFileNameSuffix, engine_type, model_buff, false); GELOGI("Finish build single online model, status: %u", status); @@ -926,7 +927,7 @@ Status GeGenerator::BuildSingleOpModel(OpDescPtr &op_desc, const vector &inputs, const vector &outputs, OpEngineType engine_type, int32_t compile_flag, ModelBufferData &model_buff) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); GELOGI("Start to build single op online, input size: %zu, output size: %zu", inputs.size(), 
outputs.size()); Status status = BuildSingleOp(op_desc, inputs, outputs, kFileNameSuffix, engine_type, model_buff, false, compile_flag); @@ -1072,7 +1073,7 @@ Status GeGenerator::Impl::BuildModel(const Graph &graph, const vector ret = graph_manager_.BuildGraph(graph_id, inputs, ge_root_model, session_id); } - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); if (ret != SUCCESS) { GELOGE(GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED, "GraphManager build graph fail, graph id: %u", graph_id); ret = GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED; diff --git a/ge/graph/build/graph_builder.cc b/ge/graph/build/graph_builder.cc index 97b7608c..bcd80b0c 100644 --- a/ge/graph/build/graph_builder.cc +++ b/ge/graph/build/graph_builder.cc @@ -233,7 +233,7 @@ Status GraphBuilder::BuildForKnownShapeGraph(ComputeGraphPtr &comp_graph, return SUCCESS; } - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kPreBuild); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kPreBuild); GELOGI("Begin to build known shape graph[%s].", comp_graph->GetName().c_str()); Status ret = SecondPartition(comp_graph); GE_CHK_STATUS_RET(ret, "Graph[%s] second partition Failed.", comp_graph->GetName().c_str()); @@ -264,7 +264,7 @@ Status GraphBuilder::BuildForKnownShapeGraph(ComputeGraphPtr &comp_graph, GE_TIMESTAMP_END(BuildModelForGetTask, "GraphBuilder::BuildModelForGetTask"); GE_DUMP(comp_graph, "AfterBuildModel"); - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kTaskGenerate); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kTaskGenerate); GE_TIMESTAMP_START(GetTaskInfo); ret = GetTaskInfo(builder, model_ptr, comp_graph, subgraph_map, session_id); GE_TIMESTAMP_END(GetTaskInfo, "GraphBuilder::GetTaskInfo"); @@ -274,7 +274,7 @@ Status GraphBuilder::BuildForKnownShapeGraph(ComputeGraphPtr &comp_graph, return ret; } - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); ge_model_ptr = MakeShared(); if (ge_model_ptr == nullptr) { return MEMALLOC_FAILED; @@ -336,7 +336,7 @@ Status GraphBuilder::SetConstantInputOffset(ComputeGraphPtr &comp_graph) { Status GraphBuilder::BuildForUnknownShapeGraph(ComputeGraphPtr &comp_graph, GeModelPtr &ge_model_ptr, uint64_t session_id) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kPreBuild); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kPreBuild); GELOGI("Begin to build unknown shape graph[%s].", comp_graph->GetName().c_str()); Graph2SubGraphInfoList subgraph_map; ge::ModelBuilder builder(session_id, comp_graph, subgraph_map, stream_max_parallel_num_, hcom_parallel_, build_mode_); @@ -369,11 +369,11 @@ Status GraphBuilder::BuildForUnknownShapeGraph(ComputeGraphPtr &comp_graph, GeMo GE_CHK_STATUS_RET(builder.BuildModelForGetDynShapeTask(*model_ptr), "Graph[%s] builder BuildModelForGetDynShapeTask() return fail.", comp_graph->GetName().c_str()); GE_TIMESTAMP_END(BuildModelForGetDynShapeTask, "GraphBuilder::BuildModelForGetDynShapeTask"); - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kTaskGenerate); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kTaskGenerate); 
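The graph_builder.cc hunks around this point bracket each compile phase with SetStage(error_message::kModelCompile, error_message::kTaskGenerate) and then reset to error_message::kOther once the phase returns. Below is a minimal, self-contained sketch of that reset-on-exit pattern in standard C++; ScopeGuard is a hypothetical helper for illustration only and is not part of this patch.

#include <functional>
#include <utility>

// Hypothetical scope guard: runs the supplied action when the enclosing scope
// ends, the same shape as the explicit SetStage(..., kOther) resets in the
// hunks above and below.
class ScopeGuard {
 public:
  explicit ScopeGuard(std::function<void()> on_exit) : on_exit_(std::move(on_exit)) {}
  ~ScopeGuard() {
    if (on_exit_) {
      on_exit_();
    }
  }
  ScopeGuard(const ScopeGuard &) = delete;
  ScopeGuard &operator=(const ScopeGuard &) = delete;

 private:
  std::function<void()> on_exit_;
};

// Usage sketch: ScopeGuard stage_reset([] { /* SetStage(kModelCompile, kOther) */ });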
GE_TIMESTAMP_START(GetTaskInfo); Status ret = GetTaskInfo(builder, model_ptr, comp_graph, subgraph_map, session_id); GE_TIMESTAMP_END(GetTaskInfo, "GraphBuilder::GetTaskInfo"); - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); GraphUtils::DumpGEGraph(comp_graph, "AfterGetTask"); GraphUtils::DumpGEGraphToOnnx(*comp_graph, "AfterGetTask"); @@ -395,24 +395,6 @@ Status GraphBuilder::BuildForHostCpuGraph(ComputeGraphPtr &comp_graph, GeModelPt return BuildForUnknownShapeGraph(comp_graph, ge_model_ptr, session_id); } -static Status InsertMemcpyNode(const ComputeGraphPtr &graph, const OutDataAnchorPtr &out_anchor, - const std::vector &in_anchors, const std::string &name) { - GE_CHECK_NOTNULL(out_anchor); - NodePtr in_node = out_anchor->GetOwnerNode(); - GE_CHECK_NOTNULL(in_node); - OpDescBuilder op_desc_builder(name, MEMCPYASYNC); - OpDescPtr op_desc = op_desc_builder.AddInput("x", in_node->GetOpDesc()->GetOutputDesc(0)) - .AddOutput("y", in_node->GetOpDesc()->GetOutputDesc(0)) - .Build(); - (void)AttrUtils::SetBool(op_desc, ATTR_NO_NEED_CONSTANT_FOLDING, false); - if (GraphUtils::InsertNodeAfter(out_anchor, in_anchors, graph->AddNode(op_desc)) != GRAPH_SUCCESS) { - REPORT_CALL_ERROR("E19999", "Insert IDENTITY node %s after %s failed", name.c_str(), in_node->GetName().c_str()); - GELOGE(FAILED, "Insert IDENTITY node %s after %s failed.", name.c_str(), in_node->GetName().c_str()); - return FAILED; - } - return SUCCESS; -} - Status GraphBuilder::MarkFpBpProfilingTaskAttr(ComputeGraphPtr &com_graph) { bool original_unknown_shape_flag = com_graph->GetGraphUnknownFlag(); com_graph->SetGraphUnknownFlag(false); diff --git a/ge/graph/build/memory/graph_mem_assigner.cc b/ge/graph/build/memory/graph_mem_assigner.cc index 3bbec914..b9f80070 100755 --- a/ge/graph/build/memory/graph_mem_assigner.cc +++ b/ge/graph/build/memory/graph_mem_assigner.cc @@ -427,6 +427,86 @@ bool IsContinuousInputConflict(const ge::NodePtr &node, const OpDescPtr &peer_op return false; } +/// op1 -> node -> op2 +/// return true when node is ref from input, and op1 or op2 is reuse input from output +bool GraphMemoryAssigner::IsRefFromInputOpCascade(const NodePtr &node) { + bool ref_from_input = false; + int32_t reuse_in_index = -1; + for (const auto &out_anchor : node->GetAllOutDataAnchors()) { + ref_from_input = GraphUtils::IsRefFromInput(out_anchor, reuse_in_index); + if (ref_from_input) { + GELOGD("IsRefFromInputOpCascade: cur node:%s:%d is ref", node->GetName().c_str(), reuse_in_index); + break; + } + } + + for (const auto &in_anchor : node->GetAllInDataAnchors()) { + const auto &peer_out_anchor = in_anchor->GetPeerOutAnchor(); + GE_IF_BOOL_EXEC(peer_out_anchor == nullptr, continue); + if (ref_from_input && GraphUtils::IsRefFromInput(peer_out_anchor, reuse_in_index)) { + GELOGD("IsRefFromInputOpCascade: in node[%s] is ref, reuse index is:%d", + peer_out_anchor->GetOwnerNode()->GetName().c_str(), reuse_in_index); + return true; + } + } + + for (const auto &out_anchor : node->GetAllOutDataAnchors()) { + const auto &peer_in_anchors = out_anchor->GetPeerInDataAnchors(); + for (const auto &peer_in_anchor : peer_in_anchors) { + auto peer_in_node = peer_in_anchor->GetOwnerNode(); + GE_IF_BOOL_EXEC(peer_in_node == nullptr, continue); + for (const auto &peer_in_node_out_anchor : peer_in_node->GetAllOutDataAnchors()) { + if (ref_from_input && GraphUtils::IsRefFromInput(peer_in_node_out_anchor, reuse_in_index)) 
{ + GELOGD("IsRefFromInputOpCascade: out node[%s] is ref, reuse index is:%d", + peer_in_node_out_anchor->GetOwnerNode()->GetName().c_str(), reuse_in_index); + return true; + } + } + } + } + return false; +} + +/// node:in0(in0 reuse out0) -> peer_node:out0 +/// update peer_node's 0th output offset with node's 0th output offset +Status GraphMemoryAssigner::UpdateRefOpOffsetReverse(const NodePtr &node) { + map out2ins; + GE_CHK_STATUS_RET(TryGetNodeRefIndexes(node, out2ins), "[Get][RefIndexes]fail for node:%s", + node->GetName().c_str()); + auto op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + vector output_list = op_desc->GetOutputOffset(); + for (const auto &out2in : out2ins) { + auto reuse_in_anchor = node->GetInDataAnchor(out2in.second); + GE_CHECK_NOTNULL(reuse_in_anchor); + auto peer_out_anchor = reuse_in_anchor->GetPeerOutAnchor(); + GE_CHECK_NOTNULL(peer_out_anchor); + auto peer_node = peer_out_anchor->GetOwnerNode(); + GE_CHECK_NOTNULL(peer_node); + auto peer_op_desc = peer_node->GetOpDesc(); + GE_CHECK_NOTNULL(peer_op_desc); + vector peer_output_list = peer_op_desc->GetOutputOffset(); + if ((peer_out_anchor->GetIdx() >= static_cast(peer_output_list.size())) + || (out2in.first >= static_cast(output_list.size()))) { + GELOGW("out of range, peer_out_anchor:%d, peer_output_list size:%zu, out2in:%d, output_list size:%zu", + peer_out_anchor->GetIdx(), + peer_output_list.size(), + out2in.first, + output_list.size()); + continue; + } + peer_output_list.at(peer_out_anchor->GetIdx()) = output_list.at(out2in.first); + peer_op_desc->SetOutputOffset(peer_output_list); + GELOGD("UpdateRefOpOffsetReverse: Node[%s] output[%d] is set from node[%s] output index[%d] offset[%ld]", + peer_node->GetName().c_str(), + peer_out_anchor->GetIdx(), + node->GetName().c_str(), + out2in.first, + output_list.at(out2in.first)); + } + return SUCCESS; +} + Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) { Status ret; // Stored nodes which need assign continuous input memory in `reverse topo order` @@ -446,12 +526,16 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) { } // Assign continuous input memory bool continuous_input = ((continuous_type & kTypeInput) != 0) || ((continuous_type & kTypeInputNoPadding) != 0); - if (continuous_input) { + if (IsRefFromInputOpCascade(node)) { + nodes_stack.push_back(node); + GELOGD("Ref: Push node:%s to stack", node->GetName().c_str()); + } else if (continuous_input) { if (AssignContinuousInputMemoryWithAtomicProcessDirectly(node, node_2_continuous_type)) { GE_CHK_STATUS_RET(AssignContinuousInputMemoryWithAtomicProcess(node, continuous_type), "[Assign][Memory:Continuous:Input]fail for node:%s", node->GetName().c_str()) } else { nodes_stack.push_back(node); + GELOGD("Continuous: Push node:%s to stack", node->GetName().c_str()); } } // Assign continuous output memory @@ -478,8 +562,13 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) { GELOGE(FAILED, "[Get][ContinuousType] find fail for node:%s", node->GetName().c_str()); return FAILED; } - GE_CHK_STATUS_RET(AssignContinuousInputMemoryWithAtomicProcess(node, iter->second, true), - "[Assign][Memory:Continuous:Input]fail for node:%s.", node->GetName().c_str()) + if (((iter->second & kTypeInput) != 0) || ((iter->second & kTypeInputNoPadding) != 0)) { + GE_CHK_STATUS_RET(AssignContinuousInputMemoryWithAtomicProcess(node, iter->second, true), + "[Assign][Memory:Continuous:Input]fail for node:%s.", node->GetName().c_str()) + } else { + 
GE_CHK_STATUS_RET(UpdateRefOpOffsetReverse(node), + "[Update][Memory:Reference:Output]fail for node:%s", node->GetName().c_str()) + } } for (auto pair : memory_offset_) { GELOGD("[Reassign][Memory:Continuous]At last, memory type = %ld, mem offset = %zu", pair.first, @@ -560,7 +649,7 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node, bool is_allocated_first_input = is_continuous_input_allocated && (in_data_anchor->GetIdx() == 0); if (is_allocated_first_input) { std::map out2ins; - GE_CHK_STATUS_RET(GetAllRef(node, out2ins), "[Get][AllRef]fail for node: %s", node->GetName().c_str()); + GE_CHK_STATUS_RET(TryGetNodeRefIndexes(node, out2ins), "[Get][RefIndexes]fail for node: %s", node->GetName().c_str()); // output is beginning offset, set offset for input; only support this case now if ((out2ins.size() == 1) && (out2ins.begin()->second == 0) && (reverse_refresh)) { auto peer_output_offset = output_list.at(peer_out_data_anchor->GetIdx()); @@ -1250,10 +1339,47 @@ Status GraphMemoryAssigner::CheckOffset() { return FAILED; } } + // check reuse input and output + GE_CHK_STATUS_RET(CheckRefNodeOffset(node), "[Check][Offset]fail for node: %s", node->GetName().c_str()); } + return SUCCESS; } +ge::Status GraphMemoryAssigner::CheckRefNodeOffset(const NodePtr &node) { + GE_CHECK_NOTNULL(node); + std::map out2ins; + GE_CHK_STATUS_RET(TryGetNodeRefIndexes(node, out2ins), "[Get][RefIndexes]fail for node: %s", node->GetName().c_str()); + auto opdesc = node->GetOpDesc(); + GE_CHECK_NOTNULL(opdesc); + auto output_list = opdesc->GetOutputOffset(); + auto input_list = opdesc->GetInputOffset(); + for (const auto &out2in : out2ins) { + auto out_i = out2in.first; + if (static_cast(out_i) >= output_list.size()) { + std::string error = "Node" + FmtToStr(opdesc->GetName()) + "output offset size" + + FmtToStr(output_list.size()) + "should bigger than ref out index" + FmtToStr(out_i); + GE_ERRORLOG_AND_ERRORMSG(ge::FAILED, error.c_str()); + return ge::FAILED; + } + auto in_i = out2in.second; + if (static_cast(in_i) >= input_list.size()) { + std::string error = "Node" + FmtToStr(opdesc->GetName()) + "input offset size" + + FmtToStr(input_list.size()) + "should bigger than ref input index" + FmtToStr(in_i); + GE_ERRORLOG_AND_ERRORMSG(ge::FAILED, error.c_str()); + return ge::FAILED; + } + if (output_list[out_i] != input_list[in_i]) { + std::string error = "Node" + FmtToStr(opdesc->GetName()) + "input offset " + FmtToStr(input_list[in_i]) + + "should equal to output offset" + FmtToStr(output_list[out_i]) + "with ref in" + + FmtToStr(in_i) + "to output" + FmtToStr(out_i); + GE_ERRORLOG_AND_ERRORMSG(ge::FAILED, error.c_str()); + return ge::FAILED; + } + } + return ge::SUCCESS; +} + ge::Status GraphMemoryAssigner::SetInputOffset() { if (memory_offset_.empty()) { REPORT_INNER_ERROR("E19999", "InnerData memory_offset_ empty, not expected, graph_id:%u, graph_name:%s", @@ -1330,6 +1456,8 @@ ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node, vector< origin_input_list = tmp_op_desc->GetInputOffset(); int64_t valid_input_index = 0; bool has_mem_type_attr = ge::AttrUtils::GetListInt(tmp_op_desc, ATTR_NAME_INPUT_MEM_TYPE_LIST, memory_type); + std::map out2ins; + GE_CHK_STATUS_RET(TryGetNodeRefIndexes(node, out2ins), "[Get][RefIndexes]fail for node: %s", node->GetName().c_str()); for (const auto &anchor : node->GetAllInDataAnchors()) { vector output_list; auto peer_out_anchor = anchor->GetPeerOutAnchor(); @@ -1344,23 +1472,30 @@ ge::Status 
GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node, vector< output_list = last_peer_out_op_desc->GetOutputOffset(); auto out_index = static_cast(peer_out_anchor->GetIdx()); if (output_list.size() > static_cast(out_index)) { + bool is_l1_type = false; int64_t input_offset = output_list.at(out_index); if (has_mem_type_attr && !origin_input_list.empty()) { auto input_size = tmp_op_desc->GetInputsSize(); auto ori_input_offset_list_size = origin_input_list.size(); auto mem_type_size = memory_type.size(); if ((input_size != mem_type_size) || (input_size != ori_input_offset_list_size)) { - std::string error = "fusion: node" + FmtToStr(tmp_op_desc->GetName()) + + std::string error = "Node" + FmtToStr(tmp_op_desc->GetName()) + + " input_size" + FmtToStr(input_size) + " diff from memory_type_size" + FmtToStr(mem_type_size) + " from ori_input_offset_list_size" + FmtToStr(ori_input_offset_list_size); GE_ERRORLOG_AND_ERRORMSG(ge::FAILED, error.c_str()); return ge::FAILED; } - // not hbm keep orignal inputoffest - // hbm inputoffset = original inputoffset + outputoffset - input_offset = (memory_type[valid_input_index] == RT_MEMORY_L1 ? origin_input_list[valid_input_index] - : origin_input_list[valid_input_index] + output_list.at(out_index)); + GELOGD("Node[%s] input[%d] has origin offset[%ld]", tmp_op_desc->GetName().c_str(), anchor->GetIdx(), + origin_input_list[valid_input_index]); + // L1 keep original input_offset + is_l1_type = (memory_type[valid_input_index] == RT_MEMORY_L1); + if (is_l1_type) { + input_offset = origin_input_list[valid_input_index]; + } else { + // hbm input_offset = original input_offset + output_offset + input_offset = origin_input_list[valid_input_index] + output_list.at(out_index); + } } const auto &in_node = GetKnownInputNode(peer_out_anchor->GetOwnerNode()); if (in_node->GetType() == CONSTANT) { @@ -1368,12 +1503,13 @@ ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node, vector< GE_CHK_STATUS(TensorUtils::GetDataOffset(tensor_desc, input_offset)); } - GELOGD("%s node[%s] input[%ld] is set from node[%s] out index[%lu] offset[%ld]", - has_mem_type_attr ? 
"Fusion" : "", - tmp_op_desc->GetName().c_str(), - valid_input_index, - peer_out_anchor->GetOwnerNode()->GetOpDesc()->GetName().c_str(), - out_index, + if (!is_l1_type) { + // update ref output_offset when input change + GE_CHK_STATUS_RET(UpdateRefOpOutputOffset(node, out2ins, anchor->GetIdx(), input_offset), + "[Update][RefOffset]fail for node: %s", node->GetName().c_str()); + } + GELOGD("Node[%s] input[%d] is set from node[%s] out index[%lu] offset[%ld]", tmp_op_desc->GetName().c_str(), + anchor->GetIdx(), peer_out_anchor->GetOwnerNode()->GetOpDesc()->GetName().c_str(), out_index, input_offset); input_list.emplace_back(input_offset); valid_input_index++; @@ -1382,6 +1518,30 @@ ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node, vector< return ge::SUCCESS; } +ge::Status GraphMemoryAssigner::UpdateRefOpOutputOffset(const NodePtr &node, const std::map &out2ins, + const int ref_in, const int64_t input_offset) const { + auto opdesc = node->GetOpDesc(); + GE_CHECK_NOTNULL(opdesc); + for (const auto &out2in : out2ins) { + auto out_i = out2in.first; + auto in_i = out2in.second; + if (in_i == ref_in) { + auto origin_output_list = opdesc->GetOutputOffset(); + if (static_cast(out_i) >= origin_output_list.size()) { + std::string error = "Node" + FmtToStr(opdesc->GetName()) + "output offset size" + + FmtToStr(origin_output_list.size()) + "should bigger than ref out index" + FmtToStr(out_i); + GE_ERRORLOG_AND_ERRORMSG(ge::FAILED, error.c_str()); + return ge::FAILED; + } + origin_output_list[out_i] = input_offset; + opdesc->SetOutputOffset(origin_output_list); + GELOGI("Node[%s] output[%d] is updated from reuse input index[%d] to offset[%ld]", opdesc->GetName().c_str(), + out_i, ref_in, input_offset); + } + } + return ge::SUCCESS; +} + ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node) const { GE_CHECK_NOTNULL(node->GetOpDesc()); vector input_list; @@ -1626,9 +1786,19 @@ void GraphMemoryAssigner::PrintMemoryOffset() { } } -ge::Status GraphMemoryAssigner::GetAllRef(const NodePtr &node, map &out2ins) { +ge::Status GraphMemoryAssigner::TryGetNodeRefIndexes(const NodePtr &node, map &out2ins) const{ + // data and netoutput no need check because only data's output or netoutput's input is used + if (node->GetType() == DATA || node->GetType() == NETOUTPUT) { + return ge::SUCCESS; + } for (const auto &out_data_anchor : node->GetAllOutDataAnchors()) { int32_t reuse_in_index = -1; + // nopadding means output[0] reuse input[0], but as history reason, + // other output index also return true for mem assign in block_mem_assigner + if (GraphUtils::IsNoPaddingRefFromInput(out_data_anchor, reuse_in_index)) { + out2ins.emplace(out_data_anchor->GetIdx(), reuse_in_index); + return ge::SUCCESS; + } bool reuse_input_flag = GraphUtils::IsRefFromInput(out_data_anchor, reuse_in_index); if (reuse_input_flag) { if (node->GetInDataAnchor(reuse_in_index) != nullptr) { diff --git a/ge/graph/build/memory/graph_mem_assigner.h b/ge/graph/build/memory/graph_mem_assigner.h index 92e599b8..33a5b6d3 100755 --- a/ge/graph/build/memory/graph_mem_assigner.h +++ b/ge/graph/build/memory/graph_mem_assigner.h @@ -110,8 +110,11 @@ class GraphMemoryAssigner { ge::Status SetInputOffset(); ge::Status UpdateOpInputOffset(const NodePtr &node) const; + ge::Status UpdateRefOpOutputOffset(const NodePtr &node, const std::map &out2ins, const int ref_in, + const int64_t input_offset) const; ge::Status CheckOffset(); + ge::Status CheckRefNodeOffset(const NodePtr &node); ge::Status AssignReferenceMemory(); @@ -125,7 
+128,7 @@ class GraphMemoryAssigner { ge::Status ReAssignAtomicMemory(bool is_loop_graph); - ge::Status GetAllRef(const NodePtr &node, std::map &out2ins); + ge::Status TryGetNodeRefIndexes(const NodePtr &node, std::map &out2ins) const; bool AssignContinuousInputMemoryWithAtomicProcessDirectly(const NodePtr &input_continuous_node, std::map &node_2_continuous_type); @@ -190,6 +193,10 @@ class GraphMemoryAssigner { Status AssignBufferPoolMemory(); + bool IsRefFromInputOpCascade(const NodePtr &node); + + Status UpdateRefOpOffsetReverse(const NodePtr &node); + MemoryOffsetMap memory_offset_; ge::ComputeGraphPtr compute_graph_; HybridMemAssignerPtr mem_assigner_; diff --git a/ge/graph/build/model_builder.cc b/ge/graph/build/model_builder.cc index 6f427683..56cd5b5a 100755 --- a/ge/graph/build/model_builder.cc +++ b/ge/graph/build/model_builder.cc @@ -243,7 +243,7 @@ Status ModelBuilder::SetInputOutputDesc() { } // if user set input node format ND, the expected node for data and netoutput format is ND in // final graph. - if ((GetLocalOmgContext().format == domi::DOMI_TENSOR_ND) && (!node_op_desc->HasAttr("_is_single_op")) && + if ((compute_graph_->GetParentGraph() == nullptr) && (GetLocalOmgContext().format == domi::DOMI_TENSOR_ND) && (!node_op_desc->HasAttr("_is_single_op")) && ((node_op_desc->GetType() == DATA_TYPE) || (node_op_desc->GetType() == NETOUTPUT))) { auto inputDescsPtr = node_op_desc->GetAllInputsDescPtr(); auto outputDescsPtr = node_op_desc->GetAllOutputsDescPtr(); @@ -794,7 +794,7 @@ Status ModelBuilder::PreBuildModel() { Status ModelBuilder::BuildModelForGetTask(ge::Model &model) { GE_CHK_STATUS_RET(AdjustInputTensorFlag(), "AdjustInputTensorFlag failed!"); - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kStreamAlloc); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kStreamAlloc); // Assign logical streams. StreamAllocator stream_allocator(compute_graph_, subgraphs_); GE_TIMESTAMP_START(AssignLogicalStreams); @@ -802,7 +802,7 @@ Status ModelBuilder::BuildModelForGetTask(ge::Model &model) { "Assign logical streams failed."); GE_TIMESTAMP_END(AssignLogicalStreams, "GraphBuilder::AssignLogicalStreams"); - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kMemoryAlloc); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kMemoryAlloc); // Assign functional op labels. 
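The graph_execute.cc hunk that follows adds GraphExecutor::GetExecuteModelId, which picks the least-loaded of several loaded models (input-queue depth plus a running flag). A standalone sketch of that selection policy using only the standard library is shown here; PickLeastLoadedModel and its parameters are hypothetical names, and the patch's early return on a zero-load candidate is simply a shortcut for the same minimum.

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using ModelLoad = std::pair<uint32_t, uint32_t>;  // (model_id, load)

// Return the id with the smallest load, or invalid_id when there are no candidates.
uint32_t PickLeastLoadedModel(const std::vector<ModelLoad> &candidates, uint32_t invalid_id) {
  if (candidates.empty()) {
    return invalid_id;
  }
  auto it = std::min_element(candidates.begin(), candidates.end(),
                             [](const ModelLoad &lhs, const ModelLoad &rhs) {
                               return lhs.second < rhs.second;
                             });
  return it->first;
}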
auto root_graph = GraphUtils::FindRootGraph(compute_graph_); (void)AttrUtils::GetInt(*root_graph, ATTR_MODEL_LABEL_NUM, label_num_); @@ -813,7 +813,7 @@ Status ModelBuilder::BuildModelForGetTask(ge::Model &model) { "Assign Memory Failed!"); GE_TIMESTAMP_END(AssignMemory, "GraphBuilder::AssignMemory"); - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); GE_TIMESTAMP_START(SetInputOutputOffset); SetInputOutputOffsetPass input_output_offset; GE_CHK_STATUS_RET(input_output_offset.Run(compute_graph_), "Set input output offset failed."); @@ -824,14 +824,14 @@ Status ModelBuilder::BuildModelForGetTask(ge::Model &model) { GE_CHK_STATUS_RET(CompileSingleOp(), "ATC builder CompileSingleOp() return fail."); GE_TIMESTAMP_EVENT_END(CompileSingleOp, "GraphBuilder::CompileSingleOp"); - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kStreamAlloc); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kStreamAlloc); // Refresh real streams and insert event nodes. GE_TIMESTAMP_START(RefreshRealStream); GE_CHK_STATUS_RET(stream_allocator.RefreshRealStream(stream_num_, event_num_), "RefreshRealStream failed."); huge_streams_ = stream_allocator.GetHugeStreams(); GE_TIMESTAMP_END(RefreshRealStream, "GraphBuilder::RefreshRealStream"); - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); GE_TIMESTAMP_START(MergeWeights); GE_CHK_STATUS_RET(MergeWeights(), "MergeWeights Failed!"); GE_TIMESTAMP_END(MergeWeights, "GraphBuilder::MergeWeights"); diff --git a/ge/graph/execute/graph_execute.cc b/ge/graph/execute/graph_execute.cc index d924302c..1d22016e 100755 --- a/ge/graph/execute/graph_execute.cc +++ b/ge/graph/execute/graph_execute.cc @@ -20,9 +20,11 @@ #include #include "graph/load/model_manager/model_manager.h" -#include "omm/csa_interact.h" +#include "graph/load/model_manager/davinci_model.h" namespace ge { +using Uint32Pair = pair; +const uint32_t kInvalidModelId = UINT32_MAX; GraphExecutor::GraphExecutor() : init_flag_(false), train_graph_flag_(false), @@ -380,7 +382,8 @@ Status GraphExecutor::ExecuteGraph(GraphId graph_id, const GeRootModelPtr &ge_ro } Status GraphExecutor::ExecuteGraphAsync(GraphId graph_id, const GeRootModelPtr &ge_root_model, - const std::vector &input_tensor) { + const std::vector &input_tensor, + const RunAsyncCallback& callback) { GELOGI("[GraphExecutor] Start to async execute graph, graph_id=%u", graph_id); if (graph_id != last_graph_id_) { auto ret = FreeExecuteMemory(); @@ -390,7 +393,7 @@ Status GraphExecutor::ExecuteGraphAsync(GraphId graph_id, const GeRootModelPtr & } last_graph_id_ = graph_id; GE_CHECK_NOTNULL_EXEC(ge_root_model, return FAILED); - Status ret = AsyncExecuteModel(ge_root_model->GetModelId(), input_tensor); + Status ret = AsyncExecuteModel(ge_root_model, input_tensor, callback); if (ret != SUCCESS) { GELOGE(GE_GRAPH_SYNC_MODEL_FAILED, "[GraphExecutor] AsyncExecuteModel Error!"); return GE_GRAPH_SYNC_MODEL_FAILED; @@ -400,11 +403,81 @@ Status GraphExecutor::ExecuteGraphAsync(GraphId graph_id, const GeRootModelPtr & return SUCCESS; } -Status GraphExecutor::AsyncExecuteModel(uint32_t model_id, const std::vector &inputs) { +bool CompareByLoad(const Uint32Pair &lhs, const Uint32Pair &rhs) { + return lhs.second < rhs.second; +} + +uint32_t 
GraphExecutor::GetExecuteModelId(const GeRootModelPtr &ge_root_model) { + std::vector model_ids = ge_root_model->GetAllModelId(); + if (model_ids.empty()) { + return kInvalidModelId; + } + if (model_ids.size() == 1) { + return ge_root_model->GetModelId(); + } + std::vector model_id_to_loads; + auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + for (auto model_id : model_ids) { + auto davinci_model = model_manager->GetModel(model_id); + auto hybrid_model = model_manager->GetHybridModel(model_id); + if (hybrid_model == nullptr) { + GE_CHECK_NOTNULL(davinci_model); + } + uint32_t input_load = hybrid_model != nullptr ? hybrid_model->GetDataInputerSize() : + davinci_model->GetDataInputerSize(); + uint32_t running_load = hybrid_model != nullptr ? static_cast(hybrid_model->GetRunningFlag()) : + static_cast(davinci_model->GetRunningFlag()); + uint32_t load = input_load + running_load; + if (load == 0) { + return model_id; + } + model_id_to_loads.emplace_back(model_id, load); + } + sort(model_id_to_loads.begin(), model_id_to_loads.end(), CompareByLoad); + if (model_id_to_loads.empty()) { + return kInvalidModelId; + } + return model_id_to_loads.begin()->first; +} + +Status GraphExecutor::SetCallback(uint32_t model_id, const GeRootModelPtr &ge_root_model, + const RunAsyncCallback &callback) { + auto model_manager = ge::ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + if (model_manager->IsNeedHybridLoad(*ge_root_model)) { + auto model = model_manager->GetHybridModel(model_id); + GE_CHECK_NOTNULL(model); + if (model->SetRunAsyncListenerCallback(callback) != SUCCESS) { + GELOGE(FAILED, "SetRunAsyncListenerCallback failed."); + return FAILED; + } + } else { + auto model = model_manager->GetModel(model_id); + GE_CHECK_NOTNULL(model); + if (model->SetRunAsyncListenerCallback(callback) != SUCCESS) { + GELOGE(FAILED, "SetRunAsyncListenerCallback failed."); + return FAILED; + } + } + return SUCCESS; +} + +Status GraphExecutor::AsyncExecuteModel(const GeRootModelPtr &ge_root_model, const std::vector &inputs, + const RunAsyncCallback &callback) { + uint32_t model_id = GetExecuteModelId(ge_root_model); + if (model_id == kInvalidModelId) { + GELOGE(INTERNAL_ERROR, "No valid model id."); + return INTERNAL_ERROR; + } try { auto model_manager = ge::ModelManager::GetInstance(); GE_CHECK_NOTNULL(model_manager); GELOGI("RunAsync begin.model_id %u", model_id); + if (SetCallback(model_id, ge_root_model, callback) != SUCCESS) { + GELOGE(FAILED, "RunAsync: SetCallBack for model fail"); + return FAILED; + } Status ret = model_manager->DataInputTensor(model_id, inputs); if (ret != SUCCESS) { @@ -416,12 +489,10 @@ Status GraphExecutor::AsyncExecuteModel(uint32_t model_id, const std::vectorDataInput(input_data, output_data); if (ret != SUCCESS) { GELOGE(ret, "DataInput: DataInput failed."); - CsaInteract::GetInstance().WriteErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); return ret; } } catch (std::bad_alloc &) { REPORT_INNER_ERROR("E19999", "Bad memory allocation exception occur failed"); GELOGE(MEMALLOC_FAILED, "DataInput failed, bad memory allocation occur !"); - CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); return MEMALLOC_FAILED; } catch (...) 
{ REPORT_INNER_ERROR("E19999", "Some exceptions occur failed"); GELOGE(FAILED, "DataInput failed, some exceptions occur !"); - CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); return FAILED; } @@ -461,18 +529,15 @@ Status GraphExecutor::GetInputOutputDescInfo(const uint32_t model_id, vectorGetInputOutputDescInfo(model_id, input_desc, output_desc); if (ret != SUCCESS) { GELOGE(ret, "GetInputOutputDescInfo failed."); - CsaInteract::GetInstance().WriteErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); return ret; } } catch (std::bad_alloc &) { REPORT_INNER_ERROR("E19999", "Bad memory allocation exception occur failed"); GELOGE(MEMALLOC_FAILED, "GetInputOutputDescInfo failed, bad memory allocation occur !"); - CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); return MEMALLOC_FAILED; } catch (...) { REPORT_INNER_ERROR("E19999", "Some exceptions occur failed"); GELOGE(FAILED, "GetInputOutputDescInfo failed, some exceptions occur !"); - CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); return FAILED; } @@ -490,18 +555,15 @@ Status GraphExecutor::GetInputOutputDescInfo(const uint32_t model_id, vector &output_tensor); ge::Status ExecuteGraphAsync(GraphId graph_id, const GeRootModelPtr &ge_root_model, - const std::vector &input_tensor); + const std::vector &input_tensor, const RunAsyncCallback &callback); Status SetCondition(std::mutex *mutex, std::condition_variable *cond, std::shared_ptr listener); @@ -116,6 +116,8 @@ class GraphExecutor { static Status GetOpDescInfo(uint32_t device_id, uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info); + uint32_t GetExecuteModelId(const GeRootModelPtr &ge_root_model); + private: Status PrepareInputData(const std::vector &input_tensor, InputData &graph_input_data, OutputData &graph_output_data, std::vector &output_desc); @@ -123,7 +125,8 @@ class GraphExecutor { Status SyncExecuteModel(uint32_t model_id, const std::vector &input_tensor, std::vector &output_tensor); - Status AsyncExecuteModel(uint32_t model_id, const std::vector &input_tensor); + Status AsyncExecuteModel(const GeRootModelPtr &ge_root_model, const std::vector &input_tensor, + const RunAsyncCallback &callback); void InitModelIdInfo(std::vector &out_model_id_info, std::vector &sub_graph_vec, uint32_t output_size); @@ -132,6 +135,9 @@ class GraphExecutor { Status MallocInOutBuffer(const std::vector &buffer_size, std::vector &data_addr); + static Status SetCallback(uint32_t model_id, const GeRootModelPtr &ge_root_model, + const RunAsyncCallback &callback); + bool init_flag_; bool train_graph_flag_; diff --git a/ge/graph/load/graph_loader.cc b/ge/graph/load/graph_loader.cc index cf95b271..ff1b2178 100755 --- a/ge/graph/load/graph_loader.cc +++ b/ge/graph/load/graph_loader.cc @@ -24,7 +24,6 @@ #include "graph/ge_context.h" #include "graph/load/model_manager/model_manager.h" #include "graph/manager/graph_var_manager.h" -#include "omm/csa_interact.h" namespace ge { Status GraphLoader::UnloadModel(uint32_t model_id) { @@ -40,7 +39,6 @@ Status GraphLoader::UnloadModel(uint32_t model_id) { ret = model_manager->Unload(model_id); if (ret != SUCCESS) { GELOGE(ret, "UnloadModel: Unload failed. 
model id:%u", model_id); - CsaInteract::GetInstance().WriteErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_UNLOAD); return ret; } GELOGI("UnLoad model success, model id:%u.", model_id); @@ -55,7 +53,6 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptrGetModelId(); auto model_manager = ModelManager::GetInstance(); GE_CHECK_NOTNULL(model_manager); Status ret = model_manager->LoadModelOnline(model_id, ge_root_model_ptr, listener); if (ret != SUCCESS) { GELOGE(ret, "LoadModel: Load failed. ret = %u", ret); - CsaInteract::GetInstance().WriteErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_LOAD); - rt_ret = rtDeviceReset(GetContext().DeviceId()); if (rt_ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", @@ -95,7 +89,6 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptr v_memory_type, size_t i) { return has_mem_type_attr && (v_memory_type[i] == RT_MEMORY_L1); } - -static uint64_t GetNowTime() { - uint64_t ret = 0; - mmTimeval tv; - if (mmGetTimeOfDay(&tv, nullptr) == 0) { - ret = tv.tv_sec * 1000000ULL + tv.tv_usec; - } - - return ret; -} - -static void ReplaceStringElem(std::string &str) { - for_each(str.begin(), str.end(), [](char &ch) { - if ((ch == ' ') || (ch == '.') || (ch == '/') || (ch == '\\')) { - ch = '_'; - } - }); -} } // namespace static int32_t GetIrDataType(ge::DataType data_type) { @@ -194,66 +176,6 @@ void DataDumper::SaveOpDebugId(uint32_t task_id, uint32_t stream_id, void *op_de is_op_debug_ = is_op_debug; } -void DataDumper::SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, - uint32_t stream_id) { - GELOGD("Start SaveDumpOpInfo of task_id: %u, stream_id: %u", task_id, stream_id); - OpDescInfo op_desc_info; - op_desc_info.op_name = op->GetName(); - op_desc_info.op_type = op->GetType(); - op_desc_info.task_id = task_id; - op_desc_info.stream_id = stream_id; - for (size_t i = 0; i < op->GetAllInputsSize(); ++i) { - GeTensorDescPtr input_tensor_desc = op->MutableInputDesc(i); - if (input_tensor_desc == nullptr) { - continue; - } - op_desc_info.input_format.emplace_back(input_tensor_desc->GetFormat()); - op_desc_info.input_shape.emplace_back(input_tensor_desc->GetShape().GetDims()); - op_desc_info.input_data_type.emplace_back(input_tensor_desc->GetDataType()); - int64_t input_size = 0; - - if (TensorUtils::GetTensorSizeInBytes(*input_tensor_desc, input_size) != SUCCESS) { - GELOGW("Get input size failed"); - return; - } - GELOGD("Save dump op info, the input size is %ld", input_size); - op_desc_info.input_size.emplace_back(input_size); - } - for (size_t j = 0; j < op->GetOutputsSize(); ++j) { - GeTensorDescPtr output_tensor_desc = op->MutableOutputDesc(j); - if (output_tensor_desc == nullptr) { - continue; - } - op_desc_info.output_format.emplace_back(output_tensor_desc->GetFormat()); - op_desc_info.output_shape.emplace_back(output_tensor_desc->GetShape().GetDims()); - op_desc_info.output_data_type.emplace_back(output_tensor_desc->GetDataType()); - int64_t output_size = 0; - if (TensorUtils::GetTensorSizeInBytes(*output_tensor_desc, output_size) != SUCCESS) { - GELOGW("Get input size failed"); - return; - } - GELOGD("Save dump op info, the output size is %ld", output_size); - op_desc_info.output_size.emplace_back(output_size); - } - op_desc_info.input_addrs = ModelUtils::GetInputDataAddrs(model_param, op); - op_desc_info.output_addrs = ModelUtils::GetOutputDataAddrs(model_param, op); - - op_desc_info_.emplace_back(op_desc_info); 
-} - -bool DataDumper::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const { - GELOGI("There are %zu op need to dump.", op_desc_info_.size()); - for (size_t index = 0; index < op_desc_info_.size(); ++index) { - OpDescInfo dump_op_info = op_desc_info_.at(index); - if (dump_op_info.task_id == task_id && dump_op_info.stream_id == stream_id) { - GELOGI("find exception op of task_id: %u, stream_id: %u.", task_id, stream_id); - op_desc_info = dump_op_info; - return true; - } - } - return false; -} - void DataDumper::SaveDumpTask(uint32_t task_id, uint32_t stream_id, const std::shared_ptr &op_desc, uintptr_t args) { if (op_desc == nullptr) { @@ -295,7 +217,7 @@ void DataDumper::SaveDumpTask(uint32_t task_id, uint32_t stream_id, const std::s } static void SetOpMappingLoopAddr(uintptr_t step_id, uintptr_t loop_per_iter, uintptr_t loop_cond, - aicpu::dump::OpMappingInfo &op_mapping_info) { + toolkit::aicpu::dump::OpMappingInfo &op_mapping_info) { if (step_id != 0) { GELOGI("step_id exists."); op_mapping_info.set_step_id_addr(static_cast(step_id)); @@ -312,7 +234,8 @@ static void SetOpMappingLoopAddr(uintptr_t step_id, uintptr_t loop_per_iter, uin } } -Status DataDumper::GenerateOutput(aicpu::dump::Output &output, const OpDesc::Vistor &tensor_descs, +Status DataDumper::GenerateOutput(toolkit::aicpu::dump::Output &output, + const OpDesc::Vistor &tensor_descs, const uintptr_t &addr, size_t index) { output.set_data_type(static_cast(GetIrDataType(tensor_descs.at(index).GetDataType()))); output.set_format(static_cast(tensor_descs.at(index).GetFormat())); @@ -343,7 +266,8 @@ Status DataDumper::GenerateOutput(aicpu::dump::Output &output, const OpDesc::Vis return SUCCESS; } -Status DataDumper::DumpRefOutput(const DataDumper::InnerDumpInfo &inner_dump_info, aicpu::dump::Output &output, +Status DataDumper::DumpRefOutput(const DataDumper::InnerDumpInfo &inner_dump_info, + toolkit::aicpu::dump::Output &output, size_t i, const std::string &node_name_index) { std::string dump_op_name; std::string input_or_output; @@ -384,17 +308,9 @@ Status DataDumper::DumpRefOutput(const DataDumper::InnerDumpInfo &inner_dump_inf return SUCCESS; } -Status DataDumper::DumpOutputWithTask(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task) { +Status DataDumper::DumpOutputWithTask(const InnerDumpInfo &inner_dump_info, toolkit::aicpu::dump::Task &task) { const auto &output_descs = inner_dump_info.op->GetAllOutputsDesc(); const std::vector output_addrs = ModelUtils::GetOutputDataAddrs(*runtime_param_, inner_dump_info.op); - if (output_descs.size() != output_addrs.size()) { - REPORT_INNER_ERROR("E19999", "output_desc size:%zu != output addr size:%zu in op:%s(%s)", - output_descs.size(), output_addrs.size(), - inner_dump_info.op->GetName().c_str(), inner_dump_info.op->GetType().c_str()); - GELOGE(PARAM_INVALID, "Invalid output desc addrs size %zu, op %s has %zu output desc.", output_addrs.size(), - inner_dump_info.op->GetName().c_str(), output_descs.size()); - return PARAM_INVALID; - } std::vector v_memory_type; bool has_mem_type_attr = ge::AttrUtils::GetListInt(inner_dump_info.op, ATTR_NAME_OUTPUT_MEM_TYPE_LIST, v_memory_type); GE_RT_PARAM_INVALID_WITH_LOG_IF_TRUE(has_mem_type_attr && (v_memory_type.size() != output_descs.size()), @@ -402,10 +318,33 @@ Status DataDumper::DumpOutputWithTask(const InnerDumpInfo &inner_dump_info, aicp inner_dump_info.op->GetName().c_str(), output_descs.size(), v_memory_type.size()); + size_t no_need_dump_output_num = 0; for (size_t i = 0; i < output_descs.size(); 
++i) { - aicpu::dump::Output output; + toolkit::aicpu::dump::Output output; std::string node_name_index; const auto &output_desc = output_descs.at(i); + int32_t calc_type = 0; + bool has_calc_type = ge::AttrUtils::GetInt(output_desc, ATTR_NAME_MEMORY_SIZE_CALC_TYPE, calc_type); + if (has_calc_type && (calc_type == static_cast(ge::MemorySizeCalcType::ALWAYS_EMPTY))) { + GELOGD("Node[%s] output[index:%zu] [name:%s] is an optional output, don't need to dump this output.", + inner_dump_info.op->GetName().c_str(), i, output_desc.GetName().c_str()); + ++no_need_dump_output_num; + continue; + } + + if (output_descs.size() - no_need_dump_output_num < output_addrs.size()) { + REPORT_INNER_ERROR("E19999", "The number of output does not match in op:%s(%s). The size[%zu] of output which is " + "no need to dump should not greater than the size[%zu] of output descs minus the size[%zu] of " + "output which is need to dump.", inner_dump_info.op->GetName().c_str(), + inner_dump_info.op->GetType().c_str(), no_need_dump_output_num, output_descs.size(), + output_addrs.size()); + GELOGE(PARAM_INVALID, "The number of output does not match in op:%s(%s). The size[%zu] of output which is no need" + " to dump should not greater than the size[%zu] of output descs minus the size[%zu] of output which is " + "need to dump.", inner_dump_info.op->GetName().c_str(), inner_dump_info.op->GetType().c_str(), + no_need_dump_output_num, output_descs.size(), output_addrs.size()); + return PARAM_INVALID; + } + // check dump output tensor desc is redirected by attr ATTR_DATA_DUMP_REF if (AttrUtils::GetStr(&output_desc, ATTR_DATA_DUMP_REF, node_name_index)) { GE_CHK_STATUS_RET(DumpRefOutput(inner_dump_info, output, i, node_name_index), "DumpRefOutput failed"); @@ -433,14 +372,14 @@ Status DataDumper::DumpOutputWithTask(const InnerDumpInfo &inner_dump_info, aicp return SUCCESS; } -Status DataDumper::DumpOutput(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task) { +Status DataDumper::DumpOutput(const InnerDumpInfo &inner_dump_info, toolkit::aicpu::dump::Task &task) { GELOGI("Start dump output"); if (inner_dump_info.is_task) { // tbe or aicpu op, these ops are with task return DumpOutputWithTask(inner_dump_info, task); } // else data, const or variable op - aicpu::dump::Output output; + toolkit::aicpu::dump::Output output; auto output_tensor = inner_dump_info.op->GetOutputDescPtr(inner_dump_info.output_anchor_index); const std::vector output_addrs = ModelUtils::GetOutputDataAddrs(*runtime_param_, inner_dump_info.op); if (output_tensor == nullptr) { @@ -485,7 +424,7 @@ Status DataDumper::DumpOutput(const InnerDumpInfo &inner_dump_info, aicpu::dump: return SUCCESS; } -Status DataDumper::GenerateInput(aicpu::dump::Input &input, const OpDesc::Vistor &tensor_descs, +Status DataDumper::GenerateInput(toolkit::aicpu::dump::Input &input, const OpDesc::Vistor &tensor_descs, const uintptr_t &addr, size_t index) { input.set_data_type(static_cast(GetIrDataType(tensor_descs.at(index).GetDataType()))); input.set_format(static_cast(tensor_descs.at(index).GetFormat())); @@ -510,8 +449,8 @@ Status DataDumper::GenerateInput(aicpu::dump::Input &input, const OpDesc::Vistor return SUCCESS; } -Status DataDumper::DumpRefInput(const DataDumper::InnerDumpInfo &inner_dump_info, aicpu::dump::Input &input, size_t i, - const std::string &node_name_index) { +Status DataDumper::DumpRefInput(const DataDumper::InnerDumpInfo &inner_dump_info, toolkit::aicpu::dump::Input &input, + size_t i, const std::string &node_name_index) { std::string dump_op_name; 
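The op_mapping.proto package rename earlier in this patch (aicpu.dump to toolkit.aicpu.dump) moves every generated message into the toolkit::aicpu::dump C++ namespace, which is why the signatures in this file are updated. A minimal usage sketch follows; the header name op_mapping.pb.h is assumed from the renamed proto file, and only setters that already appear elsewhere in this patch are used.

#include <cstdint>
#include <string>
#include "op_mapping.pb.h"  // assumed protoc output for op_mapping.proto

// Fill the two fields also set in the LoadDumpInfo/UnloadDumpInfo hunks below.
void FillBasicMapping(toolkit::aicpu::dump::OpMappingInfo &info,
                      const std::string &dump_path, uint32_t model_id) {
  info.set_dump_path(dump_path);
  info.set_model_id(model_id);
}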
std::string input_or_output; size_t index; @@ -551,7 +490,7 @@ Status DataDumper::DumpRefInput(const DataDumper::InnerDumpInfo &inner_dump_info return SUCCESS; } -Status DataDumper::DumpInput(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task) { +Status DataDumper::DumpInput(const InnerDumpInfo &inner_dump_info, toolkit::aicpu::dump::Task &task) { GELOGI("Start dump input"); const auto &input_descs = inner_dump_info.op->GetAllInputsDesc(); const std::vector input_addrs = ModelUtils::GetInputDataAddrs(*runtime_param_, inner_dump_info.op); @@ -570,7 +509,7 @@ Status DataDumper::DumpInput(const InnerDumpInfo &inner_dump_info, aicpu::dump:: inner_dump_info.op->GetName().c_str(), input_descs.size(), v_memory_type.size()); for (size_t i = 0; i < input_descs.size(); ++i) { - aicpu::dump::Input input; + toolkit::aicpu::dump::Input input; std::string node_name_index; // check dump input tensor desc is redirected by attr ATTR_DATA_DUMP_REF if (AttrUtils::GetStr(&input_descs.at(i), ATTR_DATA_DUMP_REF, node_name_index)) { @@ -601,15 +540,15 @@ Status DataDumper::DumpInput(const InnerDumpInfo &inner_dump_info, aicpu::dump:: return SUCCESS; } -void DataDumper::GenerateOpBuffer(const int64_t &size, aicpu::dump::Task &task) { - aicpu::dump::OpBuffer op_buffer; - op_buffer.set_buffer_type(aicpu::dump::BufferType::L1); +void DataDumper::GenerateOpBuffer(const int64_t &size, toolkit::aicpu::dump::Task &task) { + toolkit::aicpu::dump::OpBuffer op_buffer; + op_buffer.set_buffer_type(toolkit::aicpu::dump::BufferType::L1); op_buffer.set_address(reinterpret_cast(l1_fusion_addr_)); op_buffer.set_size(size); task.mutable_buffer()->Add(std::move(op_buffer)); } -Status DataDumper::ExecuteLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_info) { +Status DataDumper::ExecuteLoadDumpInfo(toolkit::aicpu::dump::OpMappingInfo &op_mapping_info) { std::string proto_str; size_t proto_size = op_mapping_info.ByteSizeLong(); bool ret = op_mapping_info.SerializeToString(&proto_str); @@ -653,7 +592,7 @@ Status DataDumper::ExecuteLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_in return SUCCESS; } -Status DataDumper::ExecuteUnLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_info) { +Status DataDumper::ExecuteUnLoadDumpInfo(toolkit::aicpu::dump::OpMappingInfo &op_mapping_info) { std::string proto_str; size_t proto_size = op_mapping_info.ByteSizeLong(); bool ret = op_mapping_info.SerializeToString(&proto_str); @@ -704,7 +643,7 @@ Status DataDumper::LoadDumpInfo() { GELOGD("op_list_ is empty"); } - aicpu::dump::OpMappingInfo op_mapping_info; + toolkit::aicpu::dump::OpMappingInfo op_mapping_info; auto dump_path = dump_properties_.GetDumpPath() + std::to_string(device_id_) + "/"; op_mapping_info.set_dump_path(dump_path); @@ -733,11 +672,11 @@ Status DataDumper::LoadDumpInfo() { return SUCCESS; } -Status DataDumper::BuildTaskInfo(aicpu::dump::OpMappingInfo &op_mapping_info) { +Status DataDumper::BuildTaskInfo(toolkit::aicpu::dump::OpMappingInfo &op_mapping_info) { for (const auto &op_iter : op_list_) { auto op_desc = op_iter.op; GELOGD("Op %s in model begin to add task in op_mapping_info", op_desc->GetName().c_str()); - aicpu::dump::Task task; + toolkit::aicpu::dump::Task task; task.set_end_graph(false); task.set_task_id(op_iter.task_id); task.set_stream_id(op_iter.stream_id); @@ -785,10 +724,10 @@ Status DataDumper::BuildTaskInfo(aicpu::dump::OpMappingInfo &op_mapping_info) { } void DataDumper::SetEndGraphIdToAicpu(uint32_t task_id, uint32_t stream_id, - aicpu::dump::OpMappingInfo &op_mapping_info) { + 
toolkit::aicpu::dump::OpMappingInfo &op_mapping_info) { if (dump_properties_.GetDumpMode() == kDumpOutput || dump_properties_.GetDumpMode() == kDumpInput || dump_properties_.GetDumpMode() == kDumpAll) { - aicpu::dump::Task task; + toolkit::aicpu::dump::Task task; task.set_end_graph(true); task.set_task_id(end_graph_task_id_); task.set_stream_id(end_graph_stream_id_); @@ -797,7 +736,7 @@ void DataDumper::SetEndGraphIdToAicpu(uint32_t task_id, uint32_t stream_id, op_mapping_info.mutable_task()->Add(std::move(task)); is_end_graph_ = true; - if (op_mapping_info.model_name_param_case() == aicpu::dump::OpMappingInfo::kModelName) { + if (op_mapping_info.model_name_param_case() == toolkit::aicpu::dump::OpMappingInfo::kModelName) { GELOGI("Add end_graph_info to aicpu, model_name is %s, task_id is %u, stream_id is %u", op_mapping_info.model_name().c_str(), end_graph_task_id_, end_graph_stream_id_); return; @@ -807,10 +746,10 @@ void DataDumper::SetEndGraphIdToAicpu(uint32_t task_id, uint32_t stream_id, } void DataDumper::SetOpDebugIdToAicpu(uint32_t task_id, uint32_t stream_id, void *op_debug_addr, - aicpu::dump::OpMappingInfo &op_mapping_info) { + toolkit::aicpu::dump::OpMappingInfo &op_mapping_info) { if (is_op_debug_) { GELOGI("add op_debug_info to aicpu, task_id is %u, stream_id is %u", task_id, stream_id); - aicpu::dump::Task task; + toolkit::aicpu::dump::Task task; task.set_end_graph(false); task.set_task_id(task_id); task.set_stream_id(stream_id); @@ -818,7 +757,7 @@ void DataDumper::SetOpDebugIdToAicpu(uint32_t task_id, uint32_t stream_id, void task.mutable_op()->set_op_type(OP_TYPE_OP_DEBUG); // set output - aicpu::dump::Output output; + toolkit::aicpu::dump::Output output; output.set_data_type(DT_UINT8); output.set_format(FORMAT_ND); @@ -844,12 +783,12 @@ Status DataDumper::UnloadDumpInfo() { } GELOGI("UnloadDumpInfo start."); - aicpu::dump::OpMappingInfo op_mapping_info; + toolkit::aicpu::dump::OpMappingInfo op_mapping_info; op_mapping_info.set_model_id(model_id_); op_mapping_info.set_flag(kAicpuUnloadFlag); for (const auto &op_iter : op_list_) { - aicpu::dump::Task task; + toolkit::aicpu::dump::Task task; task.set_task_id(op_iter.task_id); task.set_stream_id(op_iter.stream_id); op_mapping_info.mutable_task()->Add(std::move(task)); @@ -904,98 +843,4 @@ void DataDumper::PrintCheckLog(string &dump_list_key) { } } } - -Status DataDumper::DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file) { - GELOGI("Start to dump exception input"); - for (size_t i = 0; i < op_desc_info.input_addrs.size(); i++) { - if (Debug::DumpDevMem(dump_file.data(), op_desc_info.input_addrs.at(i), op_desc_info.input_size.at(i)) != SUCCESS) { - GELOGE(PARAM_INVALID, "Dump the %zu input data failed", i); - return PARAM_INVALID; - } - } - return SUCCESS; -} - -Status DataDumper::DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file) { - GELOGI("Start to dump exception output"); - for (size_t i = 0; i < op_desc_info.output_addrs.size(); i++) { - if (Debug::DumpDevMem(dump_file.data(), op_desc_info.output_addrs.at(i), op_desc_info.output_size.at(i)) != - SUCCESS) { - GELOGE(PARAM_INVALID, "Dump the %zu input data failed", i); - return PARAM_INVALID; - } - } - return SUCCESS; -} - -Status DataDumper::DumpExceptionInfo(const std::vector exception_infos) { - GELOGI("Start to dump exception info"); - for (const rtExceptionInfo &iter : exception_infos) { - OpDescInfo op_desc_info; - if (GetOpDescInfo(iter.streamid, iter.taskid, op_desc_info)) { - toolkit::dumpdata::DumpData dump_data; 
- dump_data.set_version("2.0"); - dump_data.set_dump_time(GetNowTime()); - dump_data.set_op_name(op_desc_info.op_name); - for (size_t i = 0; i < op_desc_info.input_format.size(); ++i) { - toolkit::dumpdata::OpInput input; - input.set_data_type(toolkit::dumpdata::OutputDataType(GetIrDataType(op_desc_info.input_data_type[i]))); - input.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.input_format[i])); - for (auto dim : op_desc_info.input_shape[i]) { - input.mutable_shape()->add_dim(dim); - } - input.set_size(op_desc_info.input_size[i]); - GELOGI("The input size int exception is %ld", op_desc_info.input_size[i]); - dump_data.mutable_input()->Add(std::move(input)); - } - for (size_t j = 0; j < op_desc_info.output_format.size(); ++j) { - toolkit::dumpdata::OpOutput output; - output.set_data_type(toolkit::dumpdata::OutputDataType(GetIrDataType(op_desc_info.output_data_type[j]))); - output.set_format(toolkit::dumpdata::OutputFormat(op_desc_info.output_format[j])); - for (auto dim : op_desc_info.output_shape[j]) { - output.mutable_shape()->add_dim(dim); - } - output.set_size(op_desc_info.output_size[j]); - GELOGI("The output size int exception is %ld", op_desc_info.output_size[j]); - dump_data.mutable_output()->Add(std::move(output)); - } - uint64_t now_time = GetNowTime(); - std::string op_name = op_desc_info.op_name; - std::string op_type = op_desc_info.op_type; - ReplaceStringElem(op_name); - ReplaceStringElem(op_type); - string dump_file_path = - "./" + op_type + "." + op_name + "." + std::to_string(op_desc_info.task_id) + "." + std::to_string(now_time); - GELOGI("The exception dump file path is %s", dump_file_path.c_str()); - - uint64_t proto_size = dump_data.ByteSizeLong(); - std::unique_ptr proto_msg(new (std::nothrow) char[proto_size]); - bool ret = dump_data.SerializeToArray(proto_msg.get(), proto_size); - if (!ret || proto_size == 0) { - REPORT_INNER_ERROR("E19999", "Serialize proto to string fail"); - GELOGE(PARAM_INVALID, "Dump data proto serialize failed"); - return PARAM_INVALID; - } - - GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), &proto_size, sizeof(uint64_t)), - "Failed to dump proto size"); - GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), proto_msg.get(), proto_size), - "Failed to dump proto msg"); - if (DumpExceptionInput(op_desc_info, dump_file_path) != SUCCESS) { - GELOGE(PARAM_INVALID, "Dump exception input failed"); - return PARAM_INVALID; - } - - if (DumpExceptionOutput(op_desc_info, dump_file_path) != SUCCESS) { - GELOGE(PARAM_INVALID, "Dump exception output failed"); - return PARAM_INVALID; - } - GELOGI("Dump exception info SUCCESS"); - } else { - GELOGE(PARAM_INVALID, "Get op desc info failed,task id:%u,stream id:%u", iter.taskid, iter.streamid); - return PARAM_INVALID; - } - } - return SUCCESS; -} } // namespace ge diff --git a/ge/graph/load/model_manager/data_dumper.h b/ge/graph/load/model_manager/data_dumper.h index 06b42afd..d1714950 100755 --- a/ge/graph/load/model_manager/data_dumper.h +++ b/ge/graph/load/model_manager/data_dumper.h @@ -27,7 +27,7 @@ #include "graph/node.h" #include "graph/compute_graph.h" #include "proto/ge_ir.pb.h" -#include "proto/op_mapping_info.pb.h" +#include "proto/op_mapping.pb.h" #include "runtime/mem.h" #include "task_info/task_info.h" #include "framework/common/ge_types.h" @@ -70,8 +70,6 @@ class DataDumper { void SaveDumpInput(const std::shared_ptr &node); - void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id); - // args is 
device memory stored first output addr void SaveDumpTask(uint32_t task_id, uint32_t stream_id, const std::shared_ptr &op_desc, uintptr_t args); void SaveEndGraphId(uint32_t task_id, uint32_t stream_id); @@ -87,14 +85,8 @@ class DataDumper { void SetDumpProperties(const DumpProperties &dump_properties) { dump_properties_ = dump_properties; } const DumpProperties &GetDumpProperties() const { return dump_properties_; } - bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const; const std::vector &GetAllOpDescInfo() const { return op_desc_info_; } - // Dump exception info - Status DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file); - Status DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file); - Status DumpExceptionInfo(const std::vector exception_infos); - private: void ReleaseDevMem(void **ptr) noexcept; @@ -136,24 +128,25 @@ class DataDumper { DumpProperties dump_properties_; // Build task info of op mapping info - Status BuildTaskInfo(aicpu::dump::OpMappingInfo &op_mapping_info); - Status DumpOutput(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task); - Status DumpRefOutput(const DataDumper::InnerDumpInfo &inner_dump_info, aicpu::dump::Output &output, size_t i, - const std::string &node_name_index); - Status DumpOutputWithTask(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task); - Status DumpInput(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task); - Status DumpRefInput(const DataDumper::InnerDumpInfo &inner_dump_info, aicpu::dump::Input &input, size_t i, - const std::string &node_name_index); - Status ExecuteLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_info); - void SetEndGraphIdToAicpu(uint32_t task_id, uint32_t stream_id, aicpu::dump::OpMappingInfo &op_mapping_info); + Status BuildTaskInfo(toolkit::aicpu::dump::OpMappingInfo &op_mapping_info); + Status DumpOutput(const InnerDumpInfo &inner_dump_info, toolkit::aicpu::dump::Task &task); + Status DumpRefOutput(const DataDumper::InnerDumpInfo &inner_dump_info, toolkit::aicpu::dump::Output &output, + size_t i, const std::string &node_name_index); + Status DumpOutputWithTask(const InnerDumpInfo &inner_dump_info, toolkit::aicpu::dump::Task &task); + Status DumpInput(const InnerDumpInfo &inner_dump_info, toolkit::aicpu::dump::Task &task); + Status DumpRefInput(const DataDumper::InnerDumpInfo &inner_dump_info, toolkit::aicpu::dump::Input &input, + size_t i, const std::string &node_name_index); + Status ExecuteLoadDumpInfo(toolkit::aicpu::dump::OpMappingInfo &op_mapping_info); + void SetEndGraphIdToAicpu(uint32_t task_id, uint32_t stream_id, + toolkit::aicpu::dump::OpMappingInfo &op_mapping_info); void SetOpDebugIdToAicpu(uint32_t task_id, uint32_t stream_id, void *op_debug_addr, - aicpu::dump::OpMappingInfo &op_mapping_info); - Status ExecuteUnLoadDumpInfo(aicpu::dump::OpMappingInfo &op_mapping_info); - Status GenerateInput(aicpu::dump::Input &input, const OpDesc::Vistor &tensor_descs, + toolkit::aicpu::dump::OpMappingInfo &op_mapping_info); + Status ExecuteUnLoadDumpInfo(toolkit::aicpu::dump::OpMappingInfo &op_mapping_info); + Status GenerateInput(toolkit::aicpu::dump::Input &input, const OpDesc::Vistor &tensor_descs, const uintptr_t &addr, size_t index); - Status GenerateOutput(aicpu::dump::Output &output, const OpDesc::Vistor &tensor_descs, + Status GenerateOutput(toolkit::aicpu::dump::Output &output, const OpDesc::Vistor &tensor_descs, const uintptr_t &addr, size_t index); - void GenerateOpBuffer(const int64_t &size, 
aicpu::dump::Task &task); + void GenerateOpBuffer(const int64_t &size, toolkit::aicpu::dump::Task &task); }; struct DataDumper::InnerDumpInfo { uint32_t task_id; diff --git a/ge/graph/load/model_manager/data_inputer.h b/ge/graph/load/model_manager/data_inputer.h index 14ebcea5..b8d145d4 100755 --- a/ge/graph/load/model_manager/data_inputer.h +++ b/ge/graph/load/model_manager/data_inputer.h @@ -134,6 +134,8 @@ class DataInputer { /// void Stop() { queue_.Stop(); } + uint32_t Size() { return queue_.Size(); } + private: /// /// @ingroup domi_ome diff --git a/ge/graph/load/model_manager/davinci_model.cc b/ge/graph/load/model_manager/davinci_model.cc index 78f4a64c..81edd40b 100755 --- a/ge/graph/load/model_manager/davinci_model.cc +++ b/ge/graph/load/model_manager/davinci_model.cc @@ -50,7 +50,6 @@ #include "graph/utils/type_utils.h" #include "init/gelib.h" #include "mmpa/mmpa_api.h" -#include "omm/csa_interact.h" #include "runtime/base.h" #include "runtime/dev.h" #include "runtime/event.h" @@ -2656,9 +2655,9 @@ Status DavinciModel::ReturnResult(uint32_t data_id, const bool rslt_flg, const b GE_CHECK_NOTNULL(model_manager); auto exception_infos = model_manager->GetExceptionInfos(); if (exception_infos.size() > 0) { - GE_CHK_STATUS_RET(data_dumper_.DumpExceptionInfo(exception_infos), "Dump exception info failed"); + GE_CHK_STATUS_RET(DumpExceptionInfo(exception_infos), "[Dump][Exception] Dump exception info failed."); } else { - GELOGI("Exception info is null"); + GELOGI("[Dump][Exception] Exception info is null."); } GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, INTERNAL_ERROR, outputs), "OnComputeDone failed."); return INTERNAL_ERROR; @@ -2718,7 +2717,6 @@ Status DavinciModel::ReturnNoOutput(uint32_t data_id) { void *DavinciModel::Run(DavinciModel *model) { GE_CHK_BOOL_EXEC(model != nullptr, - CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); return nullptr, "model_pointer is null!") bool seq_end_flag = false; uint32_t model_id = model->Id(); @@ -2735,17 +2733,20 @@ void *DavinciModel::Run(DavinciModel *model) { // DeviceReset before thread run finished! GE_MAKE_GUARD(not_used_var, [&] { GE_CHK_RT(rtDeviceReset(device_id)); }); - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelExecute, ErrorMessage::kModelExecute); + ErrorManager::GetInstance().SetStage(error_message::kModelExecute, error_message::kModelExecute); while (model->RunFlag()) { + // Model has not truly started running before data is received + model->SetRunningFlag(false); bool rslt_flg = true; if (model->GetDataInputer() == nullptr) { GELOGW("Data inputer is nullptr."); - CsaInteract::GetInstance().StoreInternalErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); break; } std::shared_ptr data_wrapper; Status ret = model->GetDataInputer()->Pop(data_wrapper); + // Model run actually starts after data is received.
+ model->SetRunningFlag(true); if (data_wrapper == nullptr || ret != SUCCESS) { GELOGI("data_wrapper is null!"); continue; @@ -2759,7 +2760,6 @@ void *DavinciModel::Run(DavinciModel *model) { ret = model->SyncVarData(); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( ret != SUCCESS, (void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput()); - CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); continue, "Copy input data to model failed."); // [No need to check value] GE_IF_BOOL_EXEC(model->is_first_execute_, GE_TIMESTAMP_EVENT_END(Model_SyncVarData, "Model Run SyncVarData")); @@ -2769,7 +2769,6 @@ void *DavinciModel::Run(DavinciModel *model) { ret = model->CopyInputData(current_data, false); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( ret != SUCCESS, (void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput()); - CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); continue, "Copy input data to model failed."); // [No need to check value] if (model->is_online_infer_dynamic_ && !model->is_getnext_sink_dynamic_) { model->cur_dynamic_dims_.clear(); @@ -2790,7 +2789,6 @@ void *DavinciModel::Run(DavinciModel *model) { rt_ret = rtModelExecute(model->rt_model_handle_, model->rt_model_stream_, 0); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, rslt_flg = false; (void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput()); - CsaInteract::GetInstance().WriteErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC); continue); GELOGI("rtModelExecute end"); GE_IF_BOOL_EXEC(model->is_first_execute_, GE_TIMESTAMP_EVENT_END(rtModelExecute, "GraphExcute::rtModelExecute")); @@ -2808,7 +2806,6 @@ void *DavinciModel::Run(DavinciModel *model) { rt_ret != RT_ERROR_NONE, rslt_flg = false; GELOGI("seq_end_flg: %d", seq_end_flag); (void)model->ReturnResult(current_data.index, false, seq_end_flag, data_wrapper->GetOutput()); // [No need to check value] - CsaInteract::GetInstance().StoreInternalErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC); continue); } @@ -2832,10 +2829,11 @@ void *DavinciModel::Run(DavinciModel *model) { model->iterator_count_++; model->is_first_execute_ = false; - GELOGI("run iterator count is %lu", model->iterator_count_); + // model run finished + model->SetRunningFlag(false); + GELOGI("run iterator count is %lu, model_id:%u", model->iterator_count_, model->model_id_); } - CsaInteract::GetInstance().WriteInternalErrorCode(); GELOGI("Model run end, model id:%u", model->model_id_); return nullptr; } @@ -2888,9 +2886,9 @@ Status DavinciModel::ModelRunStart() { int64_t maxDumpOpNum = std::strtol(opt.c_str(), nullptr, kDecimal); maxDumpOpNum_ = maxDumpOpNum; - error_context_ = ErrorManager::GetInstance().GetErrorContext(); + error_context_ = ErrorManager::GetInstance().GetErrorManagerContext(); CREATE_STD_THREAD(thread_id_, DavinciModel::Run, this); - GELOGI("model tread create success, model id:%u.", model_id_); + GELOGI("model thread create success, model id:%u.", model_id_); return SUCCESS; } @@ -4340,4 +4338,43 @@ Status DavinciModel::InitL1DataDumperArgs() { return SUCCESS; } +Status DavinciModel::SetRunAsyncListenerCallback(const RunAsyncCallback &callback) { + auto listener = dynamic_cast(listener_.get()); + GE_CHECK_NOTNULL(listener); + listener->SetCallback(callback); + return SUCCESS; +} + +void DavinciModel::UpdateOpIOAddrs(uint32_t task_id, uint32_t stream_id, const std::vector &io_addrs) { + if (fixed_mem_base_ == 
reinterpret_cast(mem_base_)) { + GELOGD("[Update][OpIOAddrs] No need to update op input output addr."); + return; + } + + OpDescInfo *op_desc_info = exception_dumper_.MutableOpDescInfo(task_id, stream_id); + if (op_desc_info == nullptr) { + GELOGW("[Update][OpIOAddrs] Find op desc failed, task_id: %u, stream_id: %u.", task_id, stream_id); + return; + } + size_t input_size = op_desc_info->input_addrs.size(); + size_t output_size = op_desc_info->output_addrs.size(); + if (input_size + output_size != io_addrs.size()) { + GELOGW("[Update][OpIOAddrs] Op[%s] input size[%zu] and output size[%zu] is not equal to io addr size[%zu]", + op_desc_info->op_name.c_str(), input_size, output_size, io_addrs.size()); + return; + } + + vector input_addrs; + vector output_addrs; + for (size_t i = 0; i < io_addrs.size(); i++) { + if (i < input_size) { + input_addrs.emplace_back(GetRunAddress(io_addrs[i])); + } else { + output_addrs.emplace_back(GetRunAddress(io_addrs[i])); + } + } + op_desc_info->input_addrs = input_addrs; + op_desc_info->output_addrs = output_addrs; + GELOGD("[Update][OpIOAddrs] Op [%s] update input output addr success.", op_desc_info->op_name.c_str()); +} } // namespace ge diff --git a/ge/graph/load/model_manager/davinci_model.h b/ge/graph/load/model_manager/davinci_model.h index 30240f25..736272f7 100755 --- a/ge/graph/load/model_manager/davinci_model.h +++ b/ge/graph/load/model_manager/davinci_model.h @@ -29,6 +29,7 @@ #include "common/helper/om_file_helper.h" #include "common/opskernel/ge_task_info.h" #include "common/properties_manager.h" +#include "common/dump/exception_dumper.h" #include "common/dump/opdebug_register.h" #include "common/types.h" #include "framework/common/util.h" @@ -221,6 +222,11 @@ class DavinciModel { /// DataInputer *const GetDataInputer() const { return data_inputer_; } + uint32_t GetDataInputerSize() { + GE_CHECK_NOTNULL(data_inputer_); + return data_inputer_->Size(); + } + // get Stream number uint32_t StreamNum() const { return runtime_param_.stream_num; } @@ -418,7 +424,7 @@ class DavinciModel { /// uint64_t GetSessionId() const { return session_id_; } - const struct ErrorMessage::Context &GetErrorContext() const { return error_context_; } + const struct error_message::Context &GetErrorContext() const { return error_context_; } /// /// @ingroup ge @@ -471,13 +477,17 @@ class DavinciModel { Status ReportProfilingData(); void SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op, uint32_t task_id, uint32_t stream_id) { - data_dumper_.SaveDumpOpInfo(model_param, op, task_id, stream_id); + exception_dumper_.SaveDumpOpInfo(model_param, op, task_id, stream_id); } void SaveDumpTask(uint32_t task_id, uint32_t stream_id, const shared_ptr &op_desc, uintptr_t args) { data_dumper_.SaveDumpTask(task_id, stream_id, op_desc, args); } + Status DumpExceptionInfo(const std::vector &exception_infos) const { + return exception_dumper_.DumpExceptionInfo(exception_infos); + } + void SetKnownShapeGlobalStep(void *global_step) { known_shape_global_step_ = global_step; } @@ -557,8 +567,13 @@ class DavinciModel { const DumpProperties &GetDumpProperties() const { return data_dumper_.GetDumpProperties(); } bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const { - return data_dumper_.GetOpDescInfo(stream_id, task_id, op_desc_info); + return exception_dumper_.GetOpDescInfo(stream_id, task_id, op_desc_info); } + void UpdateOpIOAddrs(uint32_t task_id, uint32_t stream_id, const std::vector &io_addrs); + + bool GetRunningFlag() const { return 
running_flg_; } + void SetRunningFlag(bool flag) { running_flg_ = flag; } + Status SetRunAsyncListenerCallback(const RunAsyncCallback &callback); private: // memory address of weights @@ -924,6 +939,8 @@ class DavinciModel { shared_ptr listener_; bool run_flg_; + // check whether model is running with data + bool running_flg_ = false; mutex mux_run_flg_; @@ -975,7 +992,7 @@ class DavinciModel { vector output_mbuf_list_; // output mbuf created by dequeue task. uint64_t session_id_; - struct ErrorMessage::Context error_context_; + struct error_message::Context error_context_; uint32_t device_id_; @@ -1001,6 +1018,7 @@ class DavinciModel { int64_t maxDumpOpNum_; // for data dump DataDumper data_dumper_; + ExceptionDumper exception_dumper_; OpdebugRegister opdebug_register_; uint64_t iterator_count_; bool is_l1_fusion_enable_; diff --git a/ge/graph/load/model_manager/model_manager.cc b/ge/graph/load/model_manager/model_manager.cc index 84259731..a288e14e 100755 --- a/ge/graph/load/model_manager/model_manager.cc +++ b/ge/graph/load/model_manager/model_manager.cc @@ -280,6 +280,7 @@ ModelManager::~ModelManager() { model_map_.clear(); model_aicpu_kernel_.clear(); cust_aicpu_so_.clear(); + dump_exception_flag_ = false; GE_IF_BOOL_EXEC(device_count > 0, GE_CHK_RT(rtDeviceReset(0))); } @@ -330,6 +331,7 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptrGetSubgraphInstanceNameToModel(); string om_name; @@ -339,11 +341,7 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptr davinci_model = MakeShared(0, listener); - if (davinci_model == nullptr) { - REPORT_CALL_ERROR("E19999", "New DavinciModel fail, model_id:%u", model_id); - GELOGE(FAILED, "davinci_model is nullptr"); - return FAILED; - } + GE_CHECK_NOTNULL(davinci_model); davinci_model->SetProfileTime(MODEL_LOAD_START, (timespec.tv_sec * kTimeSpecNano + timespec.tv_nsec)); // 1000 ^ 3 converts second to nanosecond davinci_model->SetId(model_id); @@ -363,7 +361,18 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptrAssign(ge_model)), GELOGW("assign model to modeldef failed."); break;); GE_TIMESTAMP_END(Assign, "GraphLoader::ModelAssign"); - + /// In multi-threaded inference, using the same session_id among multiple threads may cause some threads to fail. + /// These session_ids come from the same model, so the values of session_id are the same. + /// Update session_id for infer in load model to avoid the same session_id. 
+ if (!ge_root_model->GetTrainFlag()) { + uint64_t new_session_id; + ret = GenSessionId(new_session_id); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, "Generate session_id for infer failed."); + ret = davinci_model->UpdateSessionId(new_session_id); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ret, "Update session_id for infer failed."); + ge_model->InsertSessionMap(model_id, new_session_id); + GELOGD("Update new session id: %lu.", new_session_id); + } GE_TIMESTAMP_START(Init); GE_IF_BOOL_EXEC(SUCCESS != (ret = davinci_model->Init()), GELOGW("DavinciInit failed."); break;); GE_TIMESTAMP_END(Init, "GraphLoader::ModelInit"); @@ -376,16 +385,16 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptr &davinci_model) { - GE_CHK_BOOL_EXEC(davinci_model != nullptr, return, "davinci_model ptr is null, id: %u", id); +void ModelManager::InsertModel(uint32_t model_id, std::shared_ptr &davinci_model) { + GE_CHK_BOOL_EXEC(davinci_model != nullptr, return, "davinci_model ptr is null, id: %u", model_id); std::lock_guard lock(map_mutex_); - model_map_[id] = davinci_model; + model_map_[model_id] = davinci_model; } -void ModelManager::InsertModel(uint32_t id, shared_ptr &hybrid_model) { - GE_CHK_BOOL_EXEC(hybrid_model != nullptr, return, "hybrid_model ptr is null, id: %u", id); +void ModelManager::InsertModel(uint32_t model_id, shared_ptr &hybrid_model) { + GE_CHK_BOOL_EXEC(hybrid_model != nullptr, return, "hybrid_model ptr is null, id: %u", model_id); std::lock_guard lock(map_mutex_); - hybrid_model_map_[id] = hybrid_model; + hybrid_model_map_[model_id] = hybrid_model; } Status ModelManager::DeleteModel(uint32_t id) { @@ -1083,7 +1092,7 @@ Status ModelManager::GenSessionId(uint64_t &session_id) { mmTimeval tv; if (mmGetTimeOfDay(&tv, nullptr) != 0) { - REPORT_CALL_ERROR("E19999", "Call mmGetTimeOfDay fail"); + REPORT_CALL_ERROR("E19999", "Call mmGetTimeOfDay fail. 
errmsg:%s", strerror(errno)); GELOGE(INTERNAL_ERROR, "Failed to get current time."); return INTERNAL_ERROR; } @@ -1575,9 +1584,21 @@ Status ModelManager::GetOpDescInfo(uint32_t device_id, uint32_t stream_id, uint3 for (const auto &model : model_map_) { auto davinci_model = model.second; if (davinci_model->GetDeviceId() == device_id) { - GELOGI("Start to GetOpDescInfo of device_id: %u.", device_id); + GELOGI("[Get][OpDescInfo] Start to GetOpDescInfo of device_id: %u in davinci model.", device_id); if (davinci_model->GetOpDescInfo(stream_id, task_id, op_desc_info)) { - GELOGI("Find specific node of stream_id: %u, task_id: %u.", stream_id, task_id); + GELOGI("[Get][OpDescInfo] Find specific node of stream_id: %u, task_id: %u in davinci model.", + stream_id, task_id); + return SUCCESS; + } + } + } + for (const auto &model : hybrid_model_map_) { + auto hybrid_model = model.second; + if (hybrid_model->GetDeviceId() == device_id) { + GELOGI("[Get][OpDescInfo] Start to GetOpDescInfo of device_id: %u in hybrid model.", device_id); + if (hybrid_model->GetOpDescInfo(stream_id, task_id, op_desc_info)) { + GELOGI("[Get][OpDescInfo] Find specific node of stream_id: %u, task_id: %u in hybrid model.", + stream_id, task_id); return SUCCESS; } } @@ -1590,6 +1611,7 @@ Status ModelManager::EnableExceptionDump(const std::map &options if (iter != options.end()) { GELOGI("Find option enable_exeception_dump is %s", iter->second.c_str()); if (iter->second == "1") { + dump_exception_flag_ = true; rtError_t rt_ret = rtSetTaskFailCallback(reinterpret_cast(ExceptionCallback)); if (rt_ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtSetTaskFailCallback fail, ret = 0x%X", diff --git a/ge/graph/load/model_manager/model_manager.h b/ge/graph/load/model_manager/model_manager.h index b537943b..bf804d32 100755 --- a/ge/graph/load/model_manager/model_manager.h +++ b/ge/graph/load/model_manager/model_manager.h @@ -313,6 +313,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { instance->AddExceptionInfo(*rt_exception_info); } + bool IsDumpExceptionOpen() { return dump_exception_flag_; } private: /// /// @ingroup domi_ome @@ -330,8 +331,8 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { /// @ingroup domi_ome /// @brief insert new model into model manager set /// - void InsertModel(uint32_t id, std::shared_ptr &davinci_model); - void InsertModel(uint32_t id, std::shared_ptr &hybrid_model); + void InsertModel(uint32_t model_id, std::shared_ptr &davinci_model); + void InsertModel(uint32_t model_id, std::shared_ptr &hybrid_model); /// /// @ingroup domi_ome @@ -356,6 +357,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager { std::map> cust_aicpu_so_; static DumpProperties dump_properties_; + bool dump_exception_flag_ = false; }; } // namespace ge diff --git a/ge/graph/load/model_manager/model_utils.cc b/ge/graph/load/model_manager/model_utils.cc index 80bdec9b..058a538f 100755 --- a/ge/graph/load/model_manager/model_utils.cc +++ b/ge/graph/load/model_manager/model_utils.cc @@ -319,7 +319,7 @@ vector ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co const GeTensorDescPtr tensor_desc = op_desc->MutableInputDesc(static_cast(i)); GE_IF_BOOL_EXEC(tensor_desc == nullptr, GELOGD("Op: %s, Index: %zu, has no input", op_desc->GetName().c_str(), i); continue;) - if ((i < v_is_input_const.size()) && v_is_input_const[i] && (op_type != NETOUTPUT)) { + if ((i < v_is_input_const.size()) && v_is_input_const[i]) { // TBE: add weights address to input int64_t 
tensor_size = 0; GE_CHK_STATUS(TensorUtils::GetSize(*tensor_desc, tensor_size)); diff --git a/ge/graph/load/model_manager/task_info/kernel_ex_task_info.cc b/ge/graph/load/model_manager/task_info/kernel_ex_task_info.cc index de987d86..e2f600b3 100644 --- a/ge/graph/load/model_manager/task_info/kernel_ex_task_info.cc +++ b/ge/graph/load/model_manager/task_info/kernel_ex_task_info.cc @@ -357,6 +357,7 @@ void KernelExTaskInfo::SetIoAddrs(const OpDescPtr &op_desc) { Status KernelExTaskInfo::UpdateArgs() { GELOGI("KernelExTaskInfo::UpdateArgs in."); davinci_model_->SetTotalIOAddrs(io_addrs_); + davinci_model_->UpdateOpIOAddrs(task_id_, stream_id_, io_addrs_); GELOGI("KernelExTaskInfo::UpdateArgs success."); return SUCCESS; } diff --git a/ge/graph/load/model_manager/task_info/kernel_task_info.cc b/ge/graph/load/model_manager/task_info/kernel_task_info.cc index 4485515a..82c3e286 100755 --- a/ge/graph/load/model_manager/task_info/kernel_task_info.cc +++ b/ge/graph/load/model_manager/task_info/kernel_task_info.cc @@ -523,6 +523,7 @@ Status KernelTaskInfo::UpdateArgs() { return CopyNoncontinuousArgs(io_addr_offset_); } davinci_model_->SetTotalIOAddrs(io_addrs_); + davinci_model_->UpdateOpIOAddrs(task_id_, stream_id_, io_addrs_); } else if (kernel_type_ == ccKernelType::AI_CPU || kernel_type_ == ccKernelType::CUST_AI_CPU) { return CopyNoncontinuousArgs(sizeof(aicpu::AicpuParamHead)); } diff --git a/ge/graph/load/model_manager/task_info/super_kernel/super_kernel_factory.cc b/ge/graph/load/model_manager/task_info/super_kernel/super_kernel_factory.cc index 9ba62475..2a3e3a17 100644 --- a/ge/graph/load/model_manager/task_info/super_kernel/super_kernel_factory.cc +++ b/ge/graph/load/model_manager/task_info/super_kernel/super_kernel_factory.cc @@ -31,7 +31,9 @@ Status SuperKernelFactory::Init() { std::string skt_bin = "libcce_aicore.so"; handle_ = mmDlopen(skt_bin.c_str(), MMPA_RTLD_NOW | MMPA_RTLD_GLOBAL); if (handle_ == nullptr) { - GELOGE(FAILED, "SKT: open skt lib failed, please check LD_LIBRARY_PATH."); + const char* error = mmDlerror(); + GE_IF_BOOL_EXEC(error == nullptr, error = ""); + GELOGE(FAILED, "SKT: open skt lib failed, please check LD_LIBRARY_PATH. 
errmsg:%s", error); } rtError_t rt_ret; rt_ret = rtGetFunctionByName(this->sk_stub_name_.c_str(), &this->func_stub_); diff --git a/ge/graph/manager/graph_manager.cc b/ge/graph/manager/graph_manager.cc index f7357d9d..a1a7034b 100755 --- a/ge/graph/manager/graph_manager.cc +++ b/ge/graph/manager/graph_manager.cc @@ -101,7 +101,7 @@ #include "graph/utils/tensor_adapter.h" #include "inc/pass_manager.h" #include "init/gelib.h" -#include "ir_build/atc_ir_common.h" +#include "ir_build/option_utils.h" #include "graph/common/local_context.h" #include "graph/common/omg_util.h" #include "common/formats/utils/formats_trans_utils.h" @@ -121,6 +121,11 @@ const char *const kAIcoreEngine = "AIcoreEngine"; const int32_t kDynamicDimsTypeIsGetNext = 0; const int32_t kDynamicDimsTypeIsData = 1; const char *const kGetNextName = "IteratorV2"; +const uint32_t kInitGraphCount = 1; +const uint32_t kNotAdded = 0; +const uint32_t kStartAdd = 1; +const uint32_t kDoneAdded = 2; +const uint32_t kNeverLoaded = 0; bool IsTailingOptimization() { string is_tailing_optimization_option; @@ -160,7 +165,7 @@ GraphManager::GraphManager() } Status GraphManager::Initialize(const std::map &options) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOther); if (init_flag_) { GELOGW("[Initialize] GraphManager already initialized."); return SUCCESS; @@ -202,6 +207,8 @@ Status GraphManager::Initialize(const std::map &options) { graph_map_.clear(); cache_helper_map_.clear(); + graph_id_to_add_graph_cond_.clear(); + graph_count_.clear(); init_flag_ = true; thread_run_flag_ = true; @@ -211,6 +218,20 @@ Status GraphManager::Initialize(const std::map &options) { return SUCCESS; } +Status GraphManager::UnloadModel(GeRootModelPtr ge_root_model, uint32_t graph_id) { + Status ret = SUCCESS; + for (size_t i = 0; i < ge_root_model->GetAllModelId().size(); ++i) { + uint32_t model_id = ge_root_model->GetAllModelId()[i]; + GELOGI("Unload model %u.", model_id); + ret = GraphLoader::UnloadModel(model_id); + if (ret != SUCCESS) { + GELOGW("[GraphManager] unload model failed, modelId=%u, graphId=%u.", model_id, graph_id); + return ret; + } + } + return ret; +} + Status GraphManager::Finalize() { if (!init_flag_) { GELOGW("GraphManager has not been initialized."); @@ -241,7 +262,6 @@ Status GraphManager::Finalize() { unload_model_ret = GE_GRAPH_GRAPH_IS_RUNNING; continue; } - // unload model auto ge_root_model = graph_node->GetGeRootModel(); if (ge_root_model != nullptr && ge_root_model->GetModelId() != INVALID_MODEL_ID && graph_node->GetLoadFlag()) { @@ -251,15 +271,14 @@ Status GraphManager::Finalize() { unload_model_ret = FAILED; continue; } - ret = GraphLoader::UnloadModel(ge_root_model->GetModelId()); + ret = UnloadModel(ge_root_model, iter->first); if (ret != SUCCESS) { - GELOGW("[GraphManager] unload model failed, modelId=%u, graphId=%u.", ge_root_model->GetModelId(), iter->first); + GELOGW("[GraphManager] unload model failed, graph_id=%u.", iter->first); unload_model_ret = ret; } rt_ret = rtDeviceReset(GetContext().DeviceId()); if (rt_ret != RT_ERROR_NONE) { - GELOGW("[GraphManager] rtDeviceReset failed, modelId=%u, graphId=%u.", ge_root_model->GetModelId(), - iter->first); + GELOGW("[GraphManager] rtDeviceReset failed, graphId=%u.", iter->first); unload_model_ret = FAILED; continue; } @@ -274,6 +293,7 @@ Status GraphManager::Finalize() { } graph_map_.clear(); cache_helper_map_.clear(); + graph_count_.clear(); // graph 
context if (graph_context_ != nullptr) { @@ -326,35 +346,59 @@ Status GraphManager::InitDynamicParams(ComputeGraphPtr &compute_graph) { return SUCCESS; } -Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, - const std::map &options, - const OmgContext &omg_context) { - if (HasGraphNode(graph_id)) { - REPORT_INNER_ERROR("E19999", "graph_id:%u is exist, check invalid", graph_id); - GELOGE(GE_GRAPH_GRAPH_ALREADY_EXIST, "[GraphManager] graph exists, graph_id = %u.", graph_id); - return GE_GRAPH_GRAPH_ALREADY_EXIST; +void GraphManager::SetAddGraphCondition(GraphId graph_id, uint32_t cond) { + std::lock_guard lock(add_graph_cond_mutex_); + graph_id_to_add_graph_cond_[graph_id] = cond; + GELOGD("Graph [id:%u] has been added.", graph_id); +} + +uint32_t GraphManager::GetAddGraphCondition(GraphId graph_id) { + std::lock_guard lock(add_graph_cond_mutex_); + auto it = graph_id_to_add_graph_cond_.find(graph_id); + if (it != graph_id_to_add_graph_cond_.end()) { + return it->second; + } else { + GELOGD("Graph [id:%u] has not been added.", graph_id); + return kNotAdded; } +} - auto compute_graph = GraphUtils::GetComputeGraph(graph); - if (compute_graph != nullptr) { - compute_graph->SetGraphID(graph_id); - bool graph_has_been_added = false; - if (AttrUtils::GetBool(*compute_graph, ATTR_NAME_GRAPH_HAS_BEEN_ADDED, graph_has_been_added) - && graph_has_been_added) { - REPORT_INNER_ERROR("E19999", "Get Attr:%s from graph:%u fail", - ATTR_NAME_GRAPH_HAS_BEEN_ADDED.c_str(), graph_id); - GELOGE(GE_GRAPH_GRAPH_ALREADY_EXIST, - "[GraphManager] same graph object can not be added again, graph_id = %u.", graph_id); - return GE_GRAPH_GRAPH_ALREADY_EXIST; - } - (void)AttrUtils::SetBool(*compute_graph, ATTR_NAME_GRAPH_HAS_BEEN_ADDED, true); - compute_graph_ = compute_graph; +void GraphManager::RemoveAddGraphCondition(GraphId graph_id) { + std::lock_guard lock(add_graph_cond_mutex_); + auto it = graph_id_to_add_graph_cond_.find(graph_id); + if (it != graph_id_to_add_graph_cond_.end()) { + graph_id_to_add_graph_cond_.erase(it); + GELOGD("Successfully removed add_graph_cond of graph [id:%u].", graph_id); } else { - REPORT_INNER_ERROR("E19999", "compute_graph from graph:%u is nullptr, check invalid", - graph_id); - GELOGE(FAILED, "compute graph is null"); - return FAILED; + GELOGD("Graph [id:%u] has not been added. 
No need to remove.", graph_id); } +} + +Status GraphManager::CheckRepeatAdd(uint32_t graph_id, bool &is_added) { + uint32_t count = 0; + if (GetGraphCount(graph_id, count) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Get graph [id:%u] count failed, graph might not have been added.", graph_id); + return INTERNAL_ERROR; + } + // a previous thread that owns the same graph_id is still in the middle of the AddGraph process + if (count > 1 && GetAddGraphCondition(graph_id) == kStartAdd) { + std::unique_lock lock(add_graph_mutex_); + GELOGD("Waiting for build end of previous thread."); + while (GetAddGraphCondition(graph_id) != kDoneAdded) { + add_graph_cv_.wait(lock); + } + GraphNodePtr graph_node; + Status ret = GetGraphNode(graph_id, graph_node); + if (ret != SUCCESS) { + GELOGE(ret, "[AddGraph] GetGraphNode failed, graph_id = %u.", graph_id); + return ret; + } + is_added = true; + } + return SUCCESS; +} + +void GraphManager::SetSessionGraphId(ComputeGraphPtr compute_graph, uint32_t graph_id) { std::string session_graph_id; if (!AttrUtils::GetStr(*compute_graph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id) || session_graph_id.empty()) { session_graph_id = "-1_" + to_string(graph_id); @@ -366,7 +410,24 @@ Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, } GELOGD("Get graph session_graph_id attr failed, set session id to default value: [0]"); } +} + +Status GraphManager::NotifyWaittingGraph(uint32_t graph_id) { + uint32_t count = 0; + if (GetGraphCount(graph_id, count) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Get graph [id:%u] count failed, graph might not have been added.", graph_id); + return INTERNAL_ERROR; + } + GELOGD("Add graph finished, graph_id:%u", graph_id); + if (count > 1) { + GELOGD("Finish AddGraph, graph_id:%u, graph_count:%u, start to notify.", graph_id, count); + add_graph_cv_.notify_all(); + } + return SUCCESS; +} +Status GraphManager::CreateGraphNode(uint32_t graph_id, const Graph &graph, + const std::map &options) { GraphNodePtr graph_node = MakeShared(graph_id); GE_IF_BOOL_EXEC(graph_node == nullptr, REPORT_CALL_ERROR("E19999", "New GraphNode fail, graph_id:%u", @@ -385,7 +446,62 @@ Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, ParseOption(options, TUNING_PATH, options_.tuning_path); graph_node->SetGraph(graph_ptr); graph_node->SetOptions(options); + graph_node->IncreaseLoadCount(); AddGraphNode(graph_id, graph_node); + return SUCCESS; +} + +Status GraphManager::SetStagesOptions(uint32_t graph_id, const GraphManagerOptions &options) { + CompilerStages &stages = GetCompilerStages(graph_id); + stages.preparer.SetOptions(options_); + Status status = stages.optimizer.SetOptions(options_); + if (status != SUCCESS) { + GELOGE(status, "Graph optimizer set options failed."); + return status; + } + stages.builder.SetOptions(options_); + return SUCCESS; +} + +Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, + const std::map &options, + const OmgContext &omg_context) { + IncreaseGraphCount(graph_id); + // validation for adding graphs with the same graph_id in a multi-thread scenario + // 1. a previous thread that owns the same graph_id has finished the AddGraph process + if (GetAddGraphCondition(graph_id) == kDoneAdded) { + GraphNodePtr graph_node; + if (GetGraphNode(graph_id, graph_node) != SUCCESS) { + GELOGE(GE_GRAPH_GRAPH_NOT_EXIST, "Graph does not exist even though it was added previously, graph_id = %u.", graph_id); + return GE_GRAPH_GRAPH_NOT_EXIST; + } + graph_node->IncreaseLoadCount(); + return SUCCESS; + } + // In multi-thread 
scenario, a former thread that owns the same graph_id may still be + // in the middle of the AddGraph process, so the following threads have to wait until + // the former thread is done adding the graph, avoiding repeatedly adding the same graph. + bool is_added = false; + if (CheckRepeatAdd(graph_id, is_added) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "CheckRepeatAdd for graph[id:%u] failed.", graph_id); + return INTERNAL_ERROR; + } + // The same graph (added by a former thread) has already been added successfully. + if (is_added) { + return SUCCESS; + } + // Do add graph + SetAddGraphCondition(graph_id, kStartAdd); + auto compute_graph = GraphUtils::GetComputeGraph(graph); + GE_CHECK_NOTNULL(compute_graph); + compute_graph->SetGraphID(graph_id); + (void)AttrUtils::SetBool(*compute_graph, ATTR_NAME_GRAPH_HAS_BEEN_ADDED, true); + SetSessionGraphId(compute_graph, graph_id); + + if (CreateGraphNode(graph_id, graph, options) != SUCCESS) { + GELOGE(FAILED, "Failed to create graph_node."); + return FAILED; + } AddLocalOmgContext(graph_id, omg_context); if (!options_.output_datatype.empty()) { @@ -396,27 +512,22 @@ Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, return GRAPH_PARAM_INVALID; } - CompilerStages &stages = GetCompilerStages(graph_id); - stages.preparer.SetOptions(options_); - Status status = stages.optimizer.SetOptions(options_); - if (status != SUCCESS) { - GELOGE(status, "Graph optimizer set options failed."); - return status; + if (SetStagesOptions(graph_id, options_) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Set stage options failed."); + return INTERNAL_ERROR; } - stages.builder.SetOptions(options_); var_acc_ctrl_.AddGraph(graph_id, compute_graph); + SetAddGraphCondition(graph_id, kDoneAdded); + // There are threads waiting for this graph to be added + if (NotifyWaittingGraph(graph_id) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "NotifyWaittingGraph failed."); + return INTERNAL_ERROR; + } return SUCCESS; } -Status GraphManager::AddGraphWithCopy(const GraphId &graph_id, const Graph &graph, - const std::map &options, - const OmgContext &omg_context) { - if (HasGraphNode(graph_id)) { - REPORT_INNER_ERROR("E19999", "graph_id:%u is exist, check invalid", graph_id); - GELOGE(GE_GRAPH_GRAPH_ALREADY_EXIST, "[GraphManager] graph exists, graph_id = %u.", graph_id); - return GE_GRAPH_GRAPH_ALREADY_EXIST; - } +Status GraphManager::CheckGraphAdded(const GraphId &graph_id, const Graph &graph) { auto compute_graph = GraphUtils::GetComputeGraph(graph); if (compute_graph != nullptr) { compute_graph->SetGraphID(graph_id); @@ -435,58 +546,44 @@ Status GraphManager::AddGraphWithCopy(const GraphId &graph_id, const Graph &grap GELOGE(FAILED, "compute graph is null"); return FAILED; } - std::vector input_nodes; - std::vector output_nodes; - auto new_compute_graph = GraphUtils::CloneGraph(compute_graph, "", input_nodes, output_nodes); - std::string session_graph_id; - if (!AttrUtils::GetStr(*new_compute_graph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id) || - session_graph_id.empty()) { - session_graph_id = "-1_" + to_string(graph_id); - if (!AttrUtils::SetStr(*new_compute_graph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id)) { - GELOGW("Set attribute of compute graph failed."); - } - for (auto &subgraph : new_compute_graph->GetAllSubgraphs()) { - (void)AttrUtils::SetStr(*subgraph, ATTR_NAME_SESSION_GRAPH_ID, session_graph_id); - } - GELOGD("Get graph session_graph_id attr failed, set session id to default value: [0]"); - } + return SUCCESS; +} - GraphNodePtr graph_node = MakeShared(graph_id); - if 
(graph_node == nullptr) { - REPORT_CALL_ERROR("E19999", "New GraphNode fail, graph_id:%u", - graph_id); - GELOGE(FAILED, "GraphNode make shared failed"); +Status GraphManager::AddGraphWithCopy(const GraphId &graph_id, const Graph &graph, + const std::map &options, + const OmgContext &omg_context) { + if (CheckGraphAdded(graph_id, graph) != SUCCESS) { + GELOGE(FAILED, "AddGraphWithCopy failed."); return FAILED; } - std::shared_ptr graph_ptr = GraphUtils::CreateGraphPtrFromComputeGraph(new_compute_graph); - if (graph_ptr == nullptr) { - REPORT_CALL_ERROR("E19999", "New Graph fail, graph_id:%u", - graph_id); - GELOGE(FAILED, "GraphPtr make shared failed"); + IncreaseGraphCount(graph_id); + // Do add graph + auto compute_graph = GraphUtils::GetComputeGraph(graph); + std::vector input_nodes; + std::vector output_nodes; + auto new_compute_graph = GraphUtils::CloneGraph(compute_graph, "", input_nodes, output_nodes); + GE_CHECK_NOTNULL(new_compute_graph); + new_compute_graph->SetGraphID(graph_id); + SetSessionGraphId(new_compute_graph, graph_id); + std::shared_ptr new_graph_ptr = GraphUtils::CreateGraphPtrFromComputeGraph(new_compute_graph); + if (CreateGraphNode(graph_id, *new_graph_ptr, options) != SUCCESS) { + GELOGE(FAILED, "Failed to create graph_node."); return FAILED; } - // update option about tuning graph - ParseOption(options, BUILD_MODE, options_.build_mode); - ParseOption(options, BUILD_STEP, options_.build_step); - ParseOption(options, TUNING_PATH, options_.tuning_path); - - graph_node->SetGraph(graph_ptr); - graph_node->SetOptions(options); - AddGraphNode(graph_id, graph_node); AddLocalOmgContext(graph_id, omg_context); if (!options_.output_datatype.empty()) { GetLocalOmgContext().output_type = options_.output_datatype; } + if (InitDynamicParams(new_compute_graph) != SUCCESS) { + GELOGE(GRAPH_PARAM_INVALID, "Failed to init params when online infer is dynamic."); + return GRAPH_PARAM_INVALID; + } - CompilerStages &stages = GetCompilerStages(graph_id); - stages.preparer.SetOptions(options_); - Status status = stages.optimizer.SetOptions(options_); - if (status != SUCCESS) { - GELOGE(status, "Graph optimizer set options failed."); - return status; + if (SetStagesOptions(graph_id, options_) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Set stage options failed."); + return INTERNAL_ERROR; } - stages.builder.SetOptions(options_); var_acc_ctrl_.AddGraph(graph_id, new_compute_graph); return SUCCESS; @@ -586,7 +683,7 @@ Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_gr std::future f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this, compute_graph->GetGraphID(), subgraph, compute_graph->GetName(), session_id, - ErrorManager::GetInstance().GetErrorContext(), + ErrorManager::GetInstance().GetErrorManagerContext(), GetThreadLocalContext()); if (!f.valid()) { GELOGE(FAILED, "Future is invalid"); @@ -603,7 +700,7 @@ Status GraphManager::OptimizeSubGraphWithMultiThreads(ComputeGraphPtr compute_gr std::future f = executor.commit(GraphManager::ProcessSubGraphWithMultiThreads, this, compute_graph->GetGraphID(), subgraph, compute_graph->GetName(), session_id, - ErrorManager::GetInstance().GetErrorContext(), + ErrorManager::GetInstance().GetErrorManagerContext(), GetThreadLocalContext()); if (!f.valid()) { GELOGE(FAILED, "Future is invalid"); @@ -715,7 +812,7 @@ Status GraphManager::SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_gr Status GraphManager::PreRunOptimizeOriginalGraph(const GraphNodePtr &graph_node, const std::vector &inputs, 
ge::ComputeGraphPtr &compute_graph, uint64_t session_id) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kPrepareOptimize); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kPrepareOptimize); GE_CHECK_NOTNULL(graph_node); GE_CHECK_NOTNULL(compute_graph); @@ -724,10 +821,10 @@ Status GraphManager::PreRunOptimizeOriginalGraph(const GraphNodePtr &graph_node, GM_RUN_AND_DUMP_PERF("HandleSummaryOp", stages.optimizer.HandleSummaryOp, compute_graph); GM_RUN_AND_DUMP_PERF("Prepare", stages.preparer.PrepareDynShape, graph_node, inputs, compute_graph, session_id); - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOriginOptimize); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOriginOptimize); GM_RUN_AND_DUMP_PERF("OptimizeOriginalGraph", stages.optimizer.OptimizeOriginalGraph, compute_graph); - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kPrepareOptimize); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kPrepareOptimize); GM_RUN_AND_DUMP_PERF("PrepareRunningFormatRefiner", stages.preparer.PrepareRunningFormatRefiner); GM_RUN_AND_DUMP_PERF("RefineRunningFormat", stages.optimizer.OptimizeOriginalGraphJudgeInsert, compute_graph); GM_RUN_AND_DUMP_PERF("SubexpressionMigration", SubexpressionMigration, compute_graph); @@ -770,7 +867,7 @@ Status GraphManager::PreRunAfterOptimizeSubGraph(const GraphNodePtr &graph_node, GE_CHECK_NOTNULL(graph_node); GE_CHECK_NOTNULL(compute_graph); - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kMergeGraphOptimize); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kMergeGraphOptimize); CompilerStages &stages = GetCompilerStages(graph_node->GetGraphId()); GM_RUN_AND_DUMP_PERF("OptimizeWholeGraph", stages.optimizer.OptimizeWholeGraph, compute_graph); GM_RUN_AND_DUMP_PERF("Optimize2", OptimizeStage2, compute_graph); @@ -864,7 +961,7 @@ Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vectorGetGraphId(), compute_graph, ge_model); @@ -936,7 +1033,7 @@ Status GraphManager::StartForRunGraph(const GraphNodePtr &graph_node, const std: // it will not execute graph prreprocess, optimize, parition, build if the graph has built successful. 
Status ret = SUCCESS; if (IsGraphNeedBuild(graph_node)) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); if (graph_node->GetBuildFlag()) { REPORT_INNER_ERROR("E19999", "Graph:%u has not build before, can't run directly, " "check invalid", graph_node->GetGraphId()); @@ -958,7 +1055,7 @@ Status GraphManager::StartForRunGraph(const GraphNodePtr &graph_node, const std: return ret; } } - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelLoad, ErrorMessage::kModelLoad); + ErrorManager::GetInstance().SetStage(error_message::kModelLoad, error_message::kModelLoad); if (!graph_node->IsAsync()) { ret = LoadGraph(ge_root_model, graph_node); } else { @@ -971,7 +1068,7 @@ Status GraphManager::StartForRunGraph(const GraphNodePtr &graph_node, const std: graph_node->SetBuildFlag(true); var_acc_ctrl_.SetGraphBuildEnd(graph_node->GetGraphId()); } else if (!graph_node->GetLoadFlag()) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelLoad, ErrorMessage::kModelLoad); + ErrorManager::GetInstance().SetStage(error_message::kModelLoad, error_message::kModelLoad); GeRootModelPtr ge_root_model_ptr = graph_node->GetGeRootModel(); if (!graph_node->IsAsync()) { ret = LoadGraph(ge_root_model_ptr, graph_node); @@ -988,6 +1085,7 @@ Status GraphManager::StartForRunGraph(const GraphNodePtr &graph_node, const std: Status GraphManager::LoadGraph(const GeRootModelPtr &ge_root_model, const GraphNodePtr &graph_node) { GELOGI("[LoadGraph] run_graph_flag[%d], graph_id[%u]", options_.run_graph_flag, graph_node->GetGraphId()); if (options_.run_graph_flag && ge_root_model != nullptr) { + ge_root_model->SetTrainFlag(GetTrainFlag()); // synchronization run graph with model std::shared_ptr model_listener = GetModelListener(); ModelIdInfo model_id_info; @@ -1129,7 +1227,7 @@ Status GraphManager::InnerRunGraph(GraphNodePtr &graph_node, const GraphId &grap Status GraphManager::RunGraph(const GraphId &graph_id, const std::vector &inputs, std::vector &outputs, uint64_t session_id) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); std::lock_guard lock(run_mutex_); GELOGI("[RunGraph] start to run graph, graph_id = %u, is_train_graph: %d", graph_id, GetTrainFlag()); @@ -1190,7 +1288,7 @@ Status GraphManager::RunGraph(const GraphId &graph_id, const std::vector &inputs, GeRootModelPtr &ge_root_model, uint64_t session_id, bool async) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); GELOGD("[BuildGraph] start to build graph, graph_id:%u", graph_id); if (inputs.empty()) { GELOGW("[BuildGraph] BuildGraph warning: empty GeTensor inputs"); @@ -1413,62 +1511,29 @@ bool GraphManager::CheckModelLoad(const GeRootModelPtr &ge_root_model, bool load } Status GraphManager::RemoveGraph(const GraphId &graph_id) { + auto it = to_be_deleted_graphs_.find(graph_id); + if (it != to_be_deleted_graphs_.end()) { + to_be_deleted_graphs_.erase(it); + } GraphNodePtr graph_node = nullptr; Status ret = GetGraphNode(graph_id, graph_node); - if (ret != SUCCESS) { - REPORT_INNER_ERROR("E19999", "Graph:%u not exist in graph_map, check invalid", - graph_id); + if (ret != SUCCESS || graph_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Graph:%u not exist in 
graph_map, check invalid when GraphManager %s", + graph_id, __FUNCTION__); GELOGE(GE_GRAPH_GRAPH_NOT_EXIST, "[GraphManager] Id %u does not exists.", graph_id); return GE_GRAPH_GRAPH_NOT_EXIST; } - - if ((graph_node == nullptr) || (graph_node->GetRunFlag())) { - REPORT_INNER_ERROR("E19999", "Graph:%u is running, can't be remove, check invalid", - graph_id); - GELOGE(GE_GRAPH_GRAPH_IS_RUNNING, "[GraphManager] Id %u is running, can't be deleted.", graph_id); - return GE_GRAPH_GRAPH_IS_RUNNING; + if (graph_node->GetRunFlag()) { + // only put graph into to-be-deleted list when exceptional scenario + to_be_deleted_graphs_.insert(graph_id); + GELOGI("[GraphManager] Trying to remove running graph[Id:%u], added into to_be_deleted_graphs_.", graph_id); + return SUCCESS; } std::lock_guard lock(unload_model_mutex_); Status middle_ret; rtError_t rt_ret; - const std::vector &all_sub_graph = graph_node->GetAllSubGraph(); - for (size_t i = 0; i < all_sub_graph.size(); ++i) { - // must free buffer firstly - middle_ret = all_sub_graph[i]->FreeInOutBuffer(); - if (middle_ret != SUCCESS) { - GELOGE(middle_ret, "[GraphManager] RemoveGraph free mem failed, graph_id=%u.", graph_id); - ret = middle_ret; - } - if (all_sub_graph[i]->GeModelIsValid() && all_sub_graph[i]->GetModelIdInfo().model_id != INVALID_MODEL_ID) { - // unload model - GELOGI("UnloadModel via new ome."); - rt_ret = rtSetDevice(GetContext().DeviceId()); - if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, graph_id:%u", - GetContext().DeviceId(), graph_id); - GELOGE(RT_FAILED, "[GraphManager:] rtSetDevice failed, modelId=%u, graphId=%u.", - all_sub_graph[i]->GetModelIdInfo().model_id, graph_id); - ret = FAILED; - continue; - } - middle_ret = GraphLoader::UnloadModel(all_sub_graph[i]->GetModelIdInfo().model_id); - if (middle_ret != SUCCESS) { - GELOGE(middle_ret, "[GraphManager:] unload model failed, modelId=%u, graph_id=%u.", - all_sub_graph[i]->GetModelIdInfo().model_id, graph_id); - ret = middle_ret; - } - rt_ret = rtDeviceReset(GetContext().DeviceId()); - if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtDeviceReset fail, device_id:%u, graph_id:%u", - GetContext().DeviceId(), graph_id); - GELOGE(RT_FAILED, "[GraphManager:] unload model failed, modelId=%u, graphId=%u.", - all_sub_graph[i]->GetModelIdInfo().model_id, graph_id); - ret = FAILED; - } - } - } var_acc_ctrl_.RemoveGraph(graph_id); RemoveGraphNode(graph_id); @@ -1476,7 +1541,6 @@ Status GraphManager::RemoveGraph(const GraphId &graph_id) { auto ge_root_model = graph_node->GetGeRootModel(); if (CheckModelLoad(ge_root_model, graph_node->GetLoadFlag())) { - GELOGI("Unload model %u.", ge_root_model->GetModelId()); rt_ret = rtSetDevice(GetContext().DeviceId()); if (rt_ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, graph_id:%u", @@ -1485,23 +1549,27 @@ Status GraphManager::RemoveGraph(const GraphId &graph_id) { graph_id); return FAILED; } - middle_ret = GraphLoader::UnloadModel(ge_root_model->GetModelId()); + // same graph may be added for several times, different models were created separately, + // unload them respectively. 
+ middle_ret = UnloadModel(ge_root_model, graph_id); if (middle_ret != SUCCESS) { - GELOGE(middle_ret, "[GraphManager:] unload model failed, modelId=%u, graph_id=%u.", ge_root_model->GetModelId(), - graph_id); + REPORT_INNER_ERROR("E19999", "UnloadModel for graph:%u failed, check unload detail in GraphLoader %s", + graph_id, __FUNCTION__); + GELOGE(middle_ret, "[GraphManager:] unload model failed, graph_id=%u.", graph_id); ret = middle_ret; } rt_ret = rtDeviceReset(GetContext().DeviceId()); if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, graph_id:%u", - GetContext().DeviceId(), graph_id); - GELOGE(RT_FAILED, "[GraphManager:] rtDeviceReset failed, modelId=%u, graphId=%u.", ge_root_model->GetModelId(), - graph_id); + REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, graph_id:%u, when GraphManager %s", + GetContext().DeviceId(), graph_id, __FUNCTION__); + GELOGE(RT_FAILED, "[GraphManager:] rtDeviceReset failed, graphId=%u.", graph_id); ret = FAILED; } } RemoveCompilerStages(graph_id); + RemoveGraphCount(graph_id); + RemoveAddGraphCondition(graph_id); GE_CHK_STATUS_RET(ret, "[GraphManager:] Remove graph failed, graph_id=%u.", graph_id); GELOGI("[GraphManager] remove graph success, graph_id=%u.", graph_id); @@ -2588,6 +2656,7 @@ void GraphManager::ChangeConstTypeWhenTraining(const ComputeGraphPtr &compute_gr Status GraphManager::LoadGraphAsync(const GeRootModelPtr &ge_root_model, const GraphNodePtr &graph_node) { GELOGI("[LoadGraphAsync] run_graph_flag[%d], graph_id[%u]", options_.run_graph_flag, graph_node->GetGraphId()); if (options_.run_graph_flag && ge_root_model != nullptr) { + ge_root_model->SetTrainFlag(GetTrainFlag()); // synchronization run graph with model ModelIdInfo model_id_info; bool is_unknown_shape = false; @@ -2604,9 +2673,9 @@ Status GraphManager::LoadGraphAsync(const GeRootModelPtr &ge_root_model, const G } } GE_TIMESTAMP_START(LoadGraph); - GE_CHECK_NOTNULL(graph_node->graph_run_async_listener_); - Status ret = - GraphLoader::LoadModelOnline(model_id_info.model_id, ge_root_model, graph_node->graph_run_async_listener_); + auto listener = MakeShared(); + GE_CHECK_NOTNULL(listener); + Status ret = GraphLoader::LoadModelOnline(model_id_info.model_id, ge_root_model, listener); GE_TIMESTAMP_EVENT_END(LoadGraph, "GraphManager::LoadGraphAsync"); if (ret != SUCCESS) { GELOGE(ret, "[LoadGraphAsync] LoadGraphAsync Failed"); @@ -2620,6 +2689,61 @@ Status GraphManager::LoadGraphAsync(const GeRootModelPtr &ge_root_model, const G return SUCCESS; } +void GraphManager::ReleaseMemory(const GeModelPtr &ge_model, GraphNodePtr &graph_node, + const std::vector &model_ids, uint32_t graph_id, uint64_t session_id) { + rtError_t rt_ret = rtSetDevice(GetContext().DeviceId()); + if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, when GraphManager %s", + GetContext().DeviceId(), __FUNCTION__); + GELOGE(RT_FAILED, "[GraphManager:] rtSetDevice failed, graphId=%u.", graph_id); + return; + } + for (auto model_id : model_ids) { + uint64_t max_memory_size = 0; + Status result = GraphLoader::GetMaxUsedMemory(model_id, max_memory_size); + if (result != SUCCESS) { + continue; + } + GELOGI("CheckAndReleaseMemory try to UnloadGraph[%u], model[%u] which MaxUsedMemory[%lu].", graph_id, model_id, + max_memory_size); + if (model_ids.size() > 1) { + result = ge_model->GetSessionId(model_id, session_id); + if (result != SUCCESS) { + GELOGW("[GraphManager:] get session failed when dynamic memory, 
modelId=%u, graphId=%u.", model_id, + graph_id); + continue; + } + } + result = GraphLoader::DestroyAicpuKernel(session_id, model_id, 0); + if (result != SUCCESS) { + GELOGW("[GraphManager:] destroy aicpu kernel failed when dynamic memory, modelId=%u, graphId=%u.", model_id, + graph_id); + } + result = GraphLoader::UnloadModel(model_id); + if (result != SUCCESS) { + GELOGW("[GraphManager:] unload model failed, modelId=%u, graphId=%u.", model_id, graph_id); + } + GELOGI("CheckAndReleaseMemory UnloadGraph[%u], model[%u] success.", graph_id, model_id); + } + graph_node->SetLoadFlag(false); + // Allow the model to be loaded again without adding the graph again + graph_node->SetLoadCount(graph_node->GetLoadRecord()); + graph_node->SetLoadRecord(kNeverLoaded); + GeRootModelPtr ge_root_model = graph_node->GetGeRootModel(); + if (ge_root_model == nullptr) { + GELOGW("ge_root_model is null, graph_id:%u", graph_id); + return; + } + ge_root_model->ClearAllModelId(); + rt_ret = rtDeviceReset(GetContext().DeviceId()); + if (rt_ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, when GraphManager %s", + GetContext().DeviceId(), __FUNCTION__); + GELOGE(RT_FAILED, "[GraphManager:] rtDeviceReset failed, graphId=%u.", graph_id); + return; + } +} + Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const GraphNodePtr &graph_node) { GELOGI("CheckAndReleaseMemory graph_id[%u]", graph_node->GetGraphId()); int64_t value = 0; @@ -2665,6 +2789,7 @@ Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const Gra continue; } auto model_id = model->GetModelId(); + auto model_ids = model->GetAllModelId(); // unload model not release bool is_unknown_shape = false; GE_CHK_STATUS_RET(model->CheckIsUnknownShape(is_unknown_shape)); @@ -2677,38 +2802,7 @@ Status GraphManager::CheckAndReleaseMemory(const GeModelPtr &ge_model, const Gra GELOGI("CheckAndReleaseMemory graph[%u] has not been loaded.", graph_id); continue; } - uint64_t max_memory_size = 0; - result = GraphLoader::GetMaxUsedMemory(model_id, max_memory_size); - if (result != SUCCESS) { - continue; - } - GELOGI("CheckAndReleaseMemory try to UnloadGraph[%u], model[%u] which MaxUsedMemory[%lu].", graph_id, model_id, - max_memory_size); - rtError_t rt_ret = rtSetDevice(GetContext().DeviceId()); - if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u", - GetContext().DeviceId()); - GELOGE(RT_FAILED, "[GraphManager:] rtSetDevice failed, modelId=%u, graphId=%u.", model_id, graph_id); - continue; - } - result = GraphLoader::DestroyAicpuKernel(session_id, model_id, 0); - if (result != SUCCESS) { - GELOGW("[GraphManager:] destroy aicpu kernel failed when dynamic memory, modelId=%u, graphId=%u.", model_id, - graph_id); - } - result = GraphLoader::UnloadModel(model_id); - if (result != SUCCESS) { - GELOGW("[GraphManager:] unload model failed, modelId=%u, graphId=%u.", model_id, graph_id); - } - rt_ret = rtDeviceReset(GetContext().DeviceId()); - if (rt_ret != RT_ERROR_NONE) { - REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u", - GetContext().DeviceId()); - GELOGE(RT_FAILED, "[GraphManager:] rtDeviceReset failed, modelId=%u, graphId=%u.", model_id, graph_id); - continue; - } - it.second->SetLoadFlag(false); - GELOGI("CheckAndReleaseMemory UnloadGraph[%u], model[%u] success and set LoadFlag to false.", graph_id, model_id); + ReleaseMemory(ge_model, it.second, model_ids, graph_id, session_id); } return SUCCESS; @@ -2718,10 +2812,10 @@ Status
GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager const SubGraphInfoPtr &sub_graph_info_ptr, const std::string &root_graph_name, uint64_t session_id, - const struct ErrorMessage::Context &error_context, + const struct error_message::Context &error_context, const GEThreadLocalContext &ge_context) { + ErrorManager::GetInstance().SetErrorContext(error_context); if (sub_graph_info_ptr != nullptr && graph_manager != nullptr) { - ErrorManager::GetInstance().SetErrorContext(error_context); GetContext().SetSessionId(session_id); GetThreadLocalContext() = ge_context; graph_manager->UpdateLocalOmgContext(root_graph_id); @@ -2771,11 +2865,11 @@ Status GraphManager::ProcessSubGraphWithMultiThreads(GraphManager *graph_manager // run graph async on session Status GraphManager::RunGraphAsync(const GraphId &graph_id, const std::vector &inputs, uint64_t session_id, RunAsyncCallback callback) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelExecute, ErrorMessage::kModelExecute); + ErrorManager::GetInstance().SetStage(error_message::kModelExecute, error_message::kModelExecute); GELOGI("[GraphManager] Start to run graph async, graph_id=%u, inputsSize=%zu.", graph_id, inputs.size()); bool ret = prerun_args_q_.Push(PreRunArgs({graph_id, inputs, session_id, - ErrorManager::GetInstance().GetErrorContext(), + ErrorManager::GetInstance().GetErrorManagerContext(), GetThreadLocalContext(), callback})); if (!ret) { GELOGE(FAILED, "[GraphManager] Run graph async failed, graph_id=%u.", graph_id); @@ -2849,6 +2943,38 @@ void GraphManager::ConstructGeInput(const vector &inputs, vecto } } +Status GraphManager::CheckIncreBuildAndPreRun(GraphManager *graph_manager, const PreRunArgs &args, + GraphNodePtr &graph_node, GeRootModelPtr &ge_root_model) { + if (!graph_manager->IsGraphNeedBuild(graph_node)) { + ge_root_model = graph_node->GetGeRootModel(); + return SUCCESS; + } + if (graph_node->GetBuildFlag()) { + ReturnError(graph_manager, args.callback, PARAM_INVALID, + "The graph " + std::to_string(graph_node->GetGraphId()) + + " need to re-build, you should remove it" + " from GE first, then AddGraph again and rebuild it."); + graph_node->Unlock(); + return PARAM_INVALID; + } + // check need incre build. 
+ GeModelPtr ge_model = nullptr; + if (graph_manager->IncreBuild(graph_node, ge_model) != SUCCESS) { + std::vector ge_inputs; + ConstructGeInput(args.input_tensor, ge_inputs); + Status ret = graph_manager->PreRun(graph_node, ge_inputs, ge_root_model, args.session_id); + // release rts generate context + RtContextUtil::GetInstance().DestroyRtContexts(args.session_id, graph_node->GetGraphId()); + if (ret != SUCCESS) { + ReturnError(graph_manager, args.callback, ret, "PreRun Failed."); + return ret; + } + } + graph_node->SetBuildFlag(true); + graph_manager->var_acc_ctrl_.SetGraphBuildEnd(graph_node->GetGraphId()); + return SUCCESS; +} + void GraphManager::PreRunThread(GraphManager *graph_manager) { if (prctl(PR_SET_NAME, ("GE_PreRun")) != 0) { GELOGW("Set thread name failed."); @@ -2861,10 +2987,10 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { continue; } - GELOGI("A new loop start."); + GELOGI("[PreRunThread] A new loop start, graph_id:%u.", args.graph_id); ErrorManager::GetInstance().SetErrorContext(args.error_context); - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); GetContext().SetSessionId(args.session_id); GetThreadLocalContext() = args.context; graph_manager->UpdateLocalOmgContext(args.graph_id); @@ -2877,7 +3003,24 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { "[RunGraph] graph not exist, graph_id=" + std::to_string(args.graph_id)); return; } - + // more than one graph may own the same graph_id + uint32_t count = 0; + if (graph_manager->GetGraphCount(args.graph_id, count) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Get graph [id:%u] count failed.", args.graph_id); + return; + } + // Avoid repeated prerun for graphs that own the same graph_id in online inference concurrency + if (count > 1 && graph_node->GetBuildFlag()) { + graph_node->Lock(); + GELOGD("Avoid repeated prerun, graph_id:%u.", args.graph_id); + // In the online inference concurrency scenario, graph_node is allowed to be locked 'count' times + graph_node->SetSemSize(count); + graph_manager->run_args_q_.Push(RunArgs( { graph_node, args.graph_id, args.session_id, args.error_context, + args.input_tensor, graph_node->GetGeRootModel(), GetThreadLocalContext(), args.callback })); + GELOGI("[PreRunThread] Loop end. Start to run with cached built model."); + continue; + } + // Cannot be moved ahead of the repeated-prerun judgement above graph_node->Lock(); if (graph_node->GetRunFlag()) { @@ -2909,46 +3052,24 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { // it will not execute graph preprocess, optimize, parition, build if the graph has built successful.
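A note on the "locked 'count' times" comment above: GraphNode::SetSemSize (added in this patch) raises the capacity of the node's sem_ BlockingQueue so that up to 'count' concurrent RunGraphAsync callers can hold the node at once. As an illustration only of that counting-semaphore idea, built from standard C++ primitives rather than GE's BlockingQueue, the behaviour can be modelled like this:

#include <condition_variable>
#include <cstdint>
#include <mutex>

class CountingSem {
 public:
  explicit CountingSem(uint32_t max_size) : max_size_(max_size) {}

  // counterpart of GraphNode::SetSemSize: widen the limit before queuing concurrent runs
  void SetMaxSize(uint32_t size) {
    std::lock_guard<std::mutex> lk(mu_);
    max_size_ = size;
    cv_.notify_all();
  }

  // counterpart of GraphNode::Lock: blocks once max_size_ holders are inside
  void Lock() {
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return held_ < max_size_; });
    ++held_;
  }

  // counterpart of GraphNode::Unlock: releases one holder and wakes a waiter
  void Unlock() {
    std::lock_guard<std::mutex> lk(mu_);
    if (held_ > 0) {
      --held_;
    }
    cv_.notify_one();
  }

 private:
  uint32_t max_size_;
  uint32_t held_ = 0;
  std::mutex mu_;
  std::condition_variable cv_;
};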
GELOGI("Start for run graph async."); GeRootModelPtr ge_root_model = nullptr; - if (graph_manager->IsGraphNeedBuild(graph_node)) { - if (graph_node->GetBuildFlag()) { - ReturnError(graph_manager, args.callback, PARAM_INVALID, - "The graph " + std::to_string(graph_node->GetGraphId()) + - " need to re-build, you should remove it" - " from GE first, then AddGraph again and rebuild it."); + + ret = CheckIncreBuildAndPreRun(graph_manager, args, graph_node, ge_root_model); + if (ret != SUCCESS) { + graph_node->SetRunFlag(false); + if (!ge::Analyzer::GetInstance()->IsEnableNetAnalyzeDebug()) { + ReturnError(graph_manager, args.callback, ret, "CheckIncreBuildAndPreRun Failed, thread exit.."); graph_node->Unlock(); return; + } else { + ReturnError(graph_manager, graph_node, args.callback, ret, + "CheckIncreBuildAndPreRun Failed, keep geop continue!"); + graph_node->Unlock(); + continue; } - - // check need incre build. - GeModelPtr ge_model = nullptr; - if (graph_manager->IncreBuild(graph_node, ge_model) != SUCCESS) { - std::vector ge_inputs; - ConstructGeInput(args.input_tensor, ge_inputs); - ret = graph_manager->PreRun(graph_node, ge_inputs, ge_root_model, args.session_id); - // release rts generate context - RtContextUtil::GetInstance().DestroyRtContexts(args.session_id, graph_node->GetGraphId()); - if (ret != SUCCESS) { - graph_node->SetRunFlag(false); - if (!ge::Analyzer::GetInstance()->IsEnableNetAnalyzeDebug()) { - ReturnError(graph_manager, args.callback, ret, "PreRun Failed, thread exit.."); - graph_node->Unlock(); - return; - } else { - ReturnError(graph_manager, graph_node, args.callback, ret, "PreRun Failed, keep geop continue!"); - graph_node->Unlock(); - continue; - } - } - } - graph_node->SetBuildFlag(true); - graph_manager->var_acc_ctrl_.SetGraphBuildEnd(graph_node->GetGraphId()); - } else { - ge_root_model = graph_node->GetGeRootModel(); } - graph_manager->run_args_q_.Push(RunArgs( { graph_node, args.graph_id, args.session_id, args.error_context, args.input_tensor, ge_root_model, GetThreadLocalContext(), args.callback })); - GELOGI("Loop end."); + GELOGI("[PreRunThread] Loop end."); } } @@ -3039,7 +3160,7 @@ Status GraphManager::ParseInputsDims(const std::vector &input_t } void GraphManager::RunThread(GraphManager *graph_manager) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelExecute, ErrorMessage::kModelExecute); + ErrorManager::GetInstance().SetStage(error_message::kModelExecute, error_message::kModelExecute); if (prctl(PR_SET_NAME, ("GE_Run")) != 0) { GELOGW("Set thread name failed."); } @@ -3051,16 +3172,13 @@ void GraphManager::RunThread(GraphManager *graph_manager) { continue; } - GELOGI("A new loop start."); + GELOGI("[RunThread] A new loop start, graph_id:%u.", args.graph_id); ErrorManager::GetInstance().SetErrorContext(args.error_context); GetContext().SetSessionId(args.session_id); GetThreadLocalContext() = args.context; graph_manager->UpdateLocalOmgContext(args.graph_id); - if (args.graph_node->graph_run_async_listener_ != nullptr) { - args.graph_node->graph_run_async_listener_->SetCallback(args.callback); - } Status ret; // parse inputs.dims to vector> dynamic_dims ret = graph_manager->ParseInputsDims(args.input_tensor); @@ -3070,8 +3188,10 @@ void GraphManager::RunThread(GraphManager *graph_manager) { return; } + args.graph_node->UpdateLoadFlag(); if (!args.graph_node->GetLoadFlag()) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelLoad, ErrorMessage::kModelLoad); + ErrorManager::GetInstance().SetStage(error_message::kModelLoad, 
error_message::kModelLoad); + args.ge_root_model->SetTrainFlag(graph_manager->GetTrainFlag()); ret = graph_manager->LoadGraphAsync(args.ge_root_model, args.graph_node); if (ret != SUCCESS || args.ge_root_model == nullptr) { StopQueue(graph_manager); @@ -3079,12 +3199,16 @@ void GraphManager::RunThread(GraphManager *graph_manager) { args.graph_node->Unlock(); return; } + // control the times of graph loading in multi-thread scenario + args.graph_node->DecreaseLoadCount(); + args.graph_node->IncreaseLoadRecord(); + args.graph_node->SetLoadFlag(true); GELOGI("LoadGraph[%u], model[%u] success and set LoadFlag to true.", args.graph_node->GetGraphId(), args.ge_root_model->GetModelId()); } - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelExecute, ErrorMessage::kModelExecute); + ErrorManager::GetInstance().SetStage(error_message::kModelExecute, error_message::kModelExecute); if (graph_manager->GetTrainFlag()) { ret = graph_manager->graph_executor_.SetGraphContext(graph_manager->GetGraphContext()); if (ret != SUCCESS) { @@ -3093,9 +3217,9 @@ void GraphManager::RunThread(GraphManager *graph_manager) { graph_manager->graph_executor_.SetTrainFlag(graph_manager->options_.train_graph_flag); } - args.graph_node->SetRunFlag(false); ret = graph_manager->graph_executor_.ExecuteGraphAsync(args.graph_id, args.graph_node->GetGeRootModel(), - args.input_tensor); + args.input_tensor, args.callback); + args.graph_node->SetRunFlag(false); if (ret != SUCCESS) { ReturnError(graph_manager, args.callback, ret, "ExecuteGraphAsync failed, thread exit."); args.graph_node->Unlock(); @@ -3382,7 +3506,7 @@ Status GraphManager::ConvertGraphToFile(ComputeGraphPtr &compute_graph, GraphPar Status GraphManager::Build(const GraphNodePtr &graph_node, ComputeGraphPtr &compute_graph, GeRootModelPtr &ge_root_model, uint64_t session_id) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); // build if (compute_graph != nullptr) { std::string graph_name = compute_graph->GetName(); @@ -3546,4 +3670,49 @@ void GraphManager::RemoveCompilerStages(GraphId graph_id) { std::lock_guard lock(member_mutex_); compiler_stages_.erase(graph_id); } + +void GraphManager::IncreaseGraphCount(GraphId graph_id) { + std::lock_guard lock(graph_count_mutex_); + auto it = graph_count_.find(graph_id); + if (it == graph_count_.end()) { + graph_count_.insert({graph_id, kInitGraphCount}); + GELOGD("After increaseGraphCount, graph count of id[%u] is %u.", graph_id, graph_count_[graph_id]); + } else { + ++graph_count_[graph_id]; + GELOGD("After increaseGraphCount, graph count of id[%u] is %u.", graph_id, graph_count_[graph_id]); + } +} + +void GraphManager::RemoveGraphCount(GraphId graph_id) { + std::lock_guard lock(graph_count_mutex_); + auto it = graph_count_.find(graph_id); + if (it == graph_count_.end()) { + GELOGW("Graph of id: %u has not been added, count cannot be decreased.", graph_id); + } else { + GELOGD("RemoveGraphCount success, graph count of id[%u] is %u.", graph_id, graph_count_[graph_id]); + graph_count_.erase(it); + } +} + +void GraphManager::DecreaseGraphCount(GraphId graph_id) { + std::lock_guard lock(graph_count_mutex_); + auto it = graph_count_.find(graph_id); + if (it == graph_count_.end()) { + GELOGW("Graph of id: %u has not been added, count cannot be decreased.", graph_id); + } else { + --it->second; + GELOGD("After DecreaseGraphCount, graph count of id[%u] is %u.", graph_id, graph_count_[graph_id]); 
+ } +} + +Status GraphManager::GetGraphCount(GraphId graph_id, uint32_t &count) { + std::lock_guard lock(graph_count_mutex_); + auto it = graph_count_.find(graph_id); + if (it == graph_count_.end()) { + GELOGW("Graph [id:%u] has not been added.", graph_id); + return FAILED; + } + count = it->second; + return SUCCESS; +} } // namespace ge diff --git a/ge/graph/manager/graph_manager.h b/ge/graph/manager/graph_manager.h index b63b138a..960c253c 100644 --- a/ge/graph/manager/graph_manager.h +++ b/ge/graph/manager/graph_manager.h @@ -184,6 +184,20 @@ class GraphManager { Status SaveCheckPointResult(const Graph &graph, const std::vector &outputs, map &var_results); + void RemoveGraphCount(GraphId graph_id); + + void IncreaseGraphCount(GraphId graph_id); + + void DecreaseGraphCount(GraphId graph_id); + + Status GetGraphCount(GraphId graph_id, uint32_t &count); + + void SetAddGraphCondition(GraphId graph_id, uint32_t cond); + + uint32_t GetAddGraphCondition(GraphId graph_id); + + void RemoveAddGraphCondition(GraphId graph_id); + private: struct CompilerStages { GraphPrepare preparer; @@ -196,7 +210,7 @@ class GraphManager { GraphId graph_id; std::vector input_tensor; uint64_t session_id; - struct ErrorMessage::Context error_context; + struct error_message::Context error_context; GEThreadLocalContext context; RunAsyncCallback callback; }; @@ -205,7 +219,7 @@ class GraphManager { GraphNodePtr graph_node; GraphId graph_id; uint64_t session_id; - struct ErrorMessage::Context error_context; + struct error_message::Context error_context; std::vector input_tensor; GeRootModelPtr ge_root_model; GEThreadLocalContext context; @@ -223,7 +237,7 @@ class GraphManager { const SubGraphInfoPtr &sub_graph_info_ptr, const std::string &root_graph_name, uint64_t session_id, - const struct ErrorMessage::Context &error_context, + const struct error_message::Context &error_context, const GEThreadLocalContext &ge_context); Status ParseInputsDims(const std::vector &input_tensor); void ParseInputsDimsForData(const std::vector &input_tensor); @@ -381,6 +395,26 @@ class GraphManager { CompilerStages &GetCompilerStages(GraphId graph_id); void RemoveCompilerStages(GraphId graph_id); + static Status CheckIncreBuildAndPreRun(GraphManager *graph_manager, const PreRunArgs &args, GraphNodePtr &graph_node, + GeRootModelPtr &ge_root_model); + + void ReleaseMemory(const GeModelPtr &ge_model, GraphNodePtr &graph_node, const std::vector &model_ids, + uint32_t graph_id, uint64_t session_id); + + Status CheckRepeatAdd(uint32_t graph_id, bool &is_added); + + Status NotifyWaittingGraph(uint32_t graph_id); + + Status CreateGraphNode(uint32_t graph_id, const Graph &graph, const std::map &options); + + Status SetStagesOptions(uint32_t graph_id, const GraphManagerOptions &options); + + Status UnloadModel(GeRootModelPtr ge_root_model, uint32_t graph_id); + + void SetSessionGraphId(ComputeGraphPtr compute_graph, uint32_t graph_id); + + static Status CheckGraphAdded(const GraphId &graph_id, const Graph &graph); + std::atomic_bool thread_run_flag_; BlockingQueue prerun_args_q_{}; BlockingQueue run_args_q_{}; @@ -416,6 +450,16 @@ class GraphManager { std::mutex member_mutex_; std::mutex unload_model_mutex_; + // avoid repeatively add same graph (owns same graph id) + std::mutex add_graph_mutex_; + std::mutex add_graph_cond_mutex_; + std::condition_variable add_graph_cv_; + + std::map graph_id_to_add_graph_cond_; + // use for multi-thread online-infer scenario + std::set to_be_deleted_graphs_; + std::map graph_count_; + std::mutex graph_count_mutex_; 
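The members above, together with IncreaseGraphCount/DecreaseGraphCount/RemoveGraphCount, appear intended to act as a per-graph-id reference count: every AddGraph of an already-known id bumps the count, RemoveGraph lowers or drops it, and real teardown is deferred until the last owner is gone (or parked in to_be_deleted_graphs_ while the graph is still running). A simplified standalone model of that reference-counting idea is sketched below; it is not the GraphManager code itself, which keeps the count in graph_count_ under graph_count_mutex_ as declared above.

#include <cstdint>
#include <map>
#include <mutex>

class GraphRefCount {
 public:
  // called for every AddGraph of the same graph id
  void Increase(uint32_t graph_id) {
    std::lock_guard<std::mutex> lk(mu_);
    ++counts_[graph_id];  // a first insertion value-initializes the count to 0, then bumps it to 1
  }

  // called on RemoveGraph; returns true once the last owner has released the graph
  bool DecreaseAndCheckRemovable(uint32_t graph_id) {
    std::lock_guard<std::mutex> lk(mu_);
    auto it = counts_.find(graph_id);
    if (it == counts_.end()) {
      return false;  // never added, nothing to release
    }
    if (--it->second == 0) {
      counts_.erase(it);
      return true;
    }
    return false;
  }

 private:
  std::map<uint32_t, uint32_t> counts_;
  std::mutex mu_;
};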
}; } // namespace ge diff --git a/ge/graph/manager/graph_manager_utils.cc b/ge/graph/manager/graph_manager_utils.cc index 3a8d577c..e9d72bd8 100644 --- a/ge/graph/manager/graph_manager_utils.cc +++ b/ge/graph/manager/graph_manager_utils.cc @@ -60,6 +60,15 @@ void GraphNode::Unlock() { sem_.Pop(unused); } +void GraphNode::IncreaseLoadCount() { + std::unique_lock lock(load_count_mu_); + if (load_record_ == kMaxLoadNum) { + GELOGW("Reach the maximum of load_count:%u", kMaxLoadNum); + return; + } + ++load_count_; +} + SubGraphInfo::SubGraphInfo() : subgraph_ptr_(nullptr), ge_model_ptr_(nullptr), malloc_flag_(false) {} SubGraphInfo::~SubGraphInfo() { diff --git a/ge/graph/manager/graph_manager_utils.h b/ge/graph/manager/graph_manager_utils.h index cfe6588f..bebba93e 100644 --- a/ge/graph/manager/graph_manager_utils.h +++ b/ge/graph/manager/graph_manager_utils.h @@ -55,6 +55,7 @@ using ConstGraphPtr = std::shared_ptr; using GraphPtr = std::shared_ptr; const uint64_t INVALID_SESSION_ID = 0xffffffffffffffffULL; +const uint32_t kMaxLoadNum = 8; struct ModelIdInfo { uint32_t model_id{INVALID_MODEL_ID}; @@ -162,6 +163,8 @@ class GraphNode { bool GetBuildFlag() const { return build_flag_; } void SetBuildFlag(bool buildFlag) { build_flag_ = buildFlag; } bool GetLoadFlag() const { return load_flag_; } + // allow a graph with the same graph id to be loaded repeatedly (a reload happens while load_count_ > 0 and load_record_ < kMaxLoadNum) + void UpdateLoadFlag() { load_flag_ = load_count_ == 0 || load_record_ >= kMaxLoadNum; } void SetLoadFlag(bool load_flag) { load_flag_ = load_flag; } void SetGeModel(const GeModelPtr &ge_model) { ge_model_ = ge_model; } GeModelPtr GetGeModel() const { return ge_model_; } @@ -172,6 +175,16 @@ class GraphNode { void Lock(); void Unlock(); + void SetSemSize(uint32_t size) { sem_.SetMaxSize(size); } + + uint32_t GetLoadCount() const { return load_count_; } + void SetLoadCount(uint32_t count) { load_count_ = count; } + uint32_t GetLoadRecord() const { return load_record_; } + void SetLoadRecord(uint32_t record) { load_record_ = record; } + void IncreaseLoadRecord() { ++load_record_; } + void IncreaseLoadCount(); + void DecreaseLoadCount() { --load_count_; } + // run graph asynchronous listener std::shared_ptr graph_run_async_listener_; @@ -184,11 +197,17 @@ class GraphNode { GraphPtr graph_; ComputeGraphPtr compute_graph_; bool build_flag_; + // load_flag_ is true if more than one model has been loaded bool load_flag_; bool async_; GeModelPtr ge_model_; GeRootModelPtr ge_root_model_; BlockingQueue sem_; + // consistent with the graph_count of the same graph_id in graph_manager + uint32_t load_count_ = 0; + // total number of times a graph with the same graph_id has been loaded.
+ uint32_t load_record_ = 0; + std::mutex load_count_mu_; }; using GraphNodePtr = std::shared_ptr; diff --git a/ge/graph/manager/graph_mem_allocator.h b/ge/graph/manager/graph_mem_allocator.h index d5e8cf8d..9f8b86b2 100644 --- a/ge/graph/manager/graph_mem_allocator.h +++ b/ge/graph/manager/graph_mem_allocator.h @@ -226,7 +226,7 @@ class MemManager { // Usually impossible if (allocator == nullptr) { - GELOGE(ge::INTERNAL_ERROR, "Get allocator failed, memory type is %u.", memory_type); + GELOGW("Get allocator failed, memory type is %u.", memory_type); static T default_allocator(RT_MEMORY_RESERVED); return default_allocator; } diff --git a/ge/graph/manager/host_mem_manager.cc b/ge/graph/manager/host_mem_manager.cc index 40a0d1b9..2908df39 100644 --- a/ge/graph/manager/host_mem_manager.cc +++ b/ge/graph/manager/host_mem_manager.cc @@ -45,12 +45,11 @@ Status SharedMemAllocator::Allocate(SharedMemInfo &mem_info) { return GE_GRAPH_MEMORY_ALLOC_FAILED; } mem_info.fd = output_para.fd; - mem_info.host_aligned_ptr = AlignedPtr::BuildFromAllocFunc([&output_para](std::unique_ptr &ptr) { - ptr.reset(reinterpret_cast(output_para.ptr)); - }, - [](uint8_t *ptr) { - ptr = nullptr; - }); + mem_info.host_aligned_ptr = AlignedPtr::BuildFromAllocFunc( + [&output_para](std::unique_ptr &ptr) { + ptr.reset(reinterpret_cast(output_para.ptr)); + }, + [](uint8_t *ptr) { ptr = nullptr; }); mem_info.device_address = reinterpret_cast(output_para.devPtr); return SUCCESS; } diff --git a/ge/graph/manager/trans_var_data_utils.cc b/ge/graph/manager/trans_var_data_utils.cc index 7c96eb95..9c1290fa 100644 --- a/ge/graph/manager/trans_var_data_utils.cc +++ b/ge/graph/manager/trans_var_data_utils.cc @@ -487,7 +487,7 @@ Status TransVarDataUtils::TransAllVarData(const vector &variable_nodes, std::future f = executor.commit( [](const ge::NodePtr &node, uint64_t session_id, rtContext_t ctx, uint32_t graph_id, - const struct ErrorMessage::Context &error_context) -> Status { + const struct error_message::Context &error_context) -> Status { ErrorManager::GetInstance().SetErrorContext(error_context); rtError_t rt_ret = rtCtxSetCurrent(ctx); if (rt_ret != RT_ERROR_NONE) { @@ -525,7 +525,7 @@ Status TransVarDataUtils::TransAllVarData(const vector &variable_nodes, } return SUCCESS; }, - node, session_id, context, graph_id, ErrorManager::GetInstance().GetErrorContext()); + node, session_id, context, graph_id, ErrorManager::GetInstance().GetErrorManagerContext()); if (!f.valid()) { GELOGE(FAILED, "Future is invalid"); return FAILED; diff --git a/ge/graph/manager/util/debug.cc b/ge/graph/manager/util/debug.cc index 65aa3192..1dd97bc1 100644 --- a/ge/graph/manager/util/debug.cc +++ b/ge/graph/manager/util/debug.cc @@ -35,14 +35,14 @@ void Debug::DumpProto(const Message &proto, const char *file) { int fd = mmOpen2(file_path.c_str(), M_WRONLY | M_CREAT | O_TRUNC, M_IRUSR | M_IWUSR | M_UMASK_GRPREAD | M_UMASK_OTHREAD); if (fd == -1) { - GELOGW("Write %s failed", file_path.c_str()); + GELOGW("Write %s failed. errmsg:%s", file_path.c_str(), strerror(errno)); return; } auto output = ge::MakeShared(fd); if (output == nullptr) { GELOGW("create output failed."); if (mmClose(fd) != 0) { - GELOGW("close fd failed."); + GELOGW("close fd failed. errmsg:%s", strerror(errno)); } return; } @@ -51,7 +51,7 @@ void Debug::DumpProto(const Message &proto, const char *file) { GELOGW("dump proto failed."); } if (mmClose(fd) != 0) { - GELOGW("close fd failed."); + GELOGW("close fd failed. 
errmsg:%s", strerror(errno)); } } diff --git a/ge/graph/passes/attach_stream_label_pass.cc b/ge/graph/passes/attach_stream_label_pass.cc index 75599c45..d8c81e92 100644 --- a/ge/graph/passes/attach_stream_label_pass.cc +++ b/ge/graph/passes/attach_stream_label_pass.cc @@ -24,34 +24,31 @@ namespace ge { Status AttachStreamLabelPass::Run(ComputeGraphPtr graph) { GELOGD("AttachStreamLabelPass Enter."); - FindNodes(graph); - for (const auto &node : need_label_nodes_) { - GE_CHK_STATUS_RET(UpdateCondBranch(node), "Update cond branch failed, start node:%s.", node->GetName().c_str()); + std::vector need_label_nodes; + std::vector enter_nodes; + std::map branch_head_nodes; + FindNodes(graph, need_label_nodes, enter_nodes, branch_head_nodes); + for (const auto &node : need_label_nodes) { + GE_CHK_STATUS_RET(UpdateCondBranch(node, branch_head_nodes), "Update cond branch failed, start node:%s.", node->GetName().c_str()); } - GE_CHK_STATUS_RET(UpdateEnterNode(), "UpdateEnterNode failed."); + GE_CHK_STATUS_RET(UpdateEnterNode(enter_nodes), "UpdateEnterNode failed."); GELOGD("AttachStreamLabelPass Leave."); return SUCCESS; } /// -/// @brief Clear Status, used for subgraph pass -/// @return -/// -Status AttachStreamLabelPass::ClearStatus() { - stream_switch_nodes_.clear(); - need_label_nodes_.clear(); - enter_nodes_.clear(); - branch_head_nodes_.clear(); - return SUCCESS; -} - -/// /// @brief Find StreamSwitch / StreamMerge / Enter node /// @param [in] graph +/// @param [out] need_label_nodes +/// @param [out] enter_nodes +/// @param [out] branch_head_nodes /// @return void /// -void AttachStreamLabelPass::FindNodes(const ComputeGraphPtr &graph) { +void AttachStreamLabelPass::FindNodes(const ComputeGraphPtr &graph, std::vector &need_label_nodes, + std::vector &enter_nodes, + std::map &branch_head_nodes) { + std::vector stream_switch_nodes; for (const NodePtr &node : graph->GetDirectNode()) { const auto &op_desc = node->GetOpDesc(); if (op_desc == nullptr) { @@ -59,29 +56,31 @@ void AttachStreamLabelPass::FindNodes(const ComputeGraphPtr &graph) { } const std::string &type = op_desc->GetType(); if ((type == STREAMSWITCH) && op_desc->HasAttr(ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG)) { - stream_switch_nodes_.emplace_back(node); + stream_switch_nodes.emplace_back(node); } else if ((type == STREAMMERGE) && !op_desc->HasAttr(ATTR_NAME_NEXT_ITERATION)) { - need_label_nodes_.emplace_back(node); + need_label_nodes.emplace_back(node); } else if ((type == ENTER) || (type == REFENTER)) { - enter_nodes_.emplace_back(node); + enter_nodes.emplace_back(node); } } - for (const auto &node : stream_switch_nodes_) { + for (const auto &node : stream_switch_nodes) { for (const auto &out_ctrl_node : node->GetOutControlNodes()) { GELOGD("branch_head_node %s of stream_switch %s.", out_ctrl_node->GetName().c_str(), node->GetName().c_str()); - branch_head_nodes_[out_ctrl_node] = node; + branch_head_nodes[out_ctrl_node] = node; } - need_label_nodes_.emplace_back(node); + need_label_nodes.emplace_back(node); } } /// /// @brief update cond branch /// @param [in] node +/// @param [in] branch_head_nodes /// @return Status /// -Status AttachStreamLabelPass::UpdateCondBranch(const NodePtr &node) { +Status AttachStreamLabelPass::UpdateCondBranch(const NodePtr &node, + const std::map &branch_head_nodes) { std::string stream_label; if (AttachFlag(node, stream_label) != SUCCESS) { GELOGE(FAILED, "Attach flag for node %s failed.", node->GetName().c_str()); @@ -103,8 +102,9 @@ Status AttachStreamLabelPass::UpdateCondBranch(const NodePtr &node) { 
const std::string &type = cur_node->GetType(); for (const auto &out_node : cur_node->GetOutAllNodes()) { const std::string &out_type = out_node->GetType(); + const auto &iter = branch_head_nodes.find(node); bool stop_flag = (end_type_set.count(out_type) > 0) || - ((branch_head_nodes_.count(out_node) > 0) && (branch_head_nodes_[out_node] != node)) || + ((iter != branch_head_nodes.end()) && (iter->second != node)) || (((type == ENTER) || (type == REFENTER)) && (out_type != STREAMACTIVE)); if (!stop_flag) { nodes.push(out_node); @@ -178,11 +178,12 @@ Status AttachStreamLabelPass::AttachFlag(const NodePtr &node, std::string &strea /// /// @brief Update stream_label start with enter nodes +/// @param [in] enter_nodes /// @return Status /// -Status AttachStreamLabelPass::UpdateEnterNode() { +Status AttachStreamLabelPass::UpdateEnterNode(const std::vector &enter_nodes) { std::unordered_map> enter_active_map; - for (const auto &enter_node : enter_nodes_) { + for (const auto &enter_node : enter_nodes) { for (const auto &out_ctrl_node : enter_node->GetOutControlNodes()) { if (out_ctrl_node->GetType() != STREAMACTIVE) { continue; @@ -214,11 +215,11 @@ Status AttachStreamLabelPass::UpdateEnterNode() { return INTERNAL_ERROR; } - std::stack enter_nodes; + std::stack nodes; for (const auto &enter_node : pair.second) { - enter_nodes.emplace(enter_node); + nodes.emplace(enter_node); } - if (UpdateLoopBranch(enter_nodes, active_label_list[0]) != SUCCESS) { + if (UpdateLoopBranch(nodes, active_label_list[0]) != SUCCESS) { GELOGE(FAILED, "Update stream_label for loop_branch failed."); return FAILED; } diff --git a/ge/graph/passes/attach_stream_label_pass.h b/ge/graph/passes/attach_stream_label_pass.h index ad71d58f..a1600a58 100755 --- a/ge/graph/passes/attach_stream_label_pass.h +++ b/ge/graph/passes/attach_stream_label_pass.h @@ -25,26 +25,25 @@ class AttachStreamLabelPass : public GraphPass { public: Status Run(ComputeGraphPtr graph); - /// - /// @brief Clear Status, used for subgraph pass - /// @return - /// - Status ClearStatus() override; - private: /// /// @brief Find StreamSwitch / StreamMerge / Enter node /// @param [in] graph + /// @param [out] need_label_nodes + /// @param [out] enter_nodes + /// @param [out] branch_head_nodes /// @return void /// - void FindNodes(const ComputeGraphPtr &graph); + void FindNodes(const ComputeGraphPtr &graph, std::vector &need_label_nodes, + std::vector &enter_nodes, std::map &branch_head_nodes); /// /// @brief update cond branch /// @param [in] node + /// @param [in] branch_head_nodes /// @return Status /// - Status UpdateCondBranch(const NodePtr &node); + Status UpdateCondBranch(const NodePtr &node, const std::map &branch_head_nodes); /// /// @brief attach flag @@ -64,9 +63,10 @@ class AttachStreamLabelPass : public GraphPass { /// /// @brief Update stream_label start with enter nodes + /// @param [in] enter_nodes /// @return Status /// - Status UpdateEnterNode(); + Status UpdateEnterNode(const std::vector &enter_nodes); /// /// @brief Set stream_label for enter_nodes @@ -75,11 +75,6 @@ class AttachStreamLabelPass : public GraphPass { /// @return Status /// static Status SetEnterLabel(const std::vector &enter_nodes, const NodePtr &active_node); - - std::vector stream_switch_nodes_; - std::vector need_label_nodes_; - std::vector enter_nodes_; - std::unordered_map branch_head_nodes_; }; } // namespace ge #endif // GE_GRAPH_PASSES_ATTACH_STREAM_LABEL_PASS_H_ diff --git a/ge/graph/passes/dimension_adjust_pass.cc b/ge/graph/passes/dimension_adjust_pass.cc index 
61480f17..dbea8dc9 100755 --- a/ge/graph/passes/dimension_adjust_pass.cc +++ b/ge/graph/passes/dimension_adjust_pass.cc @@ -78,7 +78,12 @@ Status DimensionAdjustPass::Run(ge::NodePtr &node) { GELOGE(ret, "DimensionAdjustPass compute failed"); return ret; } + // Need to handle axis_input of node like ExpandDims if (node->GetAllInDataAnchors().size() > static_cast(kRemoveInputIndex)) { + auto axis_node_out_anchor = node->GetInDataAnchor(kRemoveInputIndex)->GetPeerOutAnchor(); + GE_CHECK_NOTNULL(axis_node_out_anchor); + auto axis_node = axis_node_out_anchor->GetOwnerNode(); + // 1.Copy control dependency of axis node ret = PassUtils::UnlinkNodeWithControlCopy(node, kRemoveInputIndex); if (ret != SUCCESS) { REPORT_CALL_ERROR("E19999", "Unlink op:%s(%s) data input:%u with control edge copy failed", @@ -86,6 +91,13 @@ Status DimensionAdjustPass::Run(ge::NodePtr &node) { GELOGE(ret, "DimensionAdjustPass unlink node with control copy fail."); return ret; } + // 2.Remove const axis node without any output + if ((axis_node->GetType() == CONSTANT || axis_node->GetType() == CONSTANTOP) && + axis_node->GetOutDataNodesSize() == 0) { + ret = IsolateAndDeleteNode(axis_node, {}); + GE_CHK_GRAPH_STATUS_RET(ret, "Fail to remove node %s.", axis_node->GetName().c_str()); + GELOGI("Remove useless axis input const %s", axis_node->GetName().c_str()); + } } ret = DealWithInNodes(node); diff --git a/ge/graph/passes/link_gen_mask_nodes_pass.cc b/ge/graph/passes/link_gen_mask_nodes_pass.cc index 14f5dfc3..e00ede45 100755 --- a/ge/graph/passes/link_gen_mask_nodes_pass.cc +++ b/ge/graph/passes/link_gen_mask_nodes_pass.cc @@ -107,6 +107,16 @@ void LinkGenMaskNodesPass::GetAllGenMaskNodes(ComputeGraphPtr graph, vectorGetInDataNodes(); if (in_data_nodes.size() > kGenMaskInputIndex) { NodePtr &gen_mask = in_data_nodes.at(kGenMaskInputIndex); + for (auto &in_data_node : in_data_nodes) { + // node gen_mask is located at different place in the fused node + if (in_data_node->GetName().find(DROPOUTGENMASK) != in_data_node->GetName().npos) { + gen_mask = in_data_node; + GELOGD("The fused node type [%s], paired with the input node name [%s].", + node->GetType().c_str(), gen_mask->GetName().c_str()); + break; + } + } + if ((gen_mask->GetOpDesc() == nullptr) || (gen_mask->GetOpDesc()->HasAttr(ATTR_NAME_STREAM_LABEL))) { continue; } diff --git a/ge/graph/passes/net_output_pass.cc b/ge/graph/passes/net_output_pass.cc index 3ac1100d..aca7058d 100644 --- a/ge/graph/passes/net_output_pass.cc +++ b/ge/graph/passes/net_output_pass.cc @@ -514,7 +514,7 @@ Status NetOutputPass::Run(ge::ComputeGraphPtr graph) { GELOGE(GE_GRAPH_PARAM_NULLPTR, "Compute graph is null."); return GE_GRAPH_PARAM_NULLPTR; } - GELOGI("NetOutputPass Run.graph is [%s]", graph->GetName().c_str()); + GELOGI("[NETOUTPUT PASS] Run.graph is [%s]", graph->GetName().c_str()); NodePtr output_node = graph->FindFirstNodeMatchType(NETOUTPUT); // save user targets node SaveAndRemoveTargets(graph); @@ -552,10 +552,17 @@ Status NetOutputPass::AddNetOutputNodeToGraph(const ge::ComputeGraphPtr &graph, // If user does not set out nodes and targets and no retval node, also add netoutput node if ((graph->GetGraphOutNodesInfo().empty()) && (graph->GetGraphTargetNodesInfo().empty()) && !is_include_special_node_) { - GELOGI("[NETOUTPUT PASS] output_nodes and target_nodes and special nodes is empty!Add netoutput!"); + GELOGI("[NETOUTPUT PASS] Both output, target and special nodes are empty! 
add net output node"); output_node = graph->AddNode(net_output_desc); GE_CHK_STATUS_RET(AddCtrlEdgesBetweenLeafAndNetOutput(graph, output_node), "add ctrl edge between leaf and netoutput failed"); + if (!ge::AttrUtils::SetInt(output_node->GetOpDesc(), ATTR_NAME_TRUE_BRANCH_STREAM, 0)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_TRUE_BRANCH_STREAM.c_str(), + output_node->GetName().c_str(), output_node->GetType().c_str()); + GELOGE(INTERNAL_ERROR, "set ATTR_NAME_TRUE_BRANCH_STREAM failed"); + return INTERNAL_ERROR; + } + GELOGI("[NETOUTPUT PASS] Add net output node succeed"); return SUCCESS; } GELOGI("[NETOUTPUT PASS] Output node size:%lu.", output_nodes_info.size()); diff --git a/ge/graph/passes/pass_utils.cc b/ge/graph/passes/pass_utils.cc index 69fe479e..db379433 100644 --- a/ge/graph/passes/pass_utils.cc +++ b/ge/graph/passes/pass_utils.cc @@ -334,6 +334,9 @@ Status PassUtils::UnlinkNodeWithControlCopy(NodePtr &node, int index) { auto father_node = out_data_anchor->GetOwnerNode(); // link father_node's in control nodes to node if (GraphUtils::CopyInCtrlEdges(father_node, node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Copy in control edge from node:%s(%s) to node:%s(%s) failed", + father_node->GetName().c_str(), father_node->GetType().c_str(), + node->GetName().c_str(), node->GetType().c_str()); return FAILED; } return SUCCESS; diff --git a/ge/graph/passes/same_transdata_breadth_fusion_pass.cc b/ge/graph/passes/same_transdata_breadth_fusion_pass.cc index 44778dd3..c0a3328e 100644 --- a/ge/graph/passes/same_transdata_breadth_fusion_pass.cc +++ b/ge/graph/passes/same_transdata_breadth_fusion_pass.cc @@ -71,6 +71,7 @@ OpDescPtr SameTransdataBreadthFusionPass::GetCastOp(const GeTensorDesc &in_desc, auto cast_op = ge::OpDescUtils::GetOpDescFromOperator(node_op); node_op.BreakConnect(); if (cast_op == nullptr) { + REPORT_INNER_ERROR("E19999", "Create Operator:%s(%s) failed", cast_op_name.str().c_str(), CAST); GELOGE(INTERNAL_ERROR, "new fusion cast op failed!"); return nullptr; } @@ -96,6 +97,8 @@ OpDescPtr SameTransdataBreadthFusionPass::GetCastOp(const GeTensorDesc &in_desc, } } if (!AttrUtils::SetInt(cast_op, CAST_ATTR_DST_TYPE, static_cast(out_desc.GetDataType()))) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", CAST_ATTR_DST_TYPE.c_str(), + cast_op->GetName().c_str(), cast_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set dst_type attr failed"); return nullptr; } @@ -204,6 +207,12 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkDataOutput2PreNode(const NodeP GELOGI("remove edge.src:%s, dst:%s", out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::RemoveEdge(out_anchor, transdata_peer_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + out_anchor->GetOwnerNode()->GetName().c_str(), + out_anchor->GetOwnerNode()->GetType().c_str(), out_anchor->GetIdx(), + transdata_peer_in_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_anchor->GetIdx()); GELOGE(GRAPH_FAILED, "remove edge failed!src node:%s, dst node:%s", transdata_node->GetName().c_str(), transdata_peer_in_anchor->GetOwnerNode()->GetName().c_str()); return GRAPH_FAILED; @@ -211,6 +220,12 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkDataOutput2PreNode(const NodeP GELOGI("add edge.src:%s, dst:%s", 
pre_out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(pre_out_anchor, transdata_peer_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + pre_out_anchor->GetOwnerNode()->GetName().c_str(), + pre_out_anchor->GetOwnerNode()->GetType().c_str(), pre_out_anchor->GetIdx(), + transdata_peer_in_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_anchor->GetIdx()); GELOGE(GRAPH_FAILED, "add edge failed!src node:%s, dst node:%s", pre_out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_anchor->GetOwnerNode()->GetName().c_str()); @@ -231,6 +246,11 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkOutDataPeerInControlNodes2PreN GELOGD("remove edge.src:%s, dst:%s", out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::RemoveEdge(out_anchor, transdata_peer_in_control_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove control edge between op:%s(%s) and op:%s(%s) failed", + out_anchor->GetOwnerNode()->GetName().c_str(), + out_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(GRAPH_FAILED, "remove edge failed!src node:%s, dst node:%s", transdata_node->GetName().c_str(), transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str()); return GRAPH_FAILED; @@ -240,6 +260,11 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkOutDataPeerInControlNodes2PreN GELOGD("add edge.src:%s, dst:%s", pre_out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(pre_out_anchor, transdata_peer_in_control_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + pre_out_anchor->GetOwnerNode()->GetName().c_str(), + pre_out_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(GRAPH_FAILED, "add edge failed!src node:%s, dst node:%s", pre_out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str()); @@ -249,6 +274,11 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkOutDataPeerInControlNodes2PreN GELOGD("add edge.src node:%s, dst node:%s", pre_out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(transdata_peer_out_control_anchor, transdata_peer_in_control_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + transdata_peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_out_control_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(GRAPH_FAILED, "add edge failed!src node:%s, dst node:%s", pre_out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str()); @@ -290,6 +320,11 @@ graphStatus 
SameTransdataBreadthFusionPass::ReLinkOutControlPeerInControlAnchors GELOGD("remove edge.src:%s, dst:%s", transdata_node_keep->GetName().c_str(), transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::RemoveEdge(out_control_anchor, transdata_peer_in_control_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove control edge between op:%s(%s) and op:%s(%s) failed", + out_control_anchor->GetOwnerNode()->GetName().c_str(), + out_control_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(GRAPH_FAILED, "remove transdata control edge failed!"); return GRAPH_FAILED; } @@ -298,6 +333,11 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkOutControlPeerInControlAnchors GELOGD("add edge.src:%s, dst:%s", pre_out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(pre_out_anchor, transdata_peer_in_control_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + pre_out_anchor->GetOwnerNode()->GetName().c_str(), + pre_out_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(GRAPH_FAILED, "add control edge failed!"); return GRAPH_FAILED; } @@ -305,6 +345,11 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkOutControlPeerInControlAnchors GELOGD("add edge.src:%s, dst:%s", transdata_peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(transdata_peer_out_control_anchor, transdata_peer_in_control_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + transdata_peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_out_control_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_control_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(GRAPH_FAILED, "add control edge failed!"); return GRAPH_FAILED; } @@ -329,6 +374,11 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkOutControlPeerInDataAnchors( GELOGD("remove edge.src:%s, dst:%s", transdata_node_keep->GetName().c_str(), transdata_peer_in_data_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::RemoveEdge(out_control_anchor, transdata_peer_in_data_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove control edge between op:%s(%s) and op:%s(%s) failed", + out_control_anchor->GetOwnerNode()->GetName().c_str(), + out_control_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_data_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_data_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(GRAPH_FAILED, "remove transdata control edge failed!"); return GRAPH_FAILED; } @@ -337,6 +387,12 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkOutControlPeerInDataAnchors( GELOGD("add edge.src:%s, dst:%s", pre_out_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_data_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(pre_out_anchor, transdata_peer_in_data_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and 
op:%s(%s)(index:%d) failed", + pre_out_anchor->GetOwnerNode()->GetName().c_str(), + pre_out_anchor->GetOwnerNode()->GetType().c_str(), pre_out_anchor->GetIdx(), + transdata_peer_in_data_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_data_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_data_anchor->GetIdx()); GELOGE(GRAPH_FAILED, "add control edge failed!"); return GRAPH_FAILED; } @@ -344,6 +400,11 @@ graphStatus SameTransdataBreadthFusionPass::ReLinkOutControlPeerInDataAnchors( GELOGD("add edge.src:%s, dst:%s", transdata_peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), transdata_peer_in_data_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(transdata_peer_out_control_anchor, transdata_peer_in_data_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + transdata_peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_out_control_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_in_data_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_in_data_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(GRAPH_FAILED, "add control edge failed!"); return GRAPH_FAILED; } @@ -460,6 +521,12 @@ graphStatus SameTransdataBreadthFusionPass::RelinkRemainTransdata(const ComputeG GELOGI("add edge.out node %s, in node %s", head_node->GetName().c_str(), transdata_node_keep->GetName().c_str()); if (GraphUtils::AddEdge(head_node_anchor, transdata_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + head_node_anchor->GetOwnerNode()->GetName().c_str(), + head_node_anchor->GetOwnerNode()->GetType().c_str(), head_node_anchor->GetIdx(), + transdata_in_anchor->GetOwnerNode()->GetName().c_str(), + transdata_in_anchor->GetOwnerNode()->GetType().c_str(), + transdata_in_anchor->GetIdx()); GELOGE(GRAPH_FAILED, "add edge failed!out node %s, in node %s", head_node->GetName().c_str(), transdata_node_keep->GetName().c_str()); return GRAPH_FAILED; @@ -545,6 +612,12 @@ graphStatus SameTransdataBreadthFusionPass::ReuseNodesBeforeTransdata(int anchor GELOGI("add edge.src:%s, dst:%s", transdata_node_keep->GetName().c_str(), head_node_peer_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(transdata_out_anchor, head_node_peer_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + transdata_out_anchor->GetOwnerNode()->GetName().c_str(), + transdata_out_anchor->GetOwnerNode()->GetType().c_str(), transdata_out_anchor->GetIdx(), + head_node_peer_anchor->GetOwnerNode()->GetName().c_str(), + head_node_peer_anchor->GetOwnerNode()->GetType().c_str(), + head_node_peer_anchor->GetIdx()); GELOGE(GRAPH_FAILED, "add edge.src:%s, dst:%s", transdata_node_keep->GetName().c_str(), head_node_peer_anchor->GetOwnerNode()->GetName().c_str()); return GRAPH_FAILED; @@ -562,6 +635,8 @@ graphStatus SameTransdataBreadthFusionPass::ReuseNodesBeforeTransdata(int anchor auto input_desc = in_op_desc->GetInputDesc(in_data_anchor->GetIdx()); CopyTensorDesc(transdata_output_desc, input_desc); if (in_op_desc->UpdateInputDesc(in_data_anchor->GetIdx(), input_desc) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update input:%d desc in op:%s(%s) failed", in_data_anchor->GetIdx(), + in_op_desc->GetName().c_str(), in_op_desc->GetType().c_str()); GELOGE(FAILED, "UpdateInputDesc fail."); return FAILED; } @@ -569,6 +644,8 @@ graphStatus 
SameTransdataBreadthFusionPass::ReuseNodesBeforeTransdata(int anchor auto output_desc = in_op_desc->GetOutputDesc(output_idx); CopyTensorDesc(transdata_output_desc, output_desc); GE_IF_BOOL_EXEC(in_op_desc->UpdateOutputDesc(output_idx, output_desc) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Update output:%d desc in op:%s(%s) failed", output_idx, + in_op_desc->GetName().c_str(), in_op_desc->GetType().c_str()); GELOGE(GRAPH_FAILED, "update input desc failed"); return GRAPH_FAILED); // relink control edge @@ -610,6 +687,13 @@ graphStatus SameTransdataBreadthFusionPass::LinkNewCastNode2RemainTransdata( GELOGI("remove edge.src:%s, dst:%s", transdata_peer_out_anchor->GetOwnerNode()->GetName().c_str(), transdata_remove_in_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::RemoveEdge(transdata_peer_out_anchor, transdata_remove_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + transdata_peer_out_anchor->GetOwnerNode()->GetName().c_str(), + transdata_peer_out_anchor->GetOwnerNode()->GetType().c_str(), + transdata_peer_out_anchor->GetIdx(), + transdata_remove_in_anchor->GetOwnerNode()->GetName().c_str(), + transdata_remove_in_anchor->GetOwnerNode()->GetType().c_str(), + transdata_remove_in_anchor->GetIdx()); return GRAPH_FAILED; } @@ -642,6 +726,9 @@ graphStatus SameTransdataBreadthFusionPass::LinkNewCastNode2RemainTransdata( } if (graph->RemoveNode(transdata_node_remove) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) from graph:%s failed", + transdata_node_remove->GetName().c_str(), transdata_node_remove->GetType().c_str(), + graph->GetName().c_str()); GELOGE(GRAPH_FAILED, "remove node %s failed!", transdata_node_remove->GetName().c_str()); return GRAPH_FAILED; } @@ -660,6 +747,10 @@ graphStatus SameTransdataBreadthFusionPass::RelinkInControlEdge(const NodePtr &n GELOGD("remove edge.src:%s, dst:%s", peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), node_src->GetName().c_str()); if (GraphUtils::RemoveEdge(peer_out_control_anchor, node_src->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove control edge between op:%s(%s) and op:%s(%s) failed", + peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), + peer_out_control_anchor->GetOwnerNode()->GetType().c_str(), + node_src->GetName().c_str(), node_src->GetType().c_str()); GELOGE(GRAPH_FAILED, "remove edge faliled!src:%s, dst:%s", peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), node_src->GetName().c_str()); return GRAPH_FAILED; @@ -667,6 +758,10 @@ graphStatus SameTransdataBreadthFusionPass::RelinkInControlEdge(const NodePtr &n GELOGD("add edge.src:%s, dst:%s", peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), node_dst->GetName().c_str()); if (GraphUtils::AddEdge(peer_out_control_anchor, node_dst->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), + peer_out_control_anchor->GetOwnerNode()->GetType().c_str(), + node_dst->GetName().c_str(), node_dst->GetType().c_str()); GELOGE(GRAPH_FAILED, "add edge failed!src:%s, dst:%s", peer_out_control_anchor->GetOwnerNode()->GetName().c_str(), node_dst->GetName().c_str()); return GRAPH_FAILED; @@ -713,10 +808,16 @@ graphStatus SameTransdataBreadthFusionPass::AddCastNode(const ComputeGraphPtr &g auto cast_node = graph->AddNode(cast_op_desc); if (cast_node == nullptr) { + 
REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + cast_op_desc->GetName().c_str(), cast_op_desc->GetType().c_str(), graph->GetName().c_str()); return GRAPH_FAILED; } GELOGD("add edge.src:%s, dst:%s", pre_out_anchor->GetOwnerNode()->GetName().c_str(), cast_node->GetName().c_str()); if (GraphUtils::AddEdge(pre_out_anchor, cast_node->GetInDataAnchor(0)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:0) failed", + pre_out_anchor->GetOwnerNode()->GetName().c_str(), + pre_out_anchor->GetOwnerNode()->GetType().c_str(), pre_out_anchor->GetIdx(), + cast_node->GetName().c_str(), cast_node->GetType().c_str()); return GRAPH_FAILED; } if (i == 0) { @@ -724,6 +825,8 @@ graphStatus SameTransdataBreadthFusionPass::AddCastNode(const ComputeGraphPtr &g } if (!AttrUtils::SetBool(cast_op_desc, ATTR_NEED_COMPILE, true)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NEED_COMPILE.c_str(), + cast_op_desc->GetName().c_str(), cast_op_desc->GetType().c_str()); GELOGE(FAILED, "SetExtAttr fail."); return FAILED; } @@ -738,6 +841,7 @@ graphStatus SameTransdataBreadthFusionPass::GetSubGraphsBetweenNormalAndTransdat std::vector> &nodes_list) { graphStatus ret = GRAPH_SUCCESS; if (out_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Param out_anchor is nullptr, check invalid"); GELOGE(GRAPH_FAILED, "out data anchor is null!This should not happen!"); return GRAPH_FAILED; } diff --git a/ge/graph/passes/save_pass.cc b/ge/graph/passes/save_pass.cc index a2e34b1d..b82a6420 100755 --- a/ge/graph/passes/save_pass.cc +++ b/ge/graph/passes/save_pass.cc @@ -47,7 +47,9 @@ Status SavePass::Run(ge::ComputeGraphPtr graph) { out_index.emplace_back(out_anchor->GetIdx()); ge::OpDescPtr op_desc = peer_node->GetOpDesc(); GE_IF_BOOL_EXEC(!ge::AttrUtils::SetStr(op_desc, kVarAttrVarIsSave, kVarIsSave), - GELOGE(INTERNAL_ERROR, "get kVarAttrVarIsSave failed"); return INTERNAL_ERROR); + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", kVarAttrVarIsSave, + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + GELOGE(INTERNAL_ERROR, "get kVarAttrVarIsSave failed"); return INTERNAL_ERROR); } } } @@ -65,6 +67,8 @@ Status SavePass::Run(ge::ComputeGraphPtr graph) { for (auto &node_ptr : del_nodes) { auto ret = graph->RemoveNode(node_ptr); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) from graph:%s failed", + node_ptr->GetName().c_str(), node_ptr->GetType().c_str(), graph->GetName().c_str()); GELOGE(ret, "GraphUtils::RemoveNodeWithoutRelink failed."); return ret; } diff --git a/ge/graph/passes/set_input_output_offset_pass.cc b/ge/graph/passes/set_input_output_offset_pass.cc index ec41d6be..d3c1e07d 100644 --- a/ge/graph/passes/set_input_output_offset_pass.cc +++ b/ge/graph/passes/set_input_output_offset_pass.cc @@ -54,6 +54,8 @@ Status SetInputOutputOffsetPass::SetInputOffsetForFusion(const std::vector input_offset_of_node; input_offset_of_node = op_desc->GetInputOffset(); if (input_offset_of_node.size() < i) { + REPORT_INNER_ERROR("E19999", "Input offsets size:%zu of node:%s(%s) < index:%zu, check invalid", + input_offset_of_node.size(), op_desc->GetName().c_str(), op_desc->GetType().c_str(), i); GELOGE(PARAM_INVALID, "not get input_offset of %zu", i); return PARAM_INVALID; } @@ -77,10 +79,15 @@ Status SetInputOutputOffsetPass::SetInputOffsetForFusion(const std::vectorGetName().c_str(), data_op_desc->GetType().c_str()); GELOGE(FAILED, "SetListInt of zero_copy_basic_offset failed."); return FAILED); 
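The additions across these passes follow one pattern: wherever an AttrUtils or graph-edit call used to fail with only a local GELOGE, a REPORT_CALL_ERROR/REPORT_INNER_ERROR is added so the failure is also recorded by the error manager. Unfolded from the GE_CHK_BOOL_EXEC macro form used above (shown for readability only, not as a literal macro expansion), each such check behaves roughly like:

if (!ge::AttrUtils::SetListInt(data_op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset)) {
  // report to the error manager in addition to the local error log
  REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_ZERO_COPY_BASIC_OFFSET.c_str(),
                    data_op_desc->GetName().c_str(), data_op_desc->GetType().c_str());
  GELOGE(FAILED, "SetListInt of zero_copy_basic_offset failed.");
  return FAILED;
}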
GE_CHK_BOOL_EXEC( ge::AttrUtils::SetListInt(data_op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_ZERO_COPY_RELATIVE_OFFSET.c_str(), + data_op_desc->GetName().c_str(), data_op_desc->GetType().c_str()); GELOGE(FAILED, "SetListInt of zero_copy_relative_offset failed."); return FAILED); } @@ -115,10 +122,15 @@ Status SetInputOutputOffsetPass::SetInputOffsetForHcom(const ge::NodePtr &node, zero_copy_basic_offset.emplace_back(output_offset); zero_copy_relative_offset.emplace_back(relative_offset); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(in_op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_ZERO_COPY_BASIC_OFFSET.c_str(), + in_op_desc->GetName().c_str(), in_op_desc->GetType().c_str()); GELOGE(FAILED, "SetListInt of zero_copy_basic_offset failed."); return FAILED); GE_CHK_BOOL_EXEC( ge::AttrUtils::SetListInt(in_op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_ZERO_COPY_RELATIVE_OFFSET.c_str(), + in_op_desc->GetName().c_str(), in_op_desc->GetType().c_str()); GELOGE(FAILED, "SetListInt of zero_copy_relative_offset failed."); return FAILED); } @@ -159,6 +171,9 @@ Status SetInputOutputOffsetPass::SetOutputOffsetForConcat(const NodePtr &node) { output_offset_of_concat = op_desc->GetOutputOffset(); // phony_concat has one output GE_IF_BOOL_EXEC(output_offset_of_concat.size() != 1, + REPORT_INNER_ERROR("E19999", "Output offsets size:%zu of node:%s(%s) not equal to 1, check invalid", + output_offset_of_concat.size(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(PARAM_INVALID, "%s should has one output.", node->GetName().c_str()); return PARAM_INVALID); NodePtr net_output = node->GetOutDataNodes().at(0); @@ -186,9 +201,14 @@ Status SetInputOutputOffsetPass::SetOutputOffsetForConcat(const NodePtr &node) { zero_copy_relative_offset.emplace_back(relative_offset); } GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(out_op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_ZERO_COPY_BASIC_OFFSET.c_str(), + out_op_desc->GetName().c_str(), out_op_desc->GetType().c_str()); GELOGE(FAILED, "SetListInt of zero_copy_basic_offset failed."); return FAILED); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(out_op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_ZERO_COPY_RELATIVE_OFFSET.c_str(), + out_op_desc->GetName().c_str(), out_op_desc->GetType().c_str()); GELOGE(FAILED, "SetListInt of zero_copy_relative_offset failed."); return FAILED); return SUCCESS; @@ -232,9 +252,14 @@ Status SetInputOutputOffsetPass::SetOutputOffsetForHcom(const NodePtr &node, con } GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(out_op_desc, ATTR_ZERO_COPY_BASIC_OFFSET, zero_copy_basic_offset), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_ZERO_COPY_BASIC_OFFSET.c_str(), + out_op_desc->GetName().c_str(), out_op_desc->GetType().c_str()); GELOGE(FAILED, "SetListInt of zero_copy_basic_offset failed."); return FAILED); GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(out_op_desc, ATTR_ZERO_COPY_RELATIVE_OFFSET, zero_copy_relative_offset), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", + ATTR_ZERO_COPY_RELATIVE_OFFSET.c_str(), + out_op_desc->GetName().c_str(), 
out_op_desc->GetType().c_str()); GELOGE(FAILED, "SetListInt of zero_copy_relative_offset failed."); return FAILED); return SUCCESS; diff --git a/ge/graph/passes/snapshot_pass.cc b/ge/graph/passes/snapshot_pass.cc index 2b578e51..469a70af 100644 --- a/ge/graph/passes/snapshot_pass.cc +++ b/ge/graph/passes/snapshot_pass.cc @@ -29,6 +29,8 @@ Status SnapshotPass::Run(NodePtr &node) { string type; Status status_ret = GetOriginalType(node, type); if (status_ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get OriginalType of op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(status_ret, "SnapshotPass get original type failed."); return status_ret; } diff --git a/ge/graph/passes/stop_gradient_pass.cc b/ge/graph/passes/stop_gradient_pass.cc index 223e4513..33d07803 100644 --- a/ge/graph/passes/stop_gradient_pass.cc +++ b/ge/graph/passes/stop_gradient_pass.cc @@ -20,12 +20,15 @@ namespace ge { Status StopGradientPass::Run(NodePtr &node) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(FAILED, "parameter is null."); return FAILED; } string type; Status status_ret = GetOriginalType(node, type); if (status_ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Get OriginalType of op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(status_ret, "StopGradientPass get original type failed."); return status_ret; } diff --git a/ge/graph/passes/subexpression_migration_pass.cc b/ge/graph/passes/subexpression_migration_pass.cc index 05b7baa1..d70ed05d 100755 --- a/ge/graph/passes/subexpression_migration_pass.cc +++ b/ge/graph/passes/subexpression_migration_pass.cc @@ -144,6 +144,8 @@ Status SubexpressionMigrationPass::ClassifyDataNodes(const ComputeGraphPtr &grap for (const auto &name : func_desc->GetSubgraphInstanceNames()) { const auto &subgraph = graph->GetSubgraph(name); if (subgraph == nullptr) { + REPORT_INNER_ERROR("E19999", "Get subgraph from graph:%s by name:%s failed", + graph->GetName().c_str(), name.c_str()); GELOGE(GE_GRAPH_EMPTY_SUBGRAPH, "Subgraph not found, name: %s", name.c_str()); return GE_GRAPH_EMPTY_SUBGRAPH; } @@ -156,6 +158,8 @@ Status SubexpressionMigrationPass::ClassifyDataNodes(const ComputeGraphPtr &grap uint32_t parent_index = 0; if (!AttrUtils::GetInt(data->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ATTR_NAME_PARENT_NODE_INDEX.c_str(), + data->GetName().c_str(), data->GetType().c_str()); GELOGE(FAILED, "Parent index not found, name: %s", data->GetName().c_str()); return FAILED; } @@ -229,6 +233,7 @@ bool SubexpressionMigrationPass::IsParallelNodeSame(const mapsecond; auto data_it = data_nodes.find(node_idx); if (data_it == data_nodes.end()) { + REPORT_INNER_ERROR("E19999", "Find node in data_nodes by index:%u failed", node_idx); GELOGE(FAILED, "Data: %s not fount, index: %u", base_node->GetName().c_str(), node_idx); return false; } @@ -238,12 +243,15 @@ bool SubexpressionMigrationPass::IsParallelNodeSame(const mapGetPeerInDataAnchors(); const auto &in_anchor = in_anchors.at(anchor_idx); if (in_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Index:%u anchor not exist in out:%u data anchor's peer of node:%s(%s)", + node_idx, kDataOutIndex, work_data->GetName().c_str(), work_data->GetType().c_str()); GELOGE(FAILED, "Data anchor size: %u, anchor size: %zu", anchor_idx, in_anchors.size()); return false; } const auto &work_node = in_anchor->GetOwnerNode(); if (work_node == nullptr) { + 
REPORT_INNER_ERROR("E19999", "Owner node of anchor is nullptr, check invalid"); GELOGE(FAILED, "Data: %s not found, index: %u", base_node->GetName().c_str(), node_idx); return false; } @@ -338,17 +346,22 @@ Status SubexpressionMigrationPass::AppendParallelNode(mapGetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Parent index not found, name: %s", op_desc->GetName().c_str()); return FAILED; } if (!AttrUtils::SetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, item.second)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_PARENT_NODE_INDEX.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Parent index not found, name: %s", op_desc->GetName().c_str()); return FAILED; } @@ -392,12 +405,14 @@ Status SubexpressionMigrationPass::DetachParallelNode(const mapGetAllOutDataAnchorsSize(); ++i) { auto it_idx = outputs.find(i); if (it_idx == outputs.end()) { + REPORT_INNER_ERROR("E19999", "Node: %s parent index %u not found, check invalid", detach->GetName().c_str(), i); GELOGE(FAILED, "Node: %s parent index %u not found", detach->GetName().c_str(), i); return FAILED; } auto it_data = graph_datas.find(it_idx->second); if (it_data == graph_datas.end()) { + REPORT_INNER_ERROR("E19999", "Node: %s parent index %u not found, check invalid", detach->GetName().c_str(), i); GELOGE(FAILED, "Node: %s parent index %u not found", detach->GetName().c_str(), i); return FAILED; } @@ -444,6 +459,7 @@ Status SubexpressionMigrationPass::AttachParallelNode(const ComputeGraphPtr &gra for (uint32_t i = 0; i < attach->GetAllInDataAnchorsSize(); ++i) { auto it_idx = inputs.find(i); if (it_idx == inputs.end()) { + REPORT_INNER_ERROR("E19999", "Node: %s parent index %u not found, check invalid", attach->GetName().c_str(), i); GELOGE(FAILED, "Node: %s parent index %u not found", attach->GetName().c_str(), i); return FAILED; } @@ -505,6 +521,7 @@ Status SubexpressionMigrationPass::MoveNodeToParent(const ComputeGraphPtr &graph uint32_t anchor_idx, const map &inputs, const map &outputs) { if (inputs.empty()) { + REPORT_INNER_ERROR("E19999", "Param inputs is empty, check invalid"); GELOGE(FAILED, "Graph: %s, inputs is empty", graph->GetName().c_str()); return FAILED; } @@ -516,6 +533,8 @@ Status SubexpressionMigrationPass::MoveNodeToParent(const ComputeGraphPtr &graph const auto &subnodes = groups.second; auto it = subnodes.find(base_index); if (it == subnodes.end()) { + REPORT_INNER_ERROR("E19999", "Index:%u data node not found in graph:%s, check invalid", + base_index, subgraph->GetName().c_str()); GELOGE(FAILED, "Graph: %s, Data: %u node not found", subgraph->GetName().c_str(), base_index); return FAILED; } @@ -525,12 +544,15 @@ Status SubexpressionMigrationPass::MoveNodeToParent(const ComputeGraphPtr &graph const auto &in_anchors = out_anchor->GetPeerInDataAnchors(); const auto &in_anchor = in_anchors.at(anchor_idx); if (in_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Index:%u anchor not exist in out:%u data anchor's peer of node:%s(%s)", + anchor_idx, kDataOutIndex, base_data->GetName().c_str(), base_data->GetType().c_str()); GELOGE(FAILED, "Data anchor index: %u, anchor size: %zu", anchor_idx, in_anchors.size()); return FAILED; } move_node = in_anchor->GetOwnerNode(); if (move_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Owner node of anchor is nullptr, check invalid"); GELOGE(FAILED, "Data: %s not found, index: %u", base_data->GetName().c_str(), base_index); return FAILED; } diff --git a/ge/graph/passes/subgraph_const_migration_pass.cc 
b/ge/graph/passes/subgraph_const_migration_pass.cc index 0c0ca1d5..3b3b7e0b 100644 --- a/ge/graph/passes/subgraph_const_migration_pass.cc +++ b/ge/graph/passes/subgraph_const_migration_pass.cc @@ -141,6 +141,8 @@ Status SubgraphConstMigrationPass::ClassifyGraphNodes(const ComputeGraphPtr &gra for (const auto &name : func_desc->GetSubgraphInstanceNames()) { const auto &subgraph = graph->GetSubgraph(name); if (subgraph == nullptr) { + REPORT_INNER_ERROR("E19999", "Get subgraph from graph:%s by name:%s failed", + graph->GetName().c_str(), name.c_str()); GELOGE(GE_GRAPH_EMPTY_SUBGRAPH, "Subgraph not found, name: %s", name.c_str()); return GE_GRAPH_EMPTY_SUBGRAPH; } @@ -152,6 +154,8 @@ Status SubgraphConstMigrationPass::ClassifyGraphNodes(const ComputeGraphPtr &gra if (node->GetType() == DATA) { uint32_t parent_index = kInvalidParent; if (!AttrUtils::GetInt(node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ATTR_NAME_PARENT_NODE_INDEX.c_str(), + node->GetName().c_str(), node->GetType().c_str()); return FAILED; } @@ -326,17 +330,22 @@ Status SubgraphConstMigrationPass::AppendParallelNode(const NodePtr &func_node, OpDescBuilder op_builder(data_name, DATA); const auto op_desc = op_builder.AddInput("x").AddOutput("y").Build(); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "Build op:%s(%s) failed", data_name.c_str(), DATA); GELOGE(OUT_OF_MEMORY, "Create multi-batch subgraph data desc failed"); return OUT_OF_MEMORY; } uint32_t data_index = parent_index - kCaseInputBase; if (!AttrUtils::SetInt(op_desc, ATTR_NAME_INDEX, data_index)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_INDEX.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Parent index not found, name: %s", op_desc->GetName().c_str()); return FAILED; } if (!AttrUtils::SetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_PARENT_NODE_INDEX.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "Parent index not found, name: %s", op_desc->GetName().c_str()); return FAILED; } @@ -460,6 +469,8 @@ Status SubgraphConstMigrationPass::MoveNodeToParent(const ComputeGraphPtr &graph const map> &all_data_nodes, const string &node_key, uint32_t parent_index) { if (node_key.empty() || parent_index == kInvalidParent) { + REPORT_INNER_ERROR("E19999", "Param node_key is empty or param parent_index is 0x%X, check invalid", + kInvalidParent); GELOGE(FAILED, "Graph: %s, node key: %s, parent index: %u invalid", graph->GetName().c_str(), node_key.c_str(), parent_index); return FAILED; @@ -470,6 +481,8 @@ Status SubgraphConstMigrationPass::MoveNodeToParent(const ComputeGraphPtr &graph const auto &subgraph = item.first; const auto it_const = item.second.find(node_key); if (it_const == item.second.end()) { + REPORT_INNER_ERROR("E19999", "Const node name:%s not found in graph:%s, check invalid", + node_key.c_str(), subgraph->GetName().c_str()); GELOGE(FAILED, "Graph: %s, Const: %s node not found", subgraph->GetName().c_str(), node_key.c_str()); return FAILED; } @@ -477,11 +490,15 @@ Status SubgraphConstMigrationPass::MoveNodeToParent(const ComputeGraphPtr &graph const auto it_nodes = all_data_nodes.find(subgraph); if (it_nodes == all_data_nodes.end()) { + REPORT_INNER_ERROR("E19999", "Const node name:%s not found in graph:%s, check invalid", + node_key.c_str(), subgraph->GetName().c_str()); GELOGE(FAILED, 
"Graph: %s, Const: %s node not found", subgraph->GetName().c_str(), node_key.c_str()); return FAILED; } const auto it_data = it_nodes->second.find(parent_index); if (it_data == it_nodes->second.end()) { + REPORT_INNER_ERROR("E19999", "Const node name:%s not found in graph:%s, check invalid", + node_key.c_str(), subgraph->GetName().c_str()); GELOGE(FAILED, "Graph: %s, Const: %s node not found", subgraph->GetName().c_str(), node_key.c_str()); return FAILED; } diff --git a/ge/graph/passes/subgraph_pass.cc b/ge/graph/passes/subgraph_pass.cc index f140644e..b931eea8 100755 --- a/ge/graph/passes/subgraph_pass.cc +++ b/ge/graph/passes/subgraph_pass.cc @@ -94,6 +94,8 @@ Status SubgraphPass::SubgraphInputNode(const ComputeGraphPtr &graph, const NodeP uint32_t parent_index = 0; if (!AttrUtils::GetInt(node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ATTR_NAME_PARENT_NODE_INDEX.c_str(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "Get attr PARENT_NODE_INDEX failed, node:%s.", node->GetName().c_str()); return FAILED; } @@ -208,6 +210,8 @@ Status SubgraphPass::WhileBodySubgraph(const ComputeGraphPtr &graph, const NodeP // index of body_subgraph is 1 ComputeGraphPtr while_body = NodeUtils::GetSubgraph(*node, 1); if (while_body == nullptr) { + REPORT_INNER_ERROR("E19999", "While_body of node:%s(%s) is nullptr, check invalid", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "while_body of %s is NULL.", node->GetName().c_str()); return FAILED; } @@ -242,12 +246,16 @@ Status SubgraphPass::WhileBodySubgraph(const ComputeGraphPtr &graph, const NodeP if (output_node == nullptr) { output_node = n; } else { + REPORT_INNER_ERROR("E19999", "While_body graph:%s exists multi NetOutput nodes, check invalid", + while_body->GetName().c_str()); GELOGE(FAILED, "while_body %s exists multi NetOutput nodes.", while_body->GetName().c_str()); return FAILED; } } } if (output_node == nullptr) { + REPORT_INNER_ERROR("E19999", "While_body graph:%s has no output, check invalid", + while_body->GetName().c_str()); GELOGE(FAILED, "while_body %s has no output.", while_body->GetName().c_str()); return FAILED; } @@ -462,6 +470,10 @@ Status SubgraphPass::InsertMemcpyNode(const ComputeGraphPtr &graph, const OutDat (void)AttrUtils::SetBool(op_desc, ATTR_NO_NEED_CONSTANT_FOLDING, false); (void)AttrUtils::SetBool(op_desc, ATTR_NAME_CANNOT_BE_DELETED, true); if (GraphUtils::InsertNodeAfter(out_anchor, in_anchors, graph->AddNode(op_desc)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Insert Cast node %s(%s) after %s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), + out_anchor->GetOwnerNode()->GetName().c_str(), + out_anchor->GetOwnerNode()->GetType().c_str()); GELOGE(FAILED, "Insert IDENTITY node %s after %s failed.", name.c_str(), in_node->GetName().c_str()); return FAILED; } @@ -481,6 +493,9 @@ Status SubgraphPass::InsertMemcpyNode(const ComputeGraphPtr &graph, const OutDat Status SubgraphPass::InsertNodeBetween(const OutDataAnchorPtr &src, const std::vector &dsts, const NodePtr &insert_node, uint32_t input_index, uint32_t output_index) { if (GraphUtils::AddEdge(src, insert_node->GetInDataAnchor(input_index)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%u) failed", + src->GetOwnerNode()->GetName().c_str(), src->GetOwnerNode()->GetType().c_str(), src->GetIdx(), + insert_node->GetName().c_str(), insert_node->GetType().c_str(), 
input_index); GELOGE(FAILED, "Add data_edge %s:%d->%s:%u failed.", src->GetOwnerNode()->GetName().c_str(), src->GetIdx(), insert_node->GetName().c_str(), input_index); return FAILED; @@ -490,6 +505,12 @@ Status SubgraphPass::InsertNodeBetween(const OutDataAnchorPtr &src, const std::v dst->GetOwnerNode()->GetName().c_str()); if ((GraphUtils::RemoveEdge(src, dst) != GRAPH_SUCCESS) || (GraphUtils::AddEdge(insert_node->GetOutDataAnchor(output_index), dst) != GRAPH_SUCCESS)) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%u) or " + "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%u) failed", + src->GetOwnerNode()->GetName().c_str(), src->GetOwnerNode()->GetType().c_str(), src->GetIdx(), + dst->GetOwnerNode()->GetName().c_str(), dst->GetOwnerNode()->GetType().c_str(), dst->GetIdx(), + insert_node->GetName().c_str(), insert_node->GetType().c_str(), output_index, + dst->GetOwnerNode()->GetName().c_str(), dst->GetOwnerNode()->GetType().c_str(), dst->GetIdx()); GELOGE(FAILED, "Replace data_edge %s:%d->%s:%d by %s:%u->%s:%d failed.", src->GetOwnerNode()->GetName().c_str(), src->GetIdx(), dst->GetOwnerNode()->GetName().c_str(), dst->GetIdx(), diff --git a/ge/graph/passes/switch_data_edges_bypass.cc b/ge/graph/passes/switch_data_edges_bypass.cc index f7453dd7..6a925ae3 100644 --- a/ge/graph/passes/switch_data_edges_bypass.cc +++ b/ge/graph/passes/switch_data_edges_bypass.cc @@ -50,6 +50,8 @@ bool IsSwitchInWhileLoop(const NodePtr &node) { std::vector> GetOutDataNodesByIndex(const NodePtr &node, int index) { auto out_anchor = node->GetOutDataAnchor(index); if (out_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no index:%d out data anchor, check invalid", + node->GetName().c_str(), node->GetType().c_str(), index); GELOGE(PARAM_INVALID, "Failed to get out data nodes of index %d from node %s, the anchor does not exists", index, node->GetName().c_str()); return {}; @@ -84,18 +86,23 @@ NodePtr AddIdentityAfterNode(const NodePtr &node, int index) { auto node_desc = node->GetOpDesc(); if (node_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "Failed to add identity after node %s index %d, the op desc is null", node->GetName().c_str(), index); return nullptr; } auto tensor = node_desc->GetOutputDescPtr(index); if (tensor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no index:%d output tensor, check invalid", + node_desc->GetName().c_str(), node_desc->GetType().c_str(), index); GELOGE(INTERNAL_ERROR, "Failed to find the tensor by index %d from node %s, can not add the identity node", index, node->GetName().c_str()); return nullptr; } auto anchor = node->GetOutDataAnchor(index); if (anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no index:%d out data anchor, check invalid", + node->GetName().c_str(), node->GetType().c_str(), index); GELOGE(OUT_OF_MEMORY, "Failed to add identity after node %s index %d, the out anchor does not exists", node->GetName().c_str(), index); return nullptr; @@ -104,6 +111,7 @@ NodePtr AddIdentityAfterNode(const NodePtr &node, int index) { auto identity_opdesc = MakeShared("SwitchDataEdgesByPass_Identity_" + std::to_string(identity_counter), IDENTITY); if (identity_opdesc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(OUT_OF_MEMORY, "Failed to add identity after node %s index %d", node->GetName().c_str(), index); return nullptr; } @@ -111,6 +119,9 @@ NodePtr 
AddIdentityAfterNode(const NodePtr &node, int index) { auto ret2 = identity_opdesc->AddOutputDesc("y", *tensor); auto identity = node->GetOwnerComputeGraph()->AddNode(identity_opdesc); if (ret1 != GRAPH_SUCCESS || ret2 != GRAPH_SUCCESS || identity == nullptr) { + REPORT_CALL_ERROR("E19999", "Add input/output desc to op:%s(%s) failed or add it to graph:%s failed", + identity_opdesc->GetName().c_str(), identity_opdesc->GetType().c_str(), + node->GetOwnerComputeGraph()->GetName().c_str()); GELOGE(OUT_OF_MEMORY, "Failed to add identity after node %s index %d", node->GetName().c_str(), index); return nullptr; } @@ -124,18 +135,23 @@ NodePtr AddMemcpyBeforeNode(const NodePtr &node, int index) { auto node_desc = node->GetOpDesc(); if (node_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "Failed to add memcpy before node %s index %d, null op desc", node->GetName().c_str(), index); return nullptr; } auto tensor = node_desc->GetInputDescPtr(index); if (tensor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no index:%d input tensor, check invalid", + node_desc->GetName().c_str(), node_desc->GetType().c_str(), index); GELOGE(INTERNAL_ERROR, "Failed to find the tensor by index %d from node %s, can not add the memcpy node", index, node->GetName().c_str()); return nullptr; } auto anchor = node->GetInDataAnchor(index); if (anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no index:%d in data anchor, check invalid", + node->GetName().c_str(), node->GetType().c_str(), index); GELOGE(INTERNAL_ERROR, "Failed to add memcpy before node %s index %d, the in anchor does not exists", node->GetName().c_str(), index); return nullptr; @@ -143,6 +159,7 @@ NodePtr AddMemcpyBeforeNode(const NodePtr &node, int index) { auto memcpy_opdesc = MakeShared("SwitchDataEdgesByPass_Memcpy_" + std::to_string(counter), MEMCPYASYNC); if (memcpy_opdesc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(OUT_OF_MEMORY, "Failed to add memcpy before node %s index %d", node->GetName().c_str(), index); return nullptr; } @@ -150,6 +167,9 @@ NodePtr AddMemcpyBeforeNode(const NodePtr &node, int index) { auto ret2 = memcpy_opdesc->AddOutputDesc(*tensor); auto memcpy_node = node->GetOwnerComputeGraph()->AddNode(memcpy_opdesc); if (ret1 != GRAPH_SUCCESS || ret2 != GRAPH_SUCCESS || memcpy_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add input/output desc to op:%s(%s) failed or add it to graph:%s failed", + memcpy_opdesc->GetName().c_str(), memcpy_opdesc->GetType().c_str(), + node->GetOwnerComputeGraph()->GetName().c_str()); GELOGE(OUT_OF_MEMORY, "Failed to add memcpy before node %s index %d", node->GetName().c_str(), index); return nullptr; } diff --git a/ge/graph/passes/switch_dead_branch_elimination.cc b/ge/graph/passes/switch_dead_branch_elimination.cc index 20598f17..b840bfc7 100644 --- a/ge/graph/passes/switch_dead_branch_elimination.cc +++ b/ge/graph/passes/switch_dead_branch_elimination.cc @@ -31,6 +31,7 @@ const int kDefaultInputIndex = -1; bool ParsePred(const ConstGeTensorPtr &tensor) { if (tensor == nullptr) { + REPORT_INNER_ERROR("E19999", "Param tensor is nullptr, check invalid"); GELOGE(FAILED, "parameter is null."); return false; } @@ -65,6 +66,8 @@ bool ParseOutDataAnchors(const NodePtr &node, const NodePtr &pred_node, OutDataA OutDataAnchorPtr &inactive_out_data_anchor) { auto tensors = OpDescUtils::MutableWeights(pred_node); if (tensors.empty()) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no
weight, check invalid", + pred_node->GetName().c_str(), pred_node->GetType().c_str()); return false; } @@ -72,6 +75,7 @@ bool ParseOutDataAnchors(const NodePtr &node, const NodePtr &pred_node, OutDataA int inactive_output_index = pred_value ? 0 : 1; if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(FAILED, "parameter is null."); return false; } @@ -91,6 +95,7 @@ bool ParseOutDataAnchors(const NodePtr &node, const NodePtr &pred_node, OutDataA Status SwitchDeadBranchElimination::DeleteSwitchNode(NodePtr &node, NodePtr &pred_node, const OutDataAnchorPtr &active_out_data_anchor) { if (node == nullptr || active_out_data_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node or active_out_data_anchor is nullptr, check invalid"); GELOGE(FAILED, "parameter is null."); return FAILED; } @@ -102,6 +107,9 @@ Status SwitchDeadBranchElimination::DeleteSwitchNode(NodePtr &node, NodePtr &pre // link pred's in control nodes to switch if (GraphUtils::CopyInCtrlEdges(pred_node, node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Copy in control edge from node:%s(%s) to node:%s(%s) failed", + pred_node->GetName().c_str(), pred_node->GetType().c_str(), + node->GetName().c_str(), node->GetType().c_str()); return FAILED; } // Remove link between pred and switch @@ -114,6 +122,8 @@ Status SwitchDeadBranchElimination::DeleteSwitchNode(NodePtr &node, NodePtr &pre std::vector switch_io_map = {kDefaultInputIndex, kDefaultInputIndex}; size_t out_index = static_cast(active_out_data_anchor->GetIdx()); if (out_index >= switch_io_map.size()) { + REPORT_INNER_ERROR("E19999", "Out index:%zu of node:%s(%s) >= %zu, check invalid", out_index, + node->GetName().c_str(), node->GetType().c_str(), switch_io_map.size()); GELOGE(FAILED, "[%s] out index check failed, out_index:%zu.", node->GetName().c_str(), out_index); return FAILED; } @@ -123,6 +133,7 @@ Status SwitchDeadBranchElimination::DeleteSwitchNode(NodePtr &node, NodePtr &pre Status SwitchDeadBranchElimination::Run(NodePtr &node) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "Param [node] must not be null."); return PARAM_INVALID; } @@ -168,6 +179,8 @@ Status SwitchDeadBranchElimination::Run(NodePtr &node) { std::vector end_nodes; Status ret = PassUtils::RemoveInactiveBranchToMerge(inactive_out_data_anchor, del_nodes, end_nodes); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove inactive branch from node:%s(%s) to merge failed", + node->GetName().c_str(), node->GetType().c_str()); return ret; } diff --git a/ge/graph/passes/switch_logic_remove_pass.cc b/ge/graph/passes/switch_logic_remove_pass.cc index a6758e86..bce714ad 100644 --- a/ge/graph/passes/switch_logic_remove_pass.cc +++ b/ge/graph/passes/switch_logic_remove_pass.cc @@ -45,11 +45,15 @@ Status GetPredNode(const NodePtr &switch_node, PredNodeAndOut &pred_node_index) GE_CHECK_NOTNULL(switch_node); auto pred_in_anchor = switch_node->GetInDataAnchor(kSwitchPredIndex); if (pred_in_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no index:%d in data anchor, check invalid", + switch_node->GetName().c_str(), switch_node->GetType().c_str(), kSwitchPredIndex); GELOGE(INTERNAL_ERROR, "Failed to get pred node for switch %s, no pred anchor", switch_node->GetName().c_str()); return INTERNAL_ERROR; } auto pred_node_anchor = pred_in_anchor->GetPeerOutAnchor(); if (pred_node_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s)'s index:%d in data anchor, 
its peer anchor is nullptr, check invalid", + switch_node->GetName().c_str(), switch_node->GetType().c_str(), kSwitchPredIndex); GELOGE(INTERNAL_ERROR, "Failed to get pred node for switch %s, node peer out anchor", switch_node->GetName().c_str()); @@ -57,6 +61,8 @@ Status GetPredNode(const NodePtr &switch_node, PredNodeAndOut &pred_node_index) } auto pred_node = pred_node_anchor->GetOwnerNode(); if (pred_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s)'s index:%d in data anchor, its peer node is nullptr, check invalid", + switch_node->GetName().c_str(), switch_node->GetType().c_str(), kSwitchPredIndex); GELOGE(INTERNAL_ERROR, "Failed to get pred node for switch %s, null node", switch_node->GetName().c_str()); @@ -89,11 +95,15 @@ Status SwitchLogicRemovePass::Run(NodePtr &node) { } for (auto &in_anchor : out_anchor->GetPeerInDataAnchors()) { if (in_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s)'s index:%d out data anchor, its peer anchors has nullptr, " + "check invalid", node->GetName().c_str(), node->GetType().c_str(), i); GELOGE(INTERNAL_ERROR, "The in-anchor from out anchor %d node %s is null", i, node->GetName().c_str()); return INTERNAL_ERROR; } auto dst_node = in_anchor->GetOwnerNode(); if (dst_node == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s)'s index:%d out data anchor, its peer nodes has nullptr, " + "check invalid", node->GetName().c_str(), node->GetType().c_str(), i); GELOGE(INTERNAL_ERROR, "The peer node from out anchor %d node %s is null", i, node->GetName().c_str()); return INTERNAL_ERROR; } @@ -143,6 +153,8 @@ Status SwitchLogicRemovePass::RemoveSwitchNodeLogically(int parent_index, NodePt std::vector end_nodes; auto ret = PassUtils::RemoveInactiveBranchToMerge(out_anchor, deleted_nodes, end_nodes); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove inactive branch from node:%s(%s) to merge failed", + switch_node->GetName().c_str(), switch_node->GetType().c_str()); return ret; } diff --git a/ge/graph/passes/switch_to_stream_switch_pass.cc b/ge/graph/passes/switch_to_stream_switch_pass.cc index af8017d8..97d9926f 100644 --- a/ge/graph/passes/switch_to_stream_switch_pass.cc +++ b/ge/graph/passes/switch_to_stream_switch_pass.cc @@ -33,8 +33,14 @@ Status SwitchToStreamSwitchPass::Run(ComputeGraphPtr graph) { GE_CHK_STATUS_RET(CombineSwitchNode(graph), "Combine StreamSwitch nodes failed."); for (const auto &node : bypass_nodes_) { - GE_CHK_BOOL_EXEC(graph->IsolateNode(node) == GRAPH_SUCCESS, return FAILED, "Isolate node failed."); - GE_CHK_BOOL_EXEC(GraphUtils::RemoveNodeWithoutRelink(graph, node) == GRAPH_SUCCESS, return FAILED, + GE_CHK_BOOL_EXEC(graph->IsolateNode(node) == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Isolate node:%s(%s) in graph:%s failed", + node->GetName().c_str(), node->GetType().c_str(), graph->GetName().c_str()); + return FAILED, "Isolate node failed."); + GE_CHK_BOOL_EXEC(GraphUtils::RemoveNodeWithoutRelink(graph, node) == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + node->GetName().c_str(), node->GetType().c_str(), graph->GetName().c_str()); + return FAILED, "Remove switch node failed."); } @@ -159,7 +165,11 @@ Status SwitchToStreamSwitchPass::ReplaceSwitchNode(const ComputeGraphPtr &graph, OpDescPtr cond_desc = peer_cond_anchor->GetOwnerNode()->GetOpDesc(); GE_CHECK_NOTNULL(cond_desc); DataType cond_data_type = cond_desc->GetOutputDesc(peer_cond_anchor->GetIdx()).GetDataType(); - GE_CHK_BOOL_EXEC(cond_data_type == DT_BOOL, return FAILED, + 
GE_CHK_BOOL_EXEC(cond_data_type == DT_BOOL, + REPORT_INNER_ERROR("E19999", "Pred_input of Switch node:%s(%s) only support DT_BOOL data_type, " + "but %s exactly", switch_node->GetName().c_str(), switch_node->GetType().c_str(), + TypeUtils::DataTypeToSerialString(cond_data_type).c_str()); + return FAILED, "pred_input of Switch only support DT_BOOL data_type, but %s exactly.", TypeUtils::DataTypeToSerialString(cond_data_type).c_str()); @@ -176,6 +186,8 @@ Status SwitchToStreamSwitchPass::ReplaceSwitchNode(const ComputeGraphPtr &graph, stream_switch = CreateStreamSwitchNode(graph, switch_node, true_branch_flag ? "_t" : "_f", peer_cond_anchor); GE_CHK_BOOL_EXEC(stream_switch != nullptr, return FAILED, "Create stream_switch node failed."); if (SetSwitchTrueBranchFlag(stream_switch, true_branch_flag) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set switch true branch flag from node:%s(%s) failed", + stream_switch->GetName().c_str(), stream_switch->GetType().c_str()); GELOGE(FAILED, "SetSwitchTrueBranchFlag for node %s failed.", stream_switch->GetName().c_str()); return FAILED; } @@ -204,6 +216,8 @@ Status SwitchToStreamSwitchPass::ReplaceSwitchNode(const ComputeGraphPtr &graph, MoveCtrlEdges(switch_node, stream_switch); switch_node_map_[stream_switch] = out_node_list; if (SetOriginalNodeName(stream_switch, switch_node->GetName()) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set original node name:%s to node:%s(%s) failed", switch_node->GetName().c_str(), + stream_switch->GetName().c_str(), stream_switch->GetType().c_str()); GELOGE(FAILED, "SetOriginalNodeName for node %s failed.", stream_switch->GetName().c_str()); return FAILED; } @@ -230,6 +244,10 @@ Status SwitchToStreamSwitchPass::BypassSwitchNode(const NodePtr &switch_node, Ou GE_CHECK_NOTNULL(peer_out_anchor); // Remove Switch data input. 
if (GraphUtils::RemoveEdge(peer_out_anchor, in_data_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%u) failed", + peer_out_anchor->GetOwnerNode()->GetName().c_str(), + peer_out_anchor->GetOwnerNode()->GetType().c_str(), peer_out_anchor->GetIdx(), + switch_node->GetName().c_str(), switch_node->GetType().c_str(), idx); GELOGE(FAILED, "Remove data edge %s->%s failed.", peer_out_anchor->GetOwnerNode()->GetName().c_str(), switch_node->GetName().c_str()); return FAILED; @@ -284,8 +302,13 @@ NodePtr SwitchToStreamSwitchPass::CreateStreamSwitchNode(const ComputeGraphPtr & const std::string &suffix, const OutDataAnchorPtr &peer_cond_anchor) { OpDescPtr switch_op_desc = switch_node->GetOpDesc(); - GE_CHK_BOOL_EXEC(switch_op_desc != nullptr, return nullptr, "OpDesc of Switch node is invalid."); + GE_CHK_BOOL_EXEC(switch_op_desc != nullptr, + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); + return nullptr, "OpDesc of Switch node is invalid."); GE_IF_BOOL_EXEC(switch_op_desc->GetInputsSize() != SWITCH_INPUT_NUM, { + REPORT_INNER_ERROR("E19999", "Input desc size:%zu of node:%s(%s) not equal to %u, check invalid", + switch_op_desc->GetInputsSize(), + switch_op_desc->GetName().c_str(), switch_op_desc->GetType().c_str(), SWITCH_INPUT_NUM); GELOGE(FAILED, "Switch input param invalid, input_size=%lu, should be %u.", switch_op_desc->GetInputsSize(), SWITCH_INPUT_NUM); return nullptr; @@ -295,6 +318,7 @@ NodePtr SwitchToStreamSwitchPass::CreateStreamSwitchNode(const ComputeGraphPtr & GELOGI("Create StreamSwitch, name=%s.", node_name.c_str()); OpDescPtr op_desc = MakeShared(node_name, STREAMSWITCH); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "Create op_desc failed, StreamSwitch:%s.", node_name.c_str()); return nullptr; } @@ -316,6 +340,9 @@ NodePtr SwitchToStreamSwitchPass::CreateStreamSwitchNode(const ComputeGraphPtr & if (!AttrUtils::SetInt(op_desc, ATTR_NAME_SWITCH_DATA_TYPE, RT_SWITCH_INT32) || !AttrUtils::SetInt(op_desc, ATTR_NAME_STREAM_SWITCH_COND, (int64_t)RT_EQUAL)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s or Attr:%s to op:%s(%s) failed", + ATTR_NAME_SWITCH_DATA_TYPE.c_str(), ATTR_NAME_STREAM_SWITCH_COND.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set int failed"); return nullptr; } @@ -323,13 +350,22 @@ NodePtr SwitchToStreamSwitchPass::CreateStreamSwitchNode(const ComputeGraphPtr & // Already checked, first input is Variable will passed, second is condition will checked. 
GeTensorDesc cond_input_desc = switch_op_desc->GetInputDesc(SWITCH_PRED_INPUT); GeTensorDesc input_desc(GeShape(cond_input_desc.GetShape().GetDims()), cond_input_desc.GetFormat(), DT_INT32); - GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(input_desc) == GRAPH_SUCCESS, return nullptr, + GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(input_desc) == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + return nullptr, "Create StreamSwitch node: add input desc failed."); - GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(input_desc) == GRAPH_SUCCESS, return nullptr, + GE_CHK_BOOL_EXEC(op_desc->AddInputDesc(input_desc) == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + return nullptr, "Create StreamSwitch node: add input desc failed."); NodePtr stream_switch = graph->AddNode(op_desc); - GE_CHK_BOOL_EXEC(stream_switch != nullptr, return nullptr, "Insert StreamSwitch node failed."); + GE_CHK_BOOL_EXEC(stream_switch != nullptr, + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); + return nullptr, "Insert StreamSwitch node failed."); GE_CHK_STATUS(GraphUtils::AddEdge(peer_cond_anchor, stream_switch->GetInDataAnchor(0)), "StreamSwitch node add cond edge failed."); @@ -361,6 +397,8 @@ Status SwitchToStreamSwitchPass::MarkBranches(const OutDataAnchorPtr &peer_cond_ it->second[switch_group_id] = switch_list; } else { GE_IF_BOOL_EXEC(switch_group_it->second.size() != SWITCH_OUTPUT_NUM, { + REPORT_INNER_ERROR("E19999", "switch group size:%zu not equal to %u, group_id:%ld, check invalid", + switch_group_it->second.size(), SWITCH_OUTPUT_NUM, switch_group_id); GELOGE(INTERNAL_ERROR, "Check size failed, node: %s", stream_switch->GetName().c_str()); return FAILED; }); @@ -443,6 +481,8 @@ Status SwitchToStreamSwitchPass::CombineSwitchNode(const ComputeGraphPtr &graph) GE_CHK_STATUS(GraphUtils::AddEdge(cast_node->GetOutControlAnchor(), active_node->GetInControlAnchor()), "StreamActive add ctl edge failed."); if (SetActiveLabelList(active_node, { cast_node->GetName() }) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set active label list:%s to op:%s(%s) failed", + cast_node->GetName().c_str(), active_node->GetName().c_str(), active_node->GetType().c_str()); GELOGE(FAILED, "Set active_label_list attr for node %s failed.", active_node->GetName().c_str()); return FAILED; } @@ -456,7 +496,13 @@ Status SwitchToStreamSwitchPass::CombineSwitchNode(const ComputeGraphPtr &graph) // select first stream_switch NodePtr stream_switch = switch_list.front(); // set stream_label - GE_CHK_STATUS_RET(SetStreamLabel(stream_switch, cast_node->GetName()), "Set stream label failed."); + if (SetStreamLabel(stream_switch, cast_node->GetName()) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set stream_label:%s to op:%s(%s) failed", + cast_node->GetName().c_str(), stream_switch->GetName().c_str(), + stream_switch->GetType().c_str()); + GELOGE(FAILED, "Set stream label failed."); + return FAILED; + } OpDescPtr switch_desc = stream_switch->GetOpDesc(); GE_CHECK_NOTNULL(switch_desc); switch_desc->SetName(CheckDuplicateName(cond_group + "/" + STREAMSWITCH + (true_branch_flag ?
"_t" : "_f"))); @@ -497,18 +543,27 @@ NodePtr SwitchToStreamSwitchPass::CreateActiveNode(const ComputeGraphPtr &graph, GELOGI("Create StreamActive op:%s.", node_name.c_str()); OpDescPtr op_desc = MakeShared(node_name, STREAMACTIVE); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "Create op_desc failed, StreamActive:%s.", node_name.c_str()); return nullptr; } NodePtr active_node = graph->AddNode(op_desc); - GE_CHK_BOOL_EXEC(active_node != nullptr, return nullptr, "Create StreamActive node failed."); + GE_CHK_BOOL_EXEC(active_node != nullptr, + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), graph->GetName().c_str()); + return nullptr, "Create StreamActive node failed."); GE_IF_BOOL_EXEC(GraphUtils::AddEdge(node->GetOutControlAnchor(), active_node->GetInControlAnchor()) != SUCCESS, + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + active_node->GetName().c_str(), active_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "add edge failed"); return nullptr); GE_IF_BOOL_EXEC(SetSwitchBranchNodeLabel(active_node, node_name) != SUCCESS, + REPORT_CALL_ERROR("E19999", "Set switch branch node label:%s to node:%s(%s) failed", + node_name.c_str(), active_node->GetName().c_str(), active_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set switch branch node label failed"); return nullptr); @@ -529,6 +584,7 @@ NodePtr SwitchToStreamSwitchPass::CreateCastOp(const ComputeGraphPtr &graph, con GELOGI("Create cast_node: %s, input datatype:DT_BOOL, out datatype:DT_INT32", cast_name.c_str()); OpDescPtr cast_desc = MakeShared(cast_name, CAST); if (cast_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "Create op_desc failed, Cast:%s.", cast_name.c_str()); return nullptr; } @@ -536,6 +592,10 @@ NodePtr SwitchToStreamSwitchPass::CreateCastOp(const ComputeGraphPtr &graph, con AttrUtils::SetInt(cast_desc, CAST_ATTR_DSTT, (int64_t)DT_INT32) && AttrUtils::SetInt(cast_desc, CAST_ATTR_DST_TYPE, (int64_t)DT_INT32) && AttrUtils::SetBool(cast_desc, CAST_ATTR_TRUNCATE, false))) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s or %s or %s or %s to op:%s(%s) failed", + CAST_ATTR_SRCT.c_str(), CAST_ATTR_DSTT.c_str(), + CAST_ATTR_DST_TYPE.c_str(), CAST_ATTR_TRUNCATE.c_str(), + cast_desc->GetName().c_str(), cast_desc->GetType().c_str()); GELOGE(FAILED, "Set CAST_ATTR_SRCT or CAST_ATTR_DSTT or CAST_ATTR_DST_TYPE or CAST_ATTR_TRUNCATE failed, node: %s.", cast_name.c_str()); return nullptr; @@ -543,14 +603,24 @@ NodePtr SwitchToStreamSwitchPass::CreateCastOp(const ComputeGraphPtr &graph, con GeTensorDesc tensor_desc = cond_desc->GetOutputDesc(peer_cond_anchor->GetIdx()); tensor_desc.SetDataType(DT_BOOL); - GE_CHK_BOOL_EXEC(cast_desc->AddInputDesc(tensor_desc) == SUCCESS, return nullptr, + GE_CHK_BOOL_EXEC(cast_desc->AddInputDesc(tensor_desc) == SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + cast_desc->GetName().c_str(), cast_desc->GetType().c_str()); + return nullptr, "Cast_node add input desc failed."); tensor_desc.SetDataType(DT_INT32); - GE_CHK_BOOL_EXEC(cast_desc->AddOutputDesc(tensor_desc) == SUCCESS, return nullptr, + GE_CHK_BOOL_EXEC(cast_desc->AddOutputDesc(tensor_desc) == SUCCESS, + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed", + cast_desc->GetName().c_str(), cast_desc->GetType().c_str()); + return nullptr, "Cast_node add output 
desc failed."); NodePtr cast_node = graph->AddNode(cast_desc); - GE_CHK_BOOL_EXEC(cast_node != nullptr, return nullptr, "Create cast_node failed."); + GE_CHK_BOOL_EXEC(cast_node != nullptr, + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + cast_desc->GetName().c_str(), cast_desc->GetType().c_str(), + graph->GetName().c_str()); + return nullptr, "Create cast_node failed."); // Cast node has and only has one input GE_CHK_STATUS(GraphUtils::AddEdge(peer_cond_anchor, cast_node->GetInDataAnchor(0)), "Cast add data edge failed."); @@ -567,13 +637,18 @@ Status SwitchToStreamSwitchPass::AddConstNode(const ComputeGraphPtr &graph, cons OpDescPtr op_desc = stream_switch->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); bool value = false; - GE_CHK_BOOL_EXEC(AttrUtils::GetBool(op_desc, ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG, value), return FAILED, + GE_CHK_BOOL_EXEC(AttrUtils::GetBool(op_desc, ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG, value), + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", + ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + return FAILED, "StreamSwitch get attr TRUE_BRANCH_STREAM failed."); const std::string &const_node_name = op_desc->GetName() + "_Constant_" + (value ? "t" : "f"); GELOGI("Create const op: %s", const_node_name.c_str()); OpDescPtr const_op_desc = MakeShared(const_node_name, CONSTANT); if (const_op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "Create op_desc failed, Constant:%s.", const_node_name.c_str()); return FAILED; } @@ -583,15 +658,26 @@ Status SwitchToStreamSwitchPass::AddConstNode(const ComputeGraphPtr &graph, cons GeTensorPtr const_value = MakeShared(data_desc, reinterpret_cast(&resize_value), sizeof(int32_t)); if (const_value == nullptr) { + REPORT_CALL_ERROR("E19999", "New GeTensor failed"); GELOGE(FAILED, "Create tensor failed."); return FAILED; } - GE_CHK_BOOL_EXEC(AttrUtils::SetTensor(const_op_desc, ATTR_NAME_WEIGHTS, const_value), return FAILED); - GE_CHK_BOOL_EXEC(const_op_desc->AddOutputDesc(data_desc) == GRAPH_SUCCESS, return FAILED, + GE_CHK_BOOL_EXEC(AttrUtils::SetTensor(const_op_desc, ATTR_NAME_WEIGHTS, const_value), + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ATTR_NAME_WEIGHTS.c_str(), + const_op_desc->GetName().c_str(), const_op_desc->GetType().c_str()); + return FAILED); + GE_CHK_BOOL_EXEC(const_op_desc->AddOutputDesc(data_desc) == GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed", + const_op_desc->GetName().c_str(), const_op_desc->GetType().c_str()); + return FAILED, "Create Const op: add output desc failed."); NodePtr const_node = graph->AddNode(const_op_desc); - GE_CHK_BOOL_EXEC(const_node != nullptr, return FAILED, "Insert Const node failed."); + GE_CHK_BOOL_EXEC(const_node != nullptr, + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + const_op_desc->GetName().c_str(), const_op_desc->GetType().c_str(), + graph->GetName().c_str()); + return FAILED, "Insert Const node failed."); GE_CHK_STATUS(GraphUtils::AddEdge(const_node->GetOutDataAnchor(0), stream_switch->GetInDataAnchor(1)), "StreamSwitch node add ctl edge failed."); @@ -613,6 +699,8 @@ Status SwitchToStreamSwitchPass::ModifySwitchInCtlEdges(const NodePtr &switch_no OpDescPtr switch_desc = switch_node->GetOpDesc(); GE_CHECK_NOTNULL(switch_desc); if (!AttrUtils::GetStr(switch_desc, ATTR_NAME_ORIG_NODE_NAME, orig_switch_name) || orig_switch_name.empty()) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from 
op:%s(%s) failed", ATTR_NAME_ORIG_NODE_NAME.c_str(), + switch_desc->GetName().c_str(), switch_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Get attr ATTR_NAME_ORIG_NODE_NAME failed, node: %s", switch_desc->GetName().c_str()); return INTERNAL_ERROR; } @@ -634,6 +722,8 @@ Status SwitchToStreamSwitchPass::ModifySwitchInCtlEdges(const NodePtr &switch_no auto find_res1 = switch_node_map_.find(in_ctrl_node); GE_IF_BOOL_EXEC(find_res1 == switch_node_map_.end(), { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) can't find in switch_node_map_, check invalid", + in_ctrl_node->GetName().c_str(), in_ctrl_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "StreamSwitch node %s not found in switch_node_map_.", in_ctrl_node->GetName().c_str()); return INTERNAL_ERROR; }); @@ -662,10 +752,14 @@ Status SwitchToStreamSwitchPass::ModifySwitchOutCtlEdges(const NodePtr &switch_n stream_switch->GetName().c_str(), active_node->GetName().c_str()); auto find_res = switch_node_map_.find(switch_node); GE_IF_BOOL_EXEC(find_res == switch_node_map_.end(), { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) can't find in switch_node_map_, check invalid", + switch_node->GetName().c_str(), switch_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "StreamSwitch node %s not found in switch_node_map_.", switch_node->GetName().c_str()); return INTERNAL_ERROR; }); GE_IF_BOOL_EXEC(find_res->second.empty(), { + REPORT_INNER_ERROR("E19999", "True_nodes of StreamSwitch node:%s(%s) is empty, check invalid", + switch_node->GetName().c_str(), switch_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "true_nodes of StreamSwitch node %s is empty.", switch_node->GetName().c_str()); return INTERNAL_ERROR; }); @@ -678,6 +772,8 @@ Status SwitchToStreamSwitchPass::ModifySwitchOutCtlEdges(const NodePtr &switch_n std::string orig_name = op_desc->GetName(); GE_IF_BOOL_EXEC(op_desc->HasAttr(ATTR_NAME_ORIG_NODE_NAME), { if (!AttrUtils::GetStr(op_desc, ATTR_NAME_ORIG_NODE_NAME, orig_name) || orig_name.empty()) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ATTR_NAME_ORIG_NODE_NAME.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Get attr ATTR_NAME_ORIG_NODE_NAME failed, node: %s.", op_desc->GetName().c_str()); return INTERNAL_ERROR; } diff --git a/ge/graph/passes/transop_breadth_fusion_pass.cc b/ge/graph/passes/transop_breadth_fusion_pass.cc index 654c3822..a52f4389 100644 --- a/ge/graph/passes/transop_breadth_fusion_pass.cc +++ b/ge/graph/passes/transop_breadth_fusion_pass.cc @@ -31,6 +31,7 @@ Status TransOpBreadthFusionPass::Run(ge::ComputeGraphPtr graph) { // breadth fusion pass requires new topologic Status ret_topo = graph->TopologicalSorting(); if (ret_topo != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Topological sorting for graph:%s failed", graph->GetName().c_str()); GELOGE(ret_topo, "TopologicalSorting the merged graph failed."); return ret_topo; } @@ -60,7 +61,9 @@ std::string TransOpBreadthFusionPass::GetNodeId(const int anchor_index, const No bool trans_format = false; bool trans_shape = false; - GE_IF_BOOL_EXEC(node == nullptr || node->GetOpDesc() == nullptr, GELOGE(FAILED, "node is null"); return ""); + GE_IF_BOOL_EXEC(node == nullptr || node->GetOpDesc() == nullptr, + REPORT_INNER_ERROR("E19999", "Param node or its op_desc is nullptr, check invalid"); + GELOGE(FAILED, "node is null"); return ""); if (node->GetType() == CAST) { trans_data_type = true; } else if (node->GetType() == TRANSPOSE || node->GetType() == TRANSPOSED || node->GetType() == EXPANDDIMS) { diff --git 
a/ge/graph/passes/transop_depth_fusion_pass.cc b/ge/graph/passes/transop_depth_fusion_pass.cc index 85106e08..05b55307 100755 --- a/ge/graph/passes/transop_depth_fusion_pass.cc +++ b/ge/graph/passes/transop_depth_fusion_pass.cc @@ -82,6 +82,7 @@ graphStatus TransOpDepthFusionPass::RecursiveInDepth(const InDataAnchorPtr &dst_ if (dst_in_anchor == nullptr || dst_in_anchor->GetOwnerNode() == nullptr || dst_in_anchor->GetOwnerNode()->GetOpDesc() == nullptr) { + REPORT_INNER_ERROR("E19999", "Param dst_in_anchor related node info has nullptr, check invalid"); GELOGE(FAILED, "parameter is null."); return GRAPH_FAILED; } @@ -257,11 +258,13 @@ graphStatus TransOpDepthFusionPass::RelinkEdges(const OutDataAnchorPtr &new_out_ const OutDataAnchorPtr &old_out_anchor, const InDataAnchorPtr &in_data_anchor) { if (new_out_anchor == nullptr || old_out_anchor == nullptr || in_data_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Param anchor info has nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "new_out_anchor or old_out_anchor or in_data_anchor is nullptr"); return GRAPH_FAILED; } if (new_out_anchor->GetOwnerNode() == nullptr || old_out_anchor->GetOwnerNode() == nullptr || in_data_anchor->GetOwnerNode() == nullptr) { + REPORT_INNER_ERROR("E19999", "Param anchor info owner node has nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "anchor's owner node is nullptr"); return GRAPH_FAILED; } @@ -305,11 +308,14 @@ graphStatus TransOpDepthFusionPass::RemoveNode(const NodePtr &node, const ge::Co return GRAPH_FAILED; } if (GraphUtils::IsolateNode(node, {0}) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate node:%s(%s) failed", node->GetName().c_str(), node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Isolate removed node: %s, type: %s failed", node->GetName().c_str(), node->GetType().c_str()); return GRAPH_FAILED; } if (GraphUtils::RemoveNodeWithoutRelink(graph, node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + node->GetName().c_str(), node->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Remove node: %s, type: %s without relink failed", node->GetName().c_str(), node->GetType().c_str()); return GRAPH_FAILED; diff --git a/ge/graph/passes/transop_nearby_allreduce_fusion_pass.cc b/ge/graph/passes/transop_nearby_allreduce_fusion_pass.cc index b207abe9..78c60eda 100644 --- a/ge/graph/passes/transop_nearby_allreduce_fusion_pass.cc +++ b/ge/graph/passes/transop_nearby_allreduce_fusion_pass.cc @@ -99,6 +99,9 @@ Status TransOpNearbyAllreduceFusionPass::RemoveNearbyPairedTransOps(const NodePt auto in_data_anchors = node->GetAllInDataAnchors(); auto out_data_anchors = node->GetAllOutDataAnchors(); if (in_data_anchors.size() != out_data_anchors.size()) { + REPORT_INNER_ERROR("E19999", "In data anchors size:%zu not equal to out data anchors size:%zu in node:%s(%s), " + "check invalid", in_data_anchors.size(), out_data_anchors.size(), + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "in and out data anchor size are not equal, node=%s, in_size=%zu, out_size=%zu", node->GetName().c_str(), in_data_anchors.size(), out_data_anchors.size()); return FAILED; @@ -143,6 +146,8 @@ Status TransOpNearbyAllreduceFusionPass::RemoveNearbyPairedTransOps(const NodePt // delete in_node if (IsolateAndDeleteNode(in_node, {0}) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate and delete node:%s(%s) failed", + in_node->GetName().c_str(), in_node->GetType().c_str()); GELOGE(FAILED, "remove node %s failed", 
in_node->GetName().c_str()); return FAILED; } @@ -150,6 +155,8 @@ Status TransOpNearbyAllreduceFusionPass::RemoveNearbyPairedTransOps(const NodePt // delete out_node if (IsolateAndDeleteNode(out_node, {0}) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate and delete node:%s(%s) failed", + out_node->GetName().c_str(), out_node->GetType().c_str()); GELOGE(FAILED, "remove node %s failed", out_node->GetName().c_str()); return FAILED; } @@ -162,9 +169,13 @@ Status TransOpNearbyAllreduceFusionPass::RemoveNearbyPairedTransOps(const NodePt auto input_desc = in_node->GetOpDesc()->GetInputDesc(0); auto output_desc = out_node->GetOpDesc()->GetOutputDesc(0); if (node->GetOpDesc()->UpdateInputDesc(static_cast(i), input_desc) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update input:%zu desc in op:%s(%s) failed", + i, node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "UpdateInputDesc fail."); } if (node->GetOpDesc()->UpdateOutputDesc(static_cast(i), output_desc) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update output:%zu desc in op:%s(%s) failed", + i, node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "UpdateOutputDesc"); } GELOGI("successfully remove paired transop (%s and %s) for node %s", diff --git a/ge/graph/passes/transop_symmetry_elimination_pass.cc b/ge/graph/passes/transop_symmetry_elimination_pass.cc index 9db3aea1..2ea7fac1 100644 --- a/ge/graph/passes/transop_symmetry_elimination_pass.cc +++ b/ge/graph/passes/transop_symmetry_elimination_pass.cc @@ -172,6 +172,12 @@ Status TransOpSymmetryEliminationPass::EliminateTransOp(NodePtr &src_node, const // 1.Unlink T1->T2 auto ret = src_out_anchor->Unlink(dst_in_anchor); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", + "Op:%s(%s) out index:%d unlink from op:%s(%s) in index:%d failed", + src_out_anchor->GetOwnerNode()->GetName().c_str(), + src_out_anchor->GetOwnerNode()->GetType().c_str(), src_out_anchor->GetIdx(), + dst_in_anchor->GetOwnerNode()->GetName().c_str(), + dst_in_anchor->GetOwnerNode()->GetType().c_str(), dst_in_anchor->GetIdx()); GELOGE(FAILED, "Unlink data anchor from %s to %s.", src_node->GetName().c_str(), dst_node->GetName().c_str()); return ret; } @@ -183,6 +189,11 @@ Status TransOpSymmetryEliminationPass::EliminateTransOp(NodePtr &src_node, const auto pre_normal_node = in_anchor->GetPeerOutAnchor()->GetOwnerNode(); ret = GraphUtils::AddEdge(in_anchor->GetPeerOutAnchor(), dst_in_anchor); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + pre_normal_node->GetName().c_str(), pre_normal_node->GetType().c_str(), + in_anchor->GetPeerOutAnchor()->GetIdx(), + dst_in_anchor->GetOwnerNode()->GetName().c_str(), + dst_in_anchor->GetOwnerNode()->GetType().c_str(), dst_in_anchor->GetIdx()); GELOGE(FAILED, "Add data edge from %s to %s failed.", pre_normal_node->GetName().c_str(), dst_node->GetName().c_str()); return ret; @@ -190,6 +201,9 @@ Status TransOpSymmetryEliminationPass::EliminateTransOp(NodePtr &src_node, const // 3.Copy in-control/data-in-control from T1->T2 ret = GraphUtils::CopyInCtrlEdges(src_node, dst_node); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Copy in control edge from node:%s(%s) to node:%s(%s) failed", + src_node->GetName().c_str(), src_node->GetType().c_str(), + dst_node->GetName().c_str(), dst_node->GetType().c_str()); GELOGE(FAILED, "Copy control edge from %s to %s failed.", src_node->GetName().c_str(), dst_node->GetName().c_str()); return ret; } @@ -198,6 +212,9 @@ 
Status TransOpSymmetryEliminationPass::EliminateTransOp(NodePtr &src_node, const if (in_node->GetName() == pre_normal_node->GetName()) { continue; } ret = GraphUtils::AddEdge(in_node->GetOutControlAnchor(), dst_node->GetInControlAnchor()); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + in_node->GetName().c_str(), in_node->GetType().c_str(), + dst_node->GetName().c_str(), dst_node->GetType().c_str()); GELOGE(FAILED, "Add control edge from %s to %s failed.", in_node->GetName().c_str(), dst_node->GetName().c_str()); return ret; } @@ -205,6 +222,8 @@ Status TransOpSymmetryEliminationPass::EliminateTransOp(NodePtr &src_node, const // 5.IsolateAndDelete T2, A will link to B automatically, and all control edge will also relink. ret = IsolateAndDeleteNode(dst_node, {0}); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate and delete node:%s(%s) failed", + dst_node->GetName().c_str(), dst_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Isolate removed node: %s, type: %s failed", dst_node->GetName().c_str(), dst_node->GetType().c_str()); return ret; @@ -223,6 +242,9 @@ Status TransOpSymmetryEliminationPass::RemoveTransOpWithoutOutput(NodePtr &pre_n // 6.1 Copy out control to pre normal node Status ret = GraphUtils::CopyOutCtrlEdges(trans_node, pre_node); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Copy out control edge from node:%s(%s) to node:%s(%s) failed", + trans_node->GetName().c_str(), trans_node->GetType().c_str(), + pre_node->GetName().c_str(), pre_node->GetType().c_str()); GELOGE(FAILED, "Copy control edge from %s to %s failed.", trans_node->GetName().c_str(), pre_node->GetName().c_str()); return ret; @@ -230,6 +252,8 @@ Status TransOpSymmetryEliminationPass::RemoveTransOpWithoutOutput(NodePtr &pre_n // 6.2 Isolate and delete T1 ret = IsolateAndDeleteNode(trans_node, {}); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate and delete node:%s(%s) failed", + trans_node->GetName().c_str(), trans_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Isolate removed node: %s, type: %s failed", trans_node->GetName().c_str(), trans_node->GetType().c_str()); return ret; diff --git a/ge/graph/passes/transop_without_reshape_fusion_pass.cc b/ge/graph/passes/transop_without_reshape_fusion_pass.cc index 6bea9edc..00896235 100644 --- a/ge/graph/passes/transop_without_reshape_fusion_pass.cc +++ b/ge/graph/passes/transop_without_reshape_fusion_pass.cc @@ -63,7 +63,10 @@ void TransOpWithoutReshapeFusionPass::SetRemainNode( continue; } GELOGI("SetRemainNode node is %s", op_desc->GetName().c_str()); - GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(kRemainNode, true), GELOGE(INTERNAL_ERROR, "set ext attr failed"); return); + GE_IF_BOOL_EXEC(!op_desc->SetExtAttr(kRemainNode, true), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", kRemainNode, + op_desc->GetName().c_str(), op_desc->GetType().c_str()); + GELOGE(INTERNAL_ERROR, "set ext attr failed"); return); } } @@ -74,17 +77,29 @@ bool TransOpWithoutReshapeFusionPass::FormatContinuousCheck(const OutDataAnchorP return false; } auto in_node = in_anchor->GetOwnerNode(); - GE_IF_BOOL_EXEC(in_node == nullptr, GELOGE(INTERNAL_ERROR, "in_node is null"); return false); + GE_IF_BOOL_EXEC(in_node == nullptr, + REPORT_INNER_ERROR("E19999", "Param in_anchor's owner node is nullptr, check invalid"); + GELOGE(INTERNAL_ERROR, "in_node is null"); return false); auto in_op = in_node->GetOpDesc(); auto out_owner_node = out_anchor->GetOwnerNode(); - 
GE_IF_BOOL_EXEC(out_owner_node == nullptr, GELOGE(INTERNAL_ERROR, "out_owner_node is null"); return false); + GE_IF_BOOL_EXEC(out_owner_node == nullptr, + REPORT_INNER_ERROR("E19999", "Param out_anchor's owner node is nullptr, check invalid"); + GELOGE(INTERNAL_ERROR, "out_owner_node is null"); return false); auto out_op = out_owner_node->GetOpDesc(); - GE_IF_BOOL_EXEC(in_op == nullptr, GELOGE(INTERNAL_ERROR, "in_op is null"); return false); - GE_IF_BOOL_EXEC(out_op == nullptr, GELOGE(INTERNAL_ERROR, "out_op is null"); return false); + GE_IF_BOOL_EXEC(in_op == nullptr, + REPORT_INNER_ERROR("E19999", "Param in_anchor's owner op_desc is nullptr, check invalid"); + GELOGE(INTERNAL_ERROR, "in_op is null"); return false); + GE_IF_BOOL_EXEC(out_op == nullptr, + REPORT_INNER_ERROR("E19999", "Param out_anchor's owner op_desc is nullptr, check invalid"); + GELOGE(INTERNAL_ERROR, "out_op is null"); return false); auto in_op_desc = in_op->GetInputDescPtr(in_anchor->GetIdx()); auto out_op_desc = out_op->GetOutputDescPtr(out_anchor->GetIdx()); - GE_IF_BOOL_EXEC(in_op_desc == nullptr, GELOGE(INTERNAL_ERROR, "in_op_desc is null"); return false); - GE_IF_BOOL_EXEC(out_op_desc == nullptr, GELOGE(INTERNAL_ERROR, "out_op_desc is null"); return false); + GE_IF_BOOL_EXEC(in_op_desc == nullptr, + REPORT_INNER_ERROR("E19999", "Param in_anchor corresponding tensor is nullptr, check invalid"); + GELOGE(INTERNAL_ERROR, "in_op_desc is null"); return false); + GE_IF_BOOL_EXEC(out_op_desc == nullptr, + REPORT_INNER_ERROR("E19999", "Param out_anchor corresponding tensor is nullptr, check invalid"); + GELOGE(INTERNAL_ERROR, "out_op_desc is null"); return false); if (!ShapeEqualCheck(in_op_desc->GetShape(), out_op_desc->GetShape())) { return false; } @@ -357,6 +372,9 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkSubGraphControlEdges( GELOGI("add control edge.src:%s, dst:%s", out_owner_node->GetName().c_str(), in_owner_node->GetName().c_str()); if (GraphUtils::AddEdge(out_owner_node->GetOutControlAnchor(), in_owner_node->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + out_owner_node->GetName().c_str(), out_owner_node->GetType().c_str(), + in_owner_node->GetName().c_str(), in_owner_node->GetType().c_str()); return GRAPH_FAILED; } } @@ -365,6 +383,9 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkSubGraphControlEdges( GELOGI("add out data 2 in contorl edge.src:%s, dst:%s", out_owner_node->GetName().c_str(), in_owner_node->GetName().c_str()); if (GraphUtils::AddEdge(out_anchor, in_owner_node->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + out_owner_node->GetName().c_str(), out_owner_node->GetType().c_str(), + in_owner_node->GetName().c_str(), in_owner_node->GetType().c_str()); return GRAPH_FAILED; } } @@ -392,6 +413,10 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdgesWhenDescNotChange GELOGI("add control edge.src:%s, dst:%s, dst idx:%d", out_owner_node->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str(), peer_in_anchor->GetIdx()); if (GraphUtils::AddEdge(out_owner_node->GetOutControlAnchor(), peer_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + out_owner_node->GetName().c_str(), out_owner_node->GetType().c_str(), + peer_in_anchor->GetOwnerNode()->GetName().c_str(), + peer_in_anchor->GetOwnerNode()->GetType().c_str()); return 
GRAPH_FAILED; } } @@ -401,6 +426,10 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdgesWhenDescNotChange GELOGI("add control edge.src:%s, src idx:%d, dst:%s", peer_out_anchor->GetOwnerNode()->GetName().c_str(), peer_out_anchor->GetIdx(), in_owner_node->GetName().c_str()); if (GraphUtils::AddEdge(peer_out_anchor, in_owner_node->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + peer_out_anchor->GetOwnerNode()->GetName().c_str(), + peer_out_anchor->GetOwnerNode()->GetType().c_str(), + in_owner_node->GetName().c_str(), in_owner_node->GetType().c_str()); return GRAPH_FAILED; } } @@ -410,6 +439,10 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdgesWhenDescNotChange GELOGI("add out control 2 in data edge.src:%s, dst:%s, dst idx:%d", out_owner_node->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str(), peer_in_anchor->GetIdx()); if (GraphUtils::AddEdge(out_owner_node->GetOutControlAnchor(), peer_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + out_owner_node->GetName().c_str(), out_owner_node->GetType().c_str(), + peer_in_anchor->GetOwnerNode()->GetName().c_str(), + peer_in_anchor->GetOwnerNode()->GetType().c_str()); return GRAPH_FAILED; } } @@ -419,6 +452,10 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdgesWhenDescNotChange GELOGI("add out data 2 in control edge.src:%s, dst:%s, dst idx:%d", out_owner_node->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str(), peer_in_anchor->GetIdx()); if (GraphUtils::AddEdge(out_anchor, peer_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + out_owner_node->GetName().c_str(), out_owner_node->GetType().c_str(), + peer_in_anchor->GetOwnerNode()->GetName().c_str(), + peer_in_anchor->GetOwnerNode()->GetType().c_str()); return GRAPH_FAILED; } } @@ -443,6 +480,9 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkNodesWhenDescNotChanged( GELOGI("relink node.src node:%s, src idx:%d, dst node:%s, dst idx:%d", out_owner_node->GetName().c_str(), out_anchor->GetIdx(), in_owner_node->GetName().c_str(), in_anchor->GetIdx()); if (GraphUtils::AddEdge(out_anchor, in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + out_owner_node->GetName().c_str(), out_owner_node->GetType().c_str(), out_anchor->GetIdx(), + in_owner_node->GetName().c_str(), in_owner_node->GetType().c_str(), in_anchor->GetIdx()); GELOGE(GRAPH_FAILED, "add edge failed!src:%s, src idx:%d, dst:%s, dst idx:%d", out_owner_node->GetName().c_str(), out_anchor->GetIdx(), in_owner_node->GetName().c_str(), in_anchor->GetIdx()); return GRAPH_FAILED; @@ -466,16 +506,21 @@ OpDescPtr TransOpWithoutReshapeFusionPass::GetFormatTransferOp(const GeTensorDes format_transfer_op_name << "fusion_format_transfer_" << fusion_format_transfer_op_count; OpDescPtr format_transfer_op = MakeShared(format_transfer_op_name.str().c_str(), TRANSDATA); if (format_transfer_op == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(INTERNAL_ERROR, "new format transfer op failed!"); return nullptr; } GE_IF_BOOL_EXEC(!AttrUtils::SetInt(format_transfer_op, ATTR_NAME_INPUT_FORMAT, static_cast(format_trans_input_desc.GetFormat())), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_INPUT_FORMAT.c_str(), +
format_transfer_op->GetName().c_str(), format_transfer_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set ATTR_NAME_INPUT_FORMAT failed"); return nullptr); GE_IF_BOOL_EXEC(!AttrUtils::SetInt(format_transfer_op, ATTR_NAME_OUTPUT_FORMAT, static_cast(format_trans_output_desc.GetFormat())), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_OUTPUT_FORMAT.c_str(), + format_transfer_op->GetName().c_str(), format_transfer_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set ATTR_NAME_OUTPUT_FORMAT failed"); return nullptr); @@ -483,22 +528,32 @@ OpDescPtr TransOpWithoutReshapeFusionPass::GetFormatTransferOp(const GeTensorDes string dst_format = TypeUtils::FormatToSerialString(format_trans_output_desc.GetFormat()); GE_IF_BOOL_EXEC(!AttrUtils::SetStr(format_transfer_op, kAttrNameSrcFormat, src_format), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", kAttrNameSrcFormat, + format_transfer_op->GetName().c_str(), format_transfer_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set kAttrNameSrcFormat failed"); return nullptr); GE_IF_BOOL_EXEC(!AttrUtils::SetStr(format_transfer_op, kAttrNameDstFormat, dst_format), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", kAttrNameDstFormat, + format_transfer_op->GetName().c_str(), format_transfer_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set kAttrNameDstFormat failed"); return nullptr); GE_IF_BOOL_EXEC(format_transfer_op->AddInputDesc(format_trans_input_desc) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + format_transfer_op->GetName().c_str(), format_transfer_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "add input desc failed"); return nullptr); GE_IF_BOOL_EXEC(format_transfer_op->AddOutputDesc(format_trans_output_desc) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed", + format_transfer_op->GetName().c_str(), format_transfer_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "add output desc failed"); return nullptr); GE_IF_BOOL_EXEC(!ge::AttrUtils::SetBool(format_transfer_op, ATTR_NEED_COMPILE, true), + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NEED_COMPILE.c_str(), + format_transfer_op->GetName().c_str(), format_transfer_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set ext attr failed"); return nullptr); return format_transfer_op; @@ -515,6 +570,7 @@ OpDescPtr TransOpWithoutReshapeFusionPass::GetCastOp(const GeTensorDesc &cast_in auto cast_op = ge::OpDescUtils::GetOpDescFromOperator(node_op); node_op.BreakConnect(); if (cast_op == nullptr) { + REPORT_CALL_ERROR("E19999", "Create operator:%s(%s) failed", cast_op_name.str().c_str(), CAST); GELOGE(INTERNAL_ERROR, "new cast op failed!"); return nullptr; } @@ -522,29 +578,41 @@ OpDescPtr TransOpWithoutReshapeFusionPass::GetCastOp(const GeTensorDesc &cast_in const int default_output_index = 0; if (cast_op->GetInputsSize() == 0) { GE_IF_BOOL_EXEC(cast_op->AddInputDesc(cast_input_desc) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add input desc to op:%s(%s) failed", + cast_op->GetName().c_str(), cast_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "add input desc failed"); return nullptr); } else { GE_IF_BOOL_EXEC(cast_op->UpdateInputDesc(default_input_index, cast_input_desc) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Update input:%d desc of op:%s(%s) failed", default_input_index, + cast_op->GetName().c_str(), cast_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "update input desc failed"); return nullptr); } if (cast_op->GetOutputsSize() == 0) {
GE_IF_BOOL_EXEC(cast_op->AddOutputDesc(cast_output_desc) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Add output desc to op:%s(%s) failed", + cast_op->GetName().c_str(), cast_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "add output desc failed"); return nullptr); } else { GE_IF_BOOL_EXEC(cast_op->UpdateOutputDesc(default_output_index, cast_output_desc) != GRAPH_SUCCESS, + REPORT_CALL_ERROR("E19999", "Update output:%d desc of op:%s(%s) failed", default_output_index, + cast_op->GetName().c_str(), cast_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "update output desc failed"); return nullptr); } if (!AttrUtils::SetInt(cast_op, CAST_ATTR_DST_TYPE, static_cast(cast_output_desc.GetDataType()))) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", CAST_ATTR_DST_TYPE.c_str(), + cast_op->GetName().c_str(), cast_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set dst_type attr failed"); return nullptr; } if (!AttrUtils::SetBool(cast_op, ATTR_NEED_COMPILE, true)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NEED_COMPILE.c_str(), + cast_op->GetName().c_str(), cast_op->GetType().c_str()); GELOGE(INTERNAL_ERROR, "set need_compile attr failed"); return nullptr; } @@ -879,6 +947,8 @@ graphStatus TransOpWithoutReshapeFusionPass::AddTransNode(const ComputeGraphPtr trans_node = graph->AddNode(transop); if (trans_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s failed", + transop->GetName().c_str(), transop->GetType().c_str(), graph->GetName().c_str()); GELOGE(GRAPH_FAILED, "add node failed!"); return GRAPH_FAILED; } @@ -945,6 +1015,9 @@ graphStatus TransOpWithoutReshapeFusionPass::InsertNewTransOp(const ComputeGraph GELOGI("add edge.src:%s, src idx:%d, dst:%s", out_anchor->GetOwnerNode()->GetName().c_str(), out_anchor->GetIdx(), new_trans_nodes.front()->GetName().c_str()); if (GraphUtils::AddEdge(out_anchor, new_trans_nodes.front()->GetInAnchor(0)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:0) failed", + out_owner_node->GetName().c_str(), out_owner_node->GetType().c_str(), out_anchor->GetIdx(), + new_trans_nodes.front()->GetName().c_str(), new_trans_nodes.front()->GetType().c_str()); return GRAPH_FAILED; } else { auto old_peer_in_anchor = begin_out.second; @@ -957,6 +1030,9 @@ graphStatus TransOpWithoutReshapeFusionPass::InsertNewTransOp(const ComputeGraph new_trans_nodes.back()->GetName().c_str()); if (GraphUtils::AddEdge(new_trans_nodes.front()->GetOutAnchor(0), new_trans_nodes.back()->GetInAnchor(0)) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:0) failed", + new_trans_nodes.front()->GetName().c_str(), new_trans_nodes.front()->GetType().c_str(), + new_trans_nodes.back()->GetName().c_str(), new_trans_nodes.back()->GetType().c_str()); return GRAPH_FAILED; } else { auto old_peer_out_anchor = end_in.first; @@ -967,6 +1043,9 @@ graphStatus TransOpWithoutReshapeFusionPass::InsertNewTransOp(const ComputeGraph GELOGI("add edge.src:%s, dst:%s, dst idx:%d", new_trans_nodes.back()->GetName().c_str(), in_anchor->GetOwnerNode()->GetName().c_str(), in_anchor->GetIdx()); if (GraphUtils::AddEdge(new_trans_nodes.back()->GetOutAnchor(0), in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:%d) failed", + new_trans_nodes.back()->GetName().c_str(), new_trans_nodes.back()->GetType().c_str(), + in_owner_node->GetName().c_str(), in_owner_node->GetType().c_str(),
in_anchor->GetIdx()); return GRAPH_FAILED; } @@ -977,6 +1056,7 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdge(const int index, const vector &new_trans_nodes) { GE_CHECK_NOTNULL(out_anchor); if (new_trans_nodes.front() == nullptr || new_trans_nodes.back() == nullptr) { + REPORT_INNER_ERROR("E19999", "Param new_trans_nodes front or back is nullptr, check invalid"); return GRAPH_FAILED; } if (sub_graph_has_control_edge_[index]) { @@ -984,6 +1064,9 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdge(const int index, new_trans_nodes.front()->GetName().c_str()); if (GraphUtils::AddEdge(out_anchor->GetOwnerNode()->GetOutControlAnchor(), new_trans_nodes.front()->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + out_anchor->GetOwnerNode()->GetName().c_str(), out_anchor->GetOwnerNode()->GetType().c_str(), + new_trans_nodes.front()->GetName().c_str(), new_trans_nodes.front()->GetType().c_str()); return GRAPH_FAILED; } } @@ -993,6 +1076,10 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdge(const int index, GELOGI("add control edge.src:%s, dst:%s", new_trans_nodes.back()->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(new_trans_nodes.back()->GetOutControlAnchor(), peer_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + new_trans_nodes.back()->GetName().c_str(), new_trans_nodes.back()->GetType().c_str(), + peer_in_anchor->GetOwnerNode()->GetName().c_str(), + peer_in_anchor->GetOwnerNode()->GetType().c_str()); return GRAPH_FAILED; } } @@ -1002,6 +1089,10 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdge(const int index, GELOGI("add control edge.src:%s, dst:%s", peer_out_anchor->GetOwnerNode()->GetName().c_str(), new_trans_nodes.front()->GetName().c_str()); if (GraphUtils::AddEdge(peer_out_anchor, new_trans_nodes.front()->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + peer_out_anchor->GetOwnerNode()->GetName().c_str(), + peer_out_anchor->GetOwnerNode()->GetType().c_str(), + new_trans_nodes.front()->GetName().c_str(), new_trans_nodes.front()->GetType().c_str()); return GRAPH_FAILED; } } @@ -1011,6 +1102,10 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdge(const int index, GELOGI("add control edge.src:%s, dst:%s", new_trans_nodes.back()->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(new_trans_nodes.back()->GetOutControlAnchor(), peer_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + new_trans_nodes.back()->GetName().c_str(), new_trans_nodes.back()->GetType().c_str(), + peer_in_anchor->GetOwnerNode()->GetName().c_str(), + peer_in_anchor->GetOwnerNode()->GetType().c_str()); return GRAPH_FAILED; } } @@ -1020,6 +1115,10 @@ graphStatus TransOpWithoutReshapeFusionPass::RelinkControlEdge(const int index, GELOGI("add control edge.src:%s, dst:%s", new_trans_nodes.back()->GetName().c_str(), peer_in_anchor->GetOwnerNode()->GetName().c_str()); if (GraphUtils::AddEdge(new_trans_nodes.back()->GetOutDataAnchor(0), peer_in_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:0) and op:%s(%s)(index:%d) failed", + new_trans_nodes.back()->GetName().c_str(), new_trans_nodes.back()->GetType().c_str(), + 
peer_in_anchor->GetOwnerNode()->GetName().c_str(), + peer_in_anchor->GetOwnerNode()->GetType().c_str(), peer_in_anchor->GetIdx()); return GRAPH_FAILED; } } @@ -1081,6 +1180,7 @@ graphStatus TransOpWithoutReshapeFusionPass::GetSubGraphsBetweenNormalNode( vector> &nodes_list) { graphStatus ret = GRAPH_SUCCESS; if (out_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Param out_anchor is nullptr, check invalid"); return GRAPH_FAILED; } diff --git a/ge/graph/passes/transpose_transdata_pass.cc b/ge/graph/passes/transpose_transdata_pass.cc index 810f5639..674804bd 100644 --- a/ge/graph/passes/transpose_transdata_pass.cc +++ b/ge/graph/passes/transpose_transdata_pass.cc @@ -34,11 +34,13 @@ const char *const kAttrNameSrcFormat = "src_format"; namespace ge { Status TransposeTransDataPass::Run(NodePtr &node) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(PARAM_INVALID, "param [node] must not be null."); return PARAM_INVALID; } auto op_desc = node->GetOpDesc(); if (op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node's op_desc is nullptr, check invalid"); GELOGE(PARAM_INVALID, "OpDesc of param [node] must not be null."); return PARAM_INVALID; } @@ -77,6 +79,7 @@ Status TransposeTransDataPass::Run(NodePtr &node) { GE_CHECK_NOTNULL(out_node); OpDescPtr out_op_desc = out_node->GetOpDesc(); if (out_op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); GELOGE(FAILED, "OpDesc of out data node of [%s] must not be null.", node->GetName().c_str()); return FAILED; } @@ -111,6 +114,10 @@ Status TransposeTransDataPass::CheckOneInAndOneOutDataAnchor(NodePtr &node) cons // Trans op has one input data node, maybe has N output data nodes uint32_t in_data_node_nums = node->GetInDataNodes().size(); if (in_data_anchor_nums != 1 || out_data_anchor_nums != 1 || in_data_node_nums != 1) { + REPORT_INNER_ERROR("E19999", "In data anchor num:%u, out data anchor num:%u, in data node num:%u of node:%s(%s) " + "must be all equal to 1, check invalid", + in_data_anchor_nums, out_data_anchor_nums, in_data_node_nums, + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "[%s] %s has %u in %u out data anchor, has %u in data node.", node->GetType().c_str(), node->GetName().c_str(), in_data_anchor_nums, out_data_anchor_nums, in_data_node_nums); return FAILED; @@ -122,6 +129,8 @@ Status TransposeTransDataPass::RemoveTranspose(NodePtr &node) { GE_CHECK_NOTNULL(node); ComputeGraphPtr graph = node->GetOwnerComputeGraph(); if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Owner graph of node:%s(%s) is nullptr, check invalid", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "[%s] The owner graph must not be null.", node->GetName().c_str()); return FAILED; } @@ -146,6 +155,8 @@ Status TransposeTransDataPass::RemoveTranspose(NodePtr &node) { } AddNodeDeleted(node); if (GraphUtils::RemoveNodeWithoutRelink(graph, node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + node->GetName().c_str(), node->GetType().c_str(), graph->GetName().c_str()); GELOGE(FAILED, "[%s] RemoveNodeWithoutRelink failed.", node->GetName().c_str()); return FAILED; } diff --git a/ge/graph/passes/unused_args_clean_pass.cc b/ge/graph/passes/unused_args_clean_pass.cc index ec66b129..df70e99b 100755 --- a/ge/graph/passes/unused_args_clean_pass.cc +++ b/ge/graph/passes/unused_args_clean_pass.cc @@ -101,6 +101,8 @@ Status UnusedArgsCleanPass::ClassifyDataNodes(const 
ComputeGraphPtr &graph, cons for (const auto &name : func_desc->GetSubgraphInstanceNames()) { const auto &subgraph = graph->GetSubgraph(name); if (subgraph == nullptr) { + REPORT_CALL_ERROR("E19999", "Get subgraph from graph:%s by name:%s failed", + graph->GetName().c_str(), name.c_str()); GELOGE(GE_GRAPH_EMPTY_SUBGRAPH, "Subgraph not found, name: %s", name.c_str()); return GE_GRAPH_EMPTY_SUBGRAPH; } @@ -113,6 +115,8 @@ Status UnusedArgsCleanPass::ClassifyDataNodes(const ComputeGraphPtr &graph, cons uint32_t parent_index = 0; if (!AttrUtils::GetInt(data->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, parent_index)) { + REPORT_CALL_ERROR("E19999", "Get Attr:%s from op:%s(%s) failed", ATTR_NAME_PARENT_NODE_INDEX.c_str(), + data->GetName().c_str(), data->GetType().c_str()); GELOGE(FAILED, "Parent index not found, name: %s", data->GetName().c_str()); return FAILED; } @@ -150,6 +154,8 @@ Status UnusedArgsCleanPass::UpdateInputTensor(const mapsecond; if (!AttrUtils::SetInt(data->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, update_index)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_PARENT_NODE_INDEX.c_str(), + data->GetName().c_str(), data->GetType().c_str()); GELOGE(FAILED, "Set parent index failed, name: %s", data->GetName().c_str()); return FAILED; } diff --git a/ge/graph/passes/unused_const_pass.cc b/ge/graph/passes/unused_const_pass.cc index 7c57c53e..80e43d08 100644 --- a/ge/graph/passes/unused_const_pass.cc +++ b/ge/graph/passes/unused_const_pass.cc @@ -27,10 +27,12 @@ namespace ge { /// Status UnusedConstPass::Run(NodePtr &node) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); GELOGE(FAILED, "parameter is null."); return FAILED; } if (node->GetOpDesc() == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node's op_desc is nullptr, check invalid"); GELOGE(PARAM_INVALID, "param [opDesc] must not be null."); return PARAM_INVALID; } diff --git a/ge/graph/passes/var_is_initialized_op_pass.cc b/ge/graph/passes/var_is_initialized_op_pass.cc index b9c752d8..e1f982d6 100644 --- a/ge/graph/passes/var_is_initialized_op_pass.cc +++ b/ge/graph/passes/var_is_initialized_op_pass.cc @@ -61,6 +61,8 @@ Status VarIsInitializedOpPass::CheckSrcNode(const NodePtr &node, bool &inited) c GE_CHECK_NOTNULL(node); auto input_nodes = node->GetInDataNodes(); if (input_nodes.size() != kVarIsInitializedIOCnt) { + REPORT_INNER_ERROR("E19999", "In data node num:%zu of node:%s(%s) not equal to %d, check invalid", + input_nodes.size(), node->GetName().c_str(), node->GetType().c_str(), kVarIsInitializedIOCnt); GELOGE(FAILED, "[%s] Node input data nodes size [%zu] is not equal 1.", node->GetName().c_str(), @@ -73,6 +75,9 @@ Status VarIsInitializedOpPass::CheckSrcNode(const NodePtr &node, bool &inited) c auto input_node_name = input_node->GetName(); auto input_node_type = input_node->GetType(); if (input_node_type != VARIABLE) { + REPORT_INNER_ERROR("E19999", "Index:%d In data node of node:%s(%s), type:%s not %s, check invalid", + kVarIsInitVarInputIndex, node->GetName().c_str(), node->GetType().c_str(), + input_node_type.c_str(), VARIABLE); GELOGE(FAILED, "[%s] Src node %s is not Variable, is %s.", node->GetName().c_str(), input_node_name.c_str(), input_node_type.c_str()); return FAILED; @@ -95,6 +100,7 @@ Status VarIsInitializedOpPass::CreateConstant(NodePtr &node, OpDescPtr &op_desc, // 1.
create Constant OpDesc op_desc = MakeShared(node->GetName().c_str(), CONSTANT); if (op_desc == nullptr) { + REPORT_CALL_ERROR("E19999", "New OpDesc failed"); GELOGE(FAILED, "[%s] Make shared of Constant op desc failed.", node->GetName().c_str()); return FAILED; } @@ -102,6 +108,7 @@ Status VarIsInitializedOpPass::CreateConstant(NodePtr &node, OpDescPtr &op_desc, // 2. get OpDesc of VarIsInitializedOp OpDescPtr original_op_desc = node->GetOpDesc(); if (original_op_desc == nullptr) { + REPORT_INNER_ERROR("E19999", "OpDesc in node is nullptr, check invalid"); GELOGE(FAILED, "[%s] Op desc must not be null.", node->GetName().c_str()); return FAILED; } @@ -111,10 +118,13 @@ Status VarIsInitializedOpPass::CreateConstant(NodePtr &node, OpDescPtr &op_desc, bool val = inited; GeTensorPtr const_tensor_ptr = MakeShared(original_desc, reinterpret_cast(&val), sizeof(bool)); if (const_tensor_ptr == nullptr) { + REPORT_CALL_ERROR("E19999", "New GeTensor failed"); GELOGE(FAILED, "[%s] Make shared of Constant tensor failed.", node->GetName().c_str()); return FAILED; } if (!AttrUtils::SetTensor(op_desc, ATTR_NAME_WEIGHTS, const_tensor_ptr)) { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to op:%s(%s) failed", ATTR_NAME_WEIGHTS.c_str(), + op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(INTERNAL_ERROR, "get ATTR_NAME_WEIGHTS failed"); return FAILED; } @@ -131,6 +141,9 @@ Status VarIsInitializedOpPass::ProcessInAnchor(NodePtr &node, NodePtr &new_node) auto out_anchors = node->GetAllOutDataAnchors(); if ((in_anchors.size() != kVarIsInitializedIOCnt) || (out_anchors.size() != kVarIsInitializedIOCnt)) { + REPORT_INNER_ERROR("E19999", "In data anchor num:%zu and out data anchor num:%zu of node:%s(%s), " + "must both be equal to %d, check invalid", in_anchors.size(), out_anchors.size(), + node->GetName().c_str(), node->GetType().c_str(), kVarIsInitializedIOCnt); GELOGE(FAILED, "[%s] Node input/output data anchors" " size [%lu][%lu] is not all equal 1.", @@ -144,22 +157,36 @@ Status VarIsInitializedOpPass::ProcessInAnchor(NodePtr &node, NodePtr &new_node) auto peer_out_anchor = in_anchor->GetPeerOutAnchor(); GE_CHECK_NOTNULL(peer_out_anchor); if (GraphUtils::RemoveEdge(in_anchor, peer_out_anchor) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove edge between op:%s(%s)(index:%d) and op:%s(%s)(index:%d) failed", + in_anchor->GetOwnerNode()->GetName().c_str(), in_anchor->GetOwnerNode()->GetType().c_str(), + in_anchor->GetIdx(), + peer_out_anchor->GetOwnerNode()->GetName().c_str(), + peer_out_anchor->GetOwnerNode()->GetType().c_str(), peer_out_anchor->GetIdx()); GELOGE(FAILED, "[%s] Remove in data edge failed.", node->GetName().c_str()); return FAILED; } auto src_node = peer_out_anchor->GetOwnerNode(); if (GraphUtils::AddEdge(src_node->GetOutControlAnchor(), new_node->GetInControlAnchor()) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add control edge between op:%s(%s) and op:%s(%s) failed", + src_node->GetName().c_str(), src_node->GetType().c_str(), + new_node->GetName().c_str(), new_node->GetType().c_str()); GELOGE(FAILED, "Failed to link control edges from var %s to new const %s", src_node->GetName().c_str(), new_node->GetName().c_str()); return FAILED; } if (GraphUtils::MoveInCtrlEdges(node, new_node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Move in control edge from node:%s(%s) to node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + new_node->GetName().c_str(), new_node->GetType().c_str()); GELOGE(FAILED, "Failed to move in ctrl edges from %s to new const",
node->GetName().c_str()); return FAILED; } if (GraphUtils::MoveOutCtrlEdges(node, new_node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Move out control edge from node:%s(%s) to node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + new_node->GetName().c_str(), new_node->GetType().c_str()); GELOGE(FAILED, "Failed to move out ctrl edges from %s to new const", node->GetName().c_str()); return FAILED; } @@ -177,6 +204,9 @@ Status VarIsInitializedOpPass::ChangeNodeToConstant(NodePtr &node, bool inited) NodePtr const_node = graph->AddNodeFront(constant_op_desc); if (const_node == nullptr) { + REPORT_CALL_ERROR("E19999", "Add node:%s(%s) to graph:%s front failed", + constant_op_desc->GetName().c_str(), constant_op_desc->GetType().c_str(), + graph->GetName().c_str()); return FAILED; } @@ -185,11 +215,16 @@ Status VarIsInitializedOpPass::ChangeNodeToConstant(NodePtr &node, bool inited) } if (NodeUtils::MoveOutputEdges(node, const_node) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Move out edge from node:%s(%s) to node:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str(), + const_node->GetName().c_str(), const_node->GetType().c_str()); GELOGE(FAILED, "[%s] Move output edges to new node failed.", node->GetName().c_str()); return FAILED; } if (GraphUtils::RemoveNodeWithoutRelink(graph, node) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + node->GetName().c_str(), node->GetType().c_str(), graph->GetName().c_str()); GELOGE(FAILED, "[%s] RemoveNodeWithoutRelink failed.", node->GetName().c_str()); return FAILED; } @@ -263,6 +298,7 @@ Status VarIsInitializedOpPass::UpdateInitedVars(const NodePtr &node) { std::set *VarIsInitializedOpPass::CreateInitedVars() { std::unique_ptr> inited_vars_keeper(new(std::nothrow) std::set()); if (inited_vars_keeper == nullptr) { + REPORT_CALL_ERROR("E19999", "New set failed"); GELOGE(OUT_OF_MEMORY, "Failed to alloc set memory"); return nullptr; } diff --git a/ge/graph/passes/variable_op_pass.cc b/ge/graph/passes/variable_op_pass.cc index 8f33335d..c605d305 100644 --- a/ge/graph/passes/variable_op_pass.cc +++ b/ge/graph/passes/variable_op_pass.cc @@ -47,6 +47,9 @@ Status ByPassTransNode(NodePtr &trans_node, NodePtr &ref_node) { GELOGD("Begin to bypass trans node %s", trans_node->GetName().c_str()); auto ret = GraphUtils::CopyInCtrlEdges(trans_node, ref_node); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Copy in control edge from node:%s(%s) to node:%s(%s) failed", + trans_node->GetName().c_str(), trans_node->GetType().c_str(), + ref_node->GetName().c_str(), ref_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to move control edges from trans " "node %s to var-ref %s", @@ -55,6 +58,8 @@ Status ByPassTransNode(NodePtr &trans_node, NodePtr &ref_node) { } auto ref_in_anchor = ref_node->GetInDataAnchor(0); if (ref_in_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no input anchor, check invalid", + ref_node->GetName().c_str(), ref_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "The variable ref node %s does not have an " "input anchor", @@ -64,6 +69,8 @@ Status ByPassTransNode(NodePtr &trans_node, NodePtr &ref_node) { ref_in_anchor->UnlinkAll(); auto trans_in_anchor = trans_node->GetInDataAnchor(0); if (trans_in_anchor == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no input anchor, check invalid", + trans_node->GetName().c_str(), trans_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to get the in data anchor from 
trans" " node %s type %s", @@ -79,6 +86,11 @@ Status ByPassTransNode(NodePtr &trans_node, NodePtr &ref_node) { } else { ret = GraphUtils::AddEdge(prev_trans_node_out_anchor, ref_in_anchor); if (ret != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Add edge between op:%s(%s)(index:%d) and op:%s(%s)(index:0) failed", + prev_trans_node_out_anchor->GetOwnerNode()->GetName().c_str(), + prev_trans_node_out_anchor->GetOwnerNode()->GetType().c_str(), + prev_trans_node_out_anchor->GetIdx(), + ref_node->GetName().c_str(), ref_node->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Failed to add edge between ref node %s " "and the prev node of trans node %s", @@ -115,6 +127,7 @@ bool IsTransSupport(const TransNodeInfo &trans_info) { Status VariableOpPass::Run(ge::ComputeGraphPtr graph) { if (graph == nullptr) { + REPORT_INNER_ERROR("E19999", "Param graph is nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "Failed to run variable op pass, null graph"); return INTERNAL_ERROR; } @@ -124,6 +137,7 @@ Status VariableOpPass::Run(ge::ComputeGraphPtr graph) { GetContext().SessionId(), graph_id); if (var_accelerate_ctrl_ == nullptr) { + REPORT_INNER_ERROR("E19999", "The variable accelerate control is nullptr, check invalid"); GELOGE(INTERNAL_ERROR, "Failed to run var op pass, the variable accelerate control is null"); return INTERNAL_ERROR; } @@ -174,11 +188,15 @@ Status VariableOpPass::Run(ge::ComputeGraphPtr graph) { ret = VarManager::Instance(graph->GetSessionID())->SetTransRoad(node->GetName(), fusion_road); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Set Trans road for node:%s(%s) failed, session_id:%lu", + node->GetName().c_str(), node->GetType().c_str(), graph->GetSessionID()); GELOGE(INTERNAL_ERROR, "Failed to update the format fusion road for var %s", node->GetName().c_str()); return INTERNAL_ERROR; } ret = VarManager::Instance(graph->GetSessionID())->SetChangedGraphId(node->GetName(), graph_id); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update graph_id:%u for node:%s(%s) failed, session_id:%lu", + graph_id, node->GetName().c_str(), node->GetType().c_str(), graph->GetSessionID()); GELOGE(INTERNAL_ERROR, "Failed to update the graph id for var %s", node->GetName().c_str()); return INTERNAL_ERROR; } @@ -210,10 +228,14 @@ Status VariableOpPass::DealFusion(const ge::NodePtr &var_node) { trans_node->GetType().c_str(), var_node->GetName().c_str()); if (GraphUtils::IsolateNode(trans_node, {0}) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate node:%s(%s) failed", + trans_node->GetName().c_str(), trans_node->GetType().c_str()); return GE_GRAPH_VARIABLE_OP_PASS_FAILED; } if (GraphUtils::RemoveNodeWithoutRelink(graph, trans_node) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + trans_node->GetName().c_str(), trans_node->GetType().c_str(), graph->GetName().c_str()); return GE_GRAPH_VARIABLE_OP_PASS_FAILED; } } @@ -245,9 +267,13 @@ Status VariableOpPass::DealFusion(const ge::NodePtr &var_node) { " one output data nodes, isolate and remove it.", trans_node->GetName().c_str(), trans_node->GetType().c_str(), ref_node->GetName().c_str()); if (GraphUtils::IsolateNode(trans_node, {0}) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate node:%s(%s) failed", + trans_node->GetName().c_str(), trans_node->GetType().c_str()); return GE_GRAPH_VARIABLE_OP_PASS_FAILED; } if (GraphUtils::RemoveNodeWithoutRelink(graph, trans_node) != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + 
trans_node->GetName().c_str(), trans_node->GetType().c_str(), graph->GetName().c_str()); return GE_GRAPH_VARIABLE_OP_PASS_FAILED; } } @@ -365,6 +391,7 @@ Status VariableOpPass::CheckVariableRefLegally(const ge::NodePtr &var_node, bool Status VariableOpPass::UpdateVarAndRefOutputFormatInfo(const GeTensorDesc &final_output, const ge::NodePtr &node) { if (node == nullptr || node->GetOpDesc() == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node or its op_desc is nullptr, check invalid"); GELOGE(FAILED, "node or opdesc is nullptr"); return FAILED; } @@ -377,6 +404,8 @@ Status VariableOpPass::UpdateVarAndRefOutputFormatInfo(const GeTensorDesc &final auto node_desc = node->GetOpDesc()->GetOutputDesc(0); CopyVariableFormatDataTypeAndShape(final_output, node_desc); if (node->GetOpDesc()->UpdateOutputDesc(0, node_desc) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Update output:0 desc in op:%s(%s) failed", + node->GetName().c_str(), node->GetType().c_str()); GELOGE(FAILED, "update output desc fail."); return FAILED; } @@ -460,6 +489,10 @@ Status VariableOpPass::CheckVarAndVarRefAreAlike(const NodePtr &var_node, const GELOGD("var_ref_node_trans_nodes size is %zu.", var_ref_node_trans_nodes.size()); if (var_ref_node_trans_nodes.size() > 1) { + REPORT_INNER_ERROR("E19999", "In data node num:%zu of node:%s(%s) bigger than 1, check invalid", + var_ref_node_trans_nodes.size(), + var_ref_node->GetName().c_str(), var_ref_node->GetType().c_str()); + GELOGE(GE_GRAPH_VARIABLE_OP_PASS_FAILED, "var_ref_node_trans_nodes.size() > 1."); return GE_GRAPH_VARIABLE_OP_PASS_FAILED; } @@ -525,6 +558,7 @@ void VariableOpPass::CopyVariableFormatDataTypeAndShape(const GeTensorDesc &src_ Status VariableOpPass::CheckIfCouldBeOptimized(const ge::NodePtr &node, bool &flag, VarTransRoad &fusion_road) { if (node == nullptr) { + REPORT_INNER_ERROR("E19999", "Param node is nullptr, check invalid"); return FAILED; } bool is_matched = false; @@ -602,6 +636,8 @@ Status VariableOpPass::RenewVarDesc(ge::ComputeGraphPtr &graph) { GE_CHECK_NOTNULL(node->GetOpDesc()); ret = ge::VarManager::Instance(graph->GetSessionID())->RenewCurVarDesc(node->GetName(), node->GetOpDesc()); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Renew descriptor for node:%s(%s) failed, session_id:%lu", + node->GetName().c_str(), node->GetType().c_str(), graph->GetSessionID()); GELOGE(FAILED, "var manager renew var[%s] descriptor failed!", node->GetName().c_str()); return FAILED; } @@ -626,6 +662,8 @@ Status VariableOpPass::RenewVarDesc(uint64_t session_id, const NodePtr &node, co GE_CHECK_NOTNULL(node->GetOpDesc()); Status ret = ge::VarManager::Instance(session_id)->RenewCurVarDesc(node->GetName(), node->GetOpDesc()); if (ret != SUCCESS) { + REPORT_CALL_ERROR("E19999", "Renew descriptor for node:%s(%s) failed, session_id:%lu", + node->GetName().c_str(), node->GetType().c_str(), session_id); GELOGE(FAILED, "var manager renew var[%s] descriptor failed!", node->GetName().c_str()); return FAILED; } diff --git a/ge/graph/passes/variable_ref_delete_op_pass.cc b/ge/graph/passes/variable_ref_delete_op_pass.cc index 8e625857..a0e0bcba 100644 --- a/ge/graph/passes/variable_ref_delete_op_pass.cc +++ b/ge/graph/passes/variable_ref_delete_op_pass.cc @@ -35,6 +35,8 @@ Status VariableRefDeleteOpPass::Run(ge::ComputeGraphPtr graph) { continue; } if (all_var_names.count(ref_var_src_var_name) == 0) { + REPORT_INNER_ERROR("E19999", "Can not find source variable[%s] of variable ref[%s], check invalid", + ref_var_src_var_name.c_str(), node->GetName().c_str());
GELOGE(FAILED, "Can not find source variable[%s] of variable ref[%s]", ref_var_src_var_name.c_str(), node->GetName().c_str()); return FAILED; @@ -53,6 +55,8 @@ Status VariableRefDeleteOpPass::DealVariableRef(ge::ComputeGraphPtr &graph, ge:: GE_CHECK_NOTNULL(variable_ref); auto inAnchor0 = variable_ref->GetInDataAnchor(0); if (inAnchor0 == nullptr) { + REPORT_INNER_ERROR("E19999", "Node:%s(%s) has no input anchor, check invalid", + variable_ref->GetName().c_str(), variable_ref->GetType().c_str()); GELOGE(FAILED, "variable_ref [%s] no input", variable_ref->GetName().c_str()); return FAILED; } @@ -73,17 +77,23 @@ Status VariableRefDeleteOpPass::DealVariableRef(ge::ComputeGraphPtr &graph, ge:: GELOGI("[%s-%d]: add attr [REF_VAR_SRC_VAR_NAME: %s ] ", peer_node->GetName().c_str(), index, ref_var_src_var_name.c_str()); } else { + REPORT_CALL_ERROR("E19999", "Set Attr:%s to output:%d desc of op:%s(%s) failed", REF_VAR_SRC_VAR_NAME.c_str(), + index, op_desc->GetName().c_str(), op_desc->GetType().c_str()); GELOGE(FAILED, "[%s-%d]: add attr [REF_VAR_SRC_VAR_NAME: %s ] failed", peer_node->GetName().c_str(), index, ref_var_src_var_name.c_str()); return FAILED; } // remove variable_ref if (GraphUtils::IsolateNode(variable_ref, {0}) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Isolate node:%s(%s) failed", + variable_ref->GetName().c_str(), variable_ref->GetType().c_str()); GELOGE(INTERNAL_ERROR, "Isolate removed node: %s, type: %s failed", variable_ref->GetName().c_str(), variable_ref->GetType().c_str()); return FAILED; } if (GraphUtils::RemoveNodeWithoutRelink(graph, variable_ref) != GRAPH_SUCCESS) { + REPORT_CALL_ERROR("E19999", "Remove node:%s(%s) without relink in graph:%s failed", + variable_ref->GetName().c_str(), variable_ref->GetType().c_str(), graph->GetName().c_str()); GELOGE(INTERNAL_ERROR, "Remove node: %s, type: %s without relink failed", variable_ref->GetName().c_str(), variable_ref->GetType().c_str()); return FAILED; diff --git a/ge/graph/preprocess/graph_preprocess.cc b/ge/graph/preprocess/graph_preprocess.cc index 4fb80646..2d06cd5d 100644 --- a/ge/graph/preprocess/graph_preprocess.cc +++ b/ge/graph/preprocess/graph_preprocess.cc @@ -27,6 +27,7 @@ #include "common/helper/model_helper.h" #include "common/math/math_util.h" #include "common/op/ge_op_utils.h" +#include "ir_build/option_utils.h" #include "graph/common/ge_call_wrapper.h" #include "graph/common/local_context.h" #include "graph/common/transop_util.h" @@ -991,101 +992,6 @@ Status ProcessNetoutputNodeDynShape(NodePtr &node) { } return SUCCESS; } -long StringToLongNoThrow(const string &str) { - try { - return std::stol(str); - } catch (const std::invalid_argument) { - GELOGE(PARAM_INVALID, - "Parse shape range of input failed when transfer from string to int64. Given %s, while correct example:" - "\"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"", - str.c_str()); - return PARAM_INVALID; - } catch (const std::out_of_range) { - GELOGE(PARAM_INVALID, - "Parse shape range of input failed when transfer from string to int64. 
Given %s, while correct example:" - "\"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"", - str.c_str()); - return PARAM_INVALID; - } -} -/** - * Parser shape_range from string to vector - * shape_range from option normally is "[1~20,3,3~6,-1],[1~20,3,3~6,-1]" - * @param shape_range - */ -Status ParseDynamicInputShapeRange(const std::string &shape_range, - std::vector>> &range) { - if (shape_range.size() < 2) { - REPORT_INNER_ERROR("E19999", "shape_range.size:%zu < 2, check invalid", shape_range.size()); - GELOGE(PARAM_INVALID, "Shape range %s is invalid.", shape_range.c_str()); - return PARAM_INVALID; - } - // different shape_range of single input are split by ']' - vector shape_range_set = ge::StringUtils::Split(shape_range, ']'); - if (shape_range_set.empty()) { - REPORT_INNER_ERROR("E19999", "Shape range %s is not valid. Correct example: \"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"", - shape_range.c_str()); - GELOGE(PARAM_INVALID, "Shape range %s is not valid. Correct example: \"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"", - shape_range.c_str()); - return PARAM_INVALID; - } - for (auto &shape_range_str : shape_range_set) { - if (shape_range_str.size() < 3) { - // shape_range_str should be "[2~3,1" - // or ",[2~3,1". because we should trim '[' or ',[' - // so shape_range_str.size() < 3 is invalid - continue; - } - // trim start bytes, after that, single input should be "1~20,3,3~6,-1" - if (ge::StringUtils::StartWith(shape_range_str, "[")) { - shape_range_str = shape_range_str.substr(1, shape_range_str.size()); - } - if (ge::StringUtils::StartWith(shape_range_str, ",")) { - shape_range_str = shape_range_str.substr(2, shape_range_str.size()); - } - - // parse shape_range of single input. eg. "1~20,3,3~6,-1" - std::vector> range_of_single_input; - vector dim_range_set = ge::StringUtils::Split(shape_range_str, ','); - for (const auto &range_pair_str : dim_range_set) { - vector range_pair_set = ge::StringUtils::Split(range_pair_str, '~'); - pair range_pair; - if (range_pair_set.size() == 1) { - // fix dim - auto range_value = StringToLongNoThrow(range_pair_set.at(0).c_str()); - if (range_value < 0) { - range_pair = std::make_pair(1, range_value); - } else { - range_pair = std::make_pair(range_value, range_value); - } - } else if (range_pair_set.size() == 2) { - // unknown dim, should get range. - auto range_left = StringToLongNoThrow(range_pair_set.at(0).c_str()); - auto range_right = StringToLongNoThrow(range_pair_set.at(1).c_str()); - if (range_left < 0 || range_right < 0) { - REPORT_INNER_ERROR("E19999", "Shape range of input is invalid. Given range pair [%ld,%ld], " - "while correct example: \"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"", range_left, range_right); - GELOGE(PARAM_INVALID, - "Shape range of input is invalid. Given range pair [%ld,%ld], while correct example: " - "\"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"", - range_left, range_right); - return PARAM_INVALID; - } - range_pair = std::make_pair(range_left, range_right); - } else { - REPORT_INNER_ERROR("E19999", "Shape range of input is invalid. Given %s, " - "while correct example: \"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"", shape_range.c_str()); - GELOGE(PARAM_INVALID, - "Shape range of input is invalid. 
Given %s, while correct example: \"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\"", - shape_range.c_str()); - return PARAM_INVALID; - } - range_of_single_input.emplace_back(range_pair); - } - range.emplace_back(range_of_single_input); - } - return SUCCESS; -} Status GetDynamicInputShapeRange(const std::vector &user_input, const std::map &graph_option, vector>> &range_vec) { @@ -1114,9 +1020,10 @@ Status GetDynamicInputShapeRange(const std::vector &user_input, const OPTION_EXEC_DYNAMIC_EXECUTE_MODE, OPTION_EXEC_DATA_INPUTS_SHAPE_RANGE); return PARAM_INVALID; } - - auto ret = ParseDynamicInputShapeRange(iter->second, range_vec); - GE_CHK_STATUS_RET(ret, "Parse dynamic input shape range failed."); + if (ParseInputShapeRange(iter->second, range_vec) != SUCCESS) { + GELOGE(PARAM_INVALID, "[Parse][ShapeRange] Parse dynamic input shape range failed."); + return PARAM_INVALID; + } if (range_vec.size() != user_input.size()) { GELOGE(PARAM_INVALID, "Dynamic input shape range size is %zu, inputs size is %zu. Not match.", range_vec.size(), user_input.size()); diff --git a/ge/hybrid/executor/hybrid_execution_context.cc b/ge/hybrid/executor/hybrid_execution_context.cc index bde30932..f1357285 100644 --- a/ge/hybrid/executor/hybrid_execution_context.cc +++ b/ge/hybrid/executor/hybrid_execution_context.cc @@ -63,5 +63,27 @@ Status GraphExecutionContext::Synchronize(rtStream_t rt_stream) { REPORT_CALL_ERROR("E19999", "invoke rtStreamSynchronize failed, ret = %d", rt_ret); return RT_FAILED; } + +Status GraphExecutionContext::DumpExceptionInfo(const std::vector &exception_infos) { + if (exception_infos.empty()) { + GELOGI("[Dump][ExceptionInfo] Exception info is null."); + return SUCCESS; + } + GELOGI("[Dump][ExceptionInfo] Start to search dynamic op info and to dump."); + if (exception_dumper.DumpExceptionInfo(exception_infos) != SUCCESS) { + GELOGE(FAILED, "[Dump][Exception] Dump dynamic op exception info failed."); + return FAILED; + } + GELOGI("[Dump][ExceptionInfo] Start to search static op info and to dump."); + for (const auto &iter : davinci_model) { + if (iter != nullptr) { + if (iter->DumpExceptionInfo(exception_infos) != SUCCESS) { + GELOGE(FAILED, "[Dump][ExceptionInfo] Dump static op exception info failed."); + return FAILED; + } + } + } + return SUCCESS; +} } // namespace hybrid } // namespace ge \ No newline at end of file diff --git a/ge/hybrid/executor/hybrid_execution_context.h b/ge/hybrid/executor/hybrid_execution_context.h index 003e8010..67a96e98 100644 --- a/ge/hybrid/executor/hybrid_execution_context.h +++ b/ge/hybrid/executor/hybrid_execution_context.h @@ -23,6 +23,7 @@ #include "common/properties_manager.h" #include "framework/common/debug/ge_log.h" #include "graph/ge_local_context.h" +#include "graph/load/model_manager/davinci_model.h" #include "hybrid/common/npu_memory_allocator.h" #include "hybrid/common/tensor_value.h" #include "hybrid/executor/hybrid_profiler.h" @@ -54,6 +55,7 @@ struct GraphExecutionContext { void SetErrorCode(Status error_code); Status GetStatus() const; Status Synchronize(rtStream_t rt_stream); + Status DumpExceptionInfo(const std::vector &exception_infos); uint64_t session_id = 0; uint64_t context_id = 0; @@ -68,7 +70,9 @@ struct GraphExecutionContext { DumpProperties dump_properties; bool trace_enabled = false; bool dump_enabled = false; - std::atomic_bool is_eos_; + ExceptionDumper exception_dumper; + std::vector> davinci_model; + std::atomic_bool is_eos_{false}; long profiling_level = 0; long iteration = 0; void *global_step = nullptr; diff --git 
a/ge/hybrid/executor/hybrid_model_async_executor.cc b/ge/hybrid/executor/hybrid_model_async_executor.cc index ca505618..3294a286 100644 --- a/ge/hybrid/executor/hybrid_model_async_executor.cc +++ b/ge/hybrid/executor/hybrid_model_async_executor.cc @@ -19,7 +19,6 @@ #include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" #include "graph/ge_context.h" -#include "omm/csa_interact.h" namespace ge { namespace hybrid { @@ -144,12 +143,14 @@ Status HybridModelAsyncExecutor::RunInternal() { GE_MAKE_GUARD(not_used_var, [&] { GE_CHK_RT(rtDeviceReset(device_id)); }); while (run_flag_) { + // Model has not actually started running before data is received + SetRunningFlag(false); std::shared_ptr data_wrapper; Status ret = data_inputer_->Pop(data_wrapper); - if (data_wrapper == nullptr || ret != SUCCESS) { - GELOGI("data_wrapper is null!, ret = %u", ret); - continue; - } + // Model actually starts running + SetRunningFlag(true); + GE_IF_BOOL_EXEC(data_wrapper == nullptr || ret != SUCCESS, GELOGI("data_wrapper is null!, ret = %u", ret); + continue); GELOGI("Getting the input data, model_id:%u", model_id_); GE_IF_BOOL_EXEC(!run_flag_, break); @@ -161,7 +162,6 @@ Status HybridModelAsyncExecutor::RunInternal() { ret = PreRun(current_data, args); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( ret != SUCCESS, (void) HandleResult(ret, current_data.index, args, data_wrapper->GetOutput()); - CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); continue, "[Invoke][PreRun] failed, model_id:%u.", model_id_); // [No need to check value] if (pipe_executor_ != nullptr) { @@ -179,16 +179,15 @@ Status HybridModelAsyncExecutor::RunInternal() { } ret = HandleResult(ret, current_data.index, args, data_wrapper->GetOutput()); if (ret != SUCCESS) { - CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC); continue; } RECORD_MODEL_EXECUTION_EVENT(executor_->GetContext(), "[RunInternal] [iteration = %d] End", iterator_count_); iterator_count_++; - GELOGI("run iterator count is %lu", iterator_count_); + SetRunningFlag(false); + GELOGI("run iterator count is %lu, model_id:%u", iterator_count_, model_id_); } - CsaInteract::GetInstance().WriteInternalErrorCode(); GELOGI("Model run end, model id:%u", model_id_); return SUCCESS; } diff --git a/ge/hybrid/executor/hybrid_model_async_executor.h b/ge/hybrid/executor/hybrid_model_async_executor.h index b6942b10..c5a6533a 100644 --- a/ge/hybrid/executor/hybrid_model_async_executor.h +++ b/ge/hybrid/executor/hybrid_model_async_executor.h @@ -55,6 +55,14 @@ class HybridModelAsyncExecutor { Status EnqueueData(const std::shared_ptr &data); + uint32_t GetDataInputerSize() { return data_inputer_->Size(); } + + bool GetRunningFlag() const { return running_flag_; } + + void SetRunningFlag(bool flag) { running_flag_ = flag; } + + const GraphExecutionContext * GeContext() { return executor_->GetContext(); } + private: Status InitInputDesc(); @@ -84,6 +92,8 @@ class HybridModelAsyncExecutor { uint32_t device_id_ = 0U; uint32_t model_id_ = 0U; std::atomic_bool run_flag_; + // check whether model is running with data + bool running_flag_ = false; std::unique_ptr data_inputer_; std::unique_ptr executor_; std::unique_ptr pipe_executor_; diff --git a/ge/hybrid/executor/hybrid_model_executor.cc b/ge/hybrid/executor/hybrid_model_executor.cc index 4a8a0af0..2ab4ed5d 100755 --- a/ge/hybrid/executor/hybrid_model_executor.cc +++ b/ge/hybrid/executor/hybrid_model_executor.cc @@ -18,6 +18,7 @@ #include "graph/ge_context.h"
#include "graph/runtime_inference_context.h" #include "graph/utils/tensor_utils.h" +#include "graph/load/model_manager/model_manager.h" #include "common/dump/dump_manager.h" #include "common/profiling/profiling_manager.h" @@ -102,7 +103,22 @@ Status HybridModelExecutor::ExecuteGraphInternal(SubgraphExecutor &executor, } if (!model_->IsSingleOp()) { - HYBRID_CHK_STATUS_RET(executor.Synchronize(), "Failed to sync root graph."); + Status ret = executor.Synchronize(); + if (ret != ge::SUCCESS) { + auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + auto exception_infos = model_manager->GetExceptionInfos(); + if (!exception_infos.empty()) { + HYBRID_CHK_STATUS_RET(context_.DumpExceptionInfo(exception_infos), + "[Execute][GraphInternal] Dump exception info failed."); + } + if (ret == ge::END_OF_SEQUENCE) { + GELOGD("Got end of sequence"); + } else { + GELOGE(ret, "[Execute][GraphInternal] Synchronize failed."); + } + return ret; + } RECORD_MODEL_EXECUTION_EVENT(&context_, "[Synchronize] End"); } @@ -175,19 +191,16 @@ Status HybridModelExecutor::CheckInputShapeByShapeRange(const GraphItem *graph_i HybridModelExecutor::ExecuteArgs &args) { GE_CHECK_NOTNULL(graph_item); auto input_nodes = graph_item->GetInputNodes(); - if (args.input_desc.size() < input_nodes.size()) { - REPORT_INNER_ERROR("E19999", "[%s] Number of inputs [%zu] is not sufficient for graph which needs [%zu] inputs.", - graph_item->GetName().c_str(), args.input_desc.size(), input_nodes.size()); - GELOGE(INTERNAL_ERROR, "[%s] Number of inputs [%zu] is not sufficient for graph which needs [%zu] inputs.", - graph_item->GetName().c_str(), args.input_desc.size(), input_nodes.size()); - return INTERNAL_ERROR; - } for (size_t i = 0; i < input_nodes.size(); ++i) { auto &input_node = input_nodes[i]; if (input_node == nullptr) { GELOGD("[%s] Input[%zu] is not needed by graph, skip it.", graph_item->GetName().c_str(), i); continue; } + if (!input_node->is_dynamic) { + GELOGD("[%s] Input[%zu] is not dynamic, skip it.", graph_item->GetName().c_str(), i); + continue; + } GeTensorDescPtr model_input_desc = input_node->MutableInputDesc(0); GE_CHECK_NOTNULL(model_input_desc); std::vector> shape_range; @@ -200,6 +213,13 @@ Status HybridModelExecutor::CheckInputShapeByShapeRange(const GraphItem *graph_i GELOGD("[%s] Input[%zu] shape is not needed to check by shape range, skip it.", graph_item->GetName().c_str(), i); continue; } + if (i >= args.input_desc.size()) { + REPORT_INNER_ERROR("E19999", "[%s] Inputs[%zu] is greater than or equal to input desc size[%zu].", + graph_item->GetName().c_str(), i, args.input_desc.size()); + GELOGE(INTERNAL_ERROR, "[%s] inputs[%zu] is greater than or equal to input desc size[%zu].", + graph_item->GetName().c_str(), i, args.input_desc.size()); + return INTERNAL_ERROR; + } ConstGeTensorDescPtr args_tensor_desc = args.input_desc[i]; GE_CHECK_NOTNULL(args_tensor_desc); GeShape shape = args_tensor_desc->GetShape(); diff --git a/ge/hybrid/executor/hybrid_model_pipeline_executor.cc b/ge/hybrid/executor/hybrid_model_pipeline_executor.cc index a5de7c22..b2a77653 100644 --- a/ge/hybrid/executor/hybrid_model_pipeline_executor.cc +++ b/ge/hybrid/executor/hybrid_model_pipeline_executor.cc @@ -4,6 +4,7 @@ #include "common/dump/dump_manager.h" #include "graph/ge_context.h" #include "graph/runtime_inference_context.h" +#include "graph/load/model_manager/model_manager.h" namespace ge { namespace hybrid { @@ -266,6 +267,13 @@ Status HybridModelPipelineExecutor::Execute(HybridModelExecutor::ExecuteArgs 
&ar ret = stage_executors_[i]->Synchronize(); if (ret != SUCCESS) { + auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + auto exception_infos = model_manager->GetExceptionInfos(); + if (!exception_infos.empty()) { + HYBRID_CHK_STATUS_RET(context_.DumpExceptionInfo(exception_infos), + "[Execute][GraphInternal] Dump exception info failed."); + } GELOGE(ret, "[Invoke][Synchronize] failed for [Executor: %zu].", i); REPORT_CALL_ERROR("E19999", "[Executor: %zu] failed to Synchronize result.", i); has_error = true; diff --git a/ge/hybrid/executor/worker/execution_engine.cc b/ge/hybrid/executor/worker/execution_engine.cc index 24713f96..d6bbc36d 100755 --- a/ge/hybrid/executor/worker/execution_engine.cc +++ b/ge/hybrid/executor/worker/execution_engine.cc @@ -19,6 +19,7 @@ #include "graph/utils/tensor_utils.h" #include "graph/utils/tensor_adapter.h" #include "graph/debug/ge_attr_define.h" +#include "graph/load/model_manager/model_manager.h" #include "hybrid/node_executor/node_executor.h" #include "hybrid/executor//worker//shape_inference_engine.h" #include "common/dump/dump_op.h" @@ -70,6 +71,7 @@ class NodeDoneCallback { Status PrepareConstInputs(const NodeItem &node_item); Status DumpDynamicNode(); Status ProfilingReport(); + Status SaveDumpOpInfo(); Status GetTaskDescInfo(const NodePtr node, const HybridModel *model, std::vector &task_desc_info); GraphExecutionContext *graph_context_; @@ -266,6 +268,40 @@ Status NodeDoneCallback::DumpDynamicNode() { return SUCCESS; } +Status NodeDoneCallback::SaveDumpOpInfo() { + GE_CHECK_NOTNULL(graph_context_); + GE_CHECK_NOTNULL(graph_context_->model); + + auto node = context_->GetNodeItem().node; + if (node == nullptr) { + GELOGE(PARAM_INVALID, "[Save][DumpOpInfo] Get node is nullptr."); + return PARAM_INVALID; + } + auto op_desc = node->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + + vector input_addrs; + vector output_addrs; + for (int i = 0; i < context_->NumInputs(); i++) { + auto tensor_value = context_->GetInput(i); + GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "[Save][DumpOpInfo] Tensor value is nullptr."); + void *input_addr = const_cast(tensor_value->GetData()); + input_addrs.emplace_back(input_addr); + } + for (int j = 0; j < context_->NumOutputs(); j++) { + auto tensor_value = context_->GetOutput(j); + GE_CHK_BOOL_RET_STATUS(tensor_value != nullptr, PARAM_INVALID, "[Save][DumpOpInfo] Tensor value is nullptr."); + void *output_addr = const_cast(tensor_value->GetData()); + output_addrs.emplace_back(output_addr); + } + + uint32_t stream_id = context_->GetStreamId(); + uint32_t task_id = context_->GetTaskId(); + graph_context_->exception_dumper.SaveDumpOpInfo(op_desc, task_id, stream_id, input_addrs, output_addrs); + + return SUCCESS; +} + Status NodeDoneCallback::OnNodeDone() { auto &node_item = context_->GetNodeItem(); GELOGI("[%s] Start callback process.", node_item.NodeName().c_str()); @@ -278,6 +314,12 @@ Status NodeDoneCallback::OnNodeDone() { GE_CHK_STATUS_RET(DumpDynamicNode(), "[Call][DumpDynamicNode] Failed."); } + auto model_manager = ModelManager::GetInstance(); + GE_CHECK_NOTNULL(model_manager); + if (model_manager->IsDumpExceptionOpen()) { + GE_CHK_STATUS_RET(SaveDumpOpInfo(), "[Save][DumpOpInfo] Failed to dump op info."); + } + if (ProfilingManager::Instance().ProfilingModelExecuteOn()) { GE_CHK_STATUS_RET(ProfilingReport(), "[Report][Profiling] of node[%s] failed.", node_item.NodeName().c_str()); } @@ -322,20 +364,28 @@ Status ExecutionEngine::ExecuteAsync(NodeState &node_state, 
GraphExecutionContext &execution_context) { GELOGI("[%s] Node is ready for execution", task_context->GetNodeName()); RECORD_EXECUTION_EVENT(&execution_context, task_context->GetNodeName(), "Start"); - auto cb = std::shared_ptr(new(std::nothrow) NodeDoneCallback(&execution_context, task_context)); - GE_CHECK_NOTNULL(cb); - auto callback = [task_context, cb]() { - auto ret = cb->OnNodeDone(); - if (ret != SUCCESS) { - task_context->OnError(ret); - } - }; - + std::function callback = nullptr; + GE_CHK_STATUS_RET_NOLOG(InitCallback(task_context, execution_context, callback)); GE_CHK_STATUS_RET_NOLOG(DoExecuteAsync(node_state, *task_context, execution_context, callback)); GE_CHK_STATUS_RET_NOLOG(PropagateOutputs(*node_state.GetNodeItem(), *task_context, execution_context)); return SUCCESS; } +Status ExecutionEngine::InitCallback(const std::shared_ptr &task_context, + GraphExecutionContext &execution_context, std::function &callback) { + if (task_context->NeedCallback()) { + auto cb = std::shared_ptr(new(std::nothrow) NodeDoneCallback(&execution_context, task_context)); + GE_CHECK_NOTNULL(cb); + callback = [task_context, cb]() { + auto ret = cb->OnNodeDone(); + if (ret != SUCCESS) { + task_context->OnError(ret); + } + }; + } + return SUCCESS; +} + Status ExecutionEngine::DoExecuteAsync(NodeState &node_state, TaskContext &task_context, GraphExecutionContext &context, @@ -343,7 +393,7 @@ Status ExecutionEngine::DoExecuteAsync(NodeState &node_state, const auto &task = node_state.GetKernelTask(); if (task == nullptr) { GELOGE(INTERNAL_ERROR, "[Get][KernelTask] of [%s] is null.", node_state.GetName().c_str()); - REPORT_INNER_ERROR("E19999", "GetKernelTask of %s is null.", node_state.GetName().c_str()); + REPORT_INNER_ERROR("E19999", "GetKernelTask of %s failed.", node_state.GetName().c_str()); return INTERNAL_ERROR; } diff --git a/ge/hybrid/executor/worker/execution_engine.h b/ge/hybrid/executor/worker/execution_engine.h index ad80d99b..c10ad729 100644 --- a/ge/hybrid/executor/worker/execution_engine.h +++ b/ge/hybrid/executor/worker/execution_engine.h @@ -35,6 +35,8 @@ class ExecutionEngine { TaskContext &task_context, GraphExecutionContext &context, const std::function &callback); + static Status InitCallback(const std::shared_ptr &task_context, + GraphExecutionContext &execution_context, std::function &callback); }; } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/hybrid_davinci_model.cc b/ge/hybrid/hybrid_davinci_model.cc index c741fe7e..0ad1c865 100755 --- a/ge/hybrid/hybrid_davinci_model.cc +++ b/ge/hybrid/hybrid_davinci_model.cc @@ -19,6 +19,7 @@ #include "hybrid/model/hybrid_model.h" #include "hybrid/executor/hybrid_model_async_executor.h" #include "hybrid/node_executor/node_executor.h" +#include "graph/manager/graph_manager_utils.h" namespace ge { namespace hybrid { @@ -32,9 +33,10 @@ class HybridDavinciModel::Impl { } Status Init() { - GE_CHK_STATUS_RET(NodeExecutorManager::GetInstance().EnsureInitialized(), "Failed to initialize executors"); - GE_CHK_STATUS_RET(model_.Init(), "Failed to init model.") - GE_CHK_STATUS_RET(executor_.Init(), "Failed to init model executor.") + GE_CHK_STATUS_RET(NodeExecutorManager::GetInstance().EnsureInitialized(), + "[Initialize][NodeExecutorManager] failed"); + GE_CHK_STATUS_RET(model_.Init(), "[Init][HybridModel] failed.") + GE_CHK_STATUS_RET(executor_.Init(), "[Init][HybridModelAsyncExecutor] failed.") return SUCCESS; } @@ -80,6 +82,12 @@ class HybridDavinciModel::Impl { model_.SetOmName(model_name); } + uint32_t GetDeviceId() { + return 
model_.GetDeviceId(); + } + + const GraphExecutionContext * GeContext() { return executor_.GeContext(); } + uint64_t GetSessionId() { return model_.GetSessionId(); } @@ -107,6 +115,17 @@ class HybridDavinciModel::Impl { model_.SetModelDescVersion(is_new_model_desc); } + uint32_t GetDataInputerSize() { return executor_.GetDataInputerSize(); } + + bool GetRunningFlag() const { return executor_.GetRunningFlag(); } + + Status SetRunAsyncListenerCallback(const RunAsyncCallback &callback) { + auto listener = dynamic_cast(listener_.get()); + GE_CHECK_NOTNULL(listener); + listener->SetCallback(callback); + return SUCCESS; + } + private: std::shared_ptr listener_; HybridModel model_; @@ -186,6 +205,11 @@ void HybridDavinciModel::SetOmName(const string &om_name) { } } +uint32_t HybridDavinciModel::GetDeviceId() const { + GE_CHECK_NOTNULL(impl_); + return impl_->GetDeviceId(); +} + Status HybridDavinciModel::GetDynamicBatchInfo(std::vector> &batch_info, int32_t &dynamic_type) { GE_CHECK_NOTNULL(impl_); return impl_->GetDynamicBatchInfo(batch_info, dynamic_type); @@ -221,5 +245,33 @@ uint64_t HybridDavinciModel::GetSessionId() { GE_CHECK_NOTNULL(impl_); return impl_->GetSessionId(); } + +uint32_t HybridDavinciModel::GetDataInputerSize() { + GE_CHECK_NOTNULL(impl_); + return impl_->GetDataInputerSize(); +} + +bool HybridDavinciModel::GetRunningFlag() const { return impl_->GetRunningFlag(); } + +Status HybridDavinciModel::SetRunAsyncListenerCallback(const RunAsyncCallback &callback) { + return impl_->SetRunAsyncListenerCallback(callback); +} + +bool HybridDavinciModel::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const { + if (impl_ == nullptr) { + return false; + } + auto context = impl_->GeContext(); + GE_CHECK_NOTNULL(context); + bool ret = context->exception_dumper.GetOpDescInfo(stream_id, task_id, op_desc_info); + if (!ret) { + for (const auto &iter : context->davinci_model) { + if (iter->GetOpDescInfo(stream_id, task_id, op_desc_info)) { + return true; + } + } + } + return ret; +} } // namespace hybrid } // namespace ge diff --git a/ge/hybrid/hybrid_davinci_model.h b/ge/hybrid/hybrid_davinci_model.h index 3b3473ff..472fff17 100644 --- a/ge/hybrid/hybrid_davinci_model.h +++ b/ge/hybrid/hybrid_davinci_model.h @@ -61,6 +61,8 @@ class HybridDavinciModel { uint64_t GetSessionId(); + uint32_t GetDeviceId() const; + Status GetDynamicBatchInfo(std::vector> &batch_info, int32_t &dynamic_type); void GetUserDesignateShapeOrder(std::vector &user_input_shape_order); @@ -74,6 +76,14 @@ class HybridDavinciModel { void SetModelDescVersion(bool is_new_model_desc); + uint32_t GetDataInputerSize(); + + bool GetRunningFlag() const; + + Status SetRunAsyncListenerCallback(const RunAsyncCallback &callback); + + bool GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const; + private: HybridDavinciModel() = default; class Impl; diff --git a/ge/hybrid/hybrid_davinci_model_stub.cc b/ge/hybrid/hybrid_davinci_model_stub.cc index 67a7a101..2d4fbe03 100644 --- a/ge/hybrid/hybrid_davinci_model_stub.cc +++ b/ge/hybrid/hybrid_davinci_model_stub.cc @@ -68,6 +68,14 @@ uint64_t HybridDavinciModel::GetSessionId() { return 0; } +uint32_t HybridDavinciModel::GetDataInputerSize() { + return 0; +} + +uint32_t HybridDavinciModel::GetDeviceId() const { + return 0; +} + Status HybridDavinciModel::GetDynamicBatchInfo(std::vector> &batch_info, int32_t &dynamic_type) { return UNSUPPORTED; } @@ -87,5 +95,17 @@ Status HybridDavinciModel::GetInputOutputDescInfo(vector &i void 
HybridDavinciModel::SetModelDescVersion(bool is_new_model_desc) { } + +bool HybridDavinciModel::GetRunningFlag() const { + return false; +} + +Status HybridDavinciModel::SetRunAsyncListenerCallback(const RunAsyncCallback &callback) { + return UNSUPPORTED; +} + +bool HybridDavinciModel::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const { + return true; +} } // namespace hybrid } // namespace ge \ No newline at end of file diff --git a/ge/hybrid/model/hybrid_model_builder.cc b/ge/hybrid/model/hybrid_model_builder.cc index 60fdf55a..a047a05b 100755 --- a/ge/hybrid/model/hybrid_model_builder.cc +++ b/ge/hybrid/model/hybrid_model_builder.cc @@ -134,7 +134,7 @@ HybridModelBuilder::HybridModelBuilder(HybridModel &hybrid_model) Status HybridModelBuilder::Build() { GE_CHK_STATUS_RET(ValidateParams(), "[Invoke][ValidateParams] failed, model_name_:[%s]", GetGraphName()); - hybrid_model_.model_name_ = ge_root_model_->GetRootGraph()->GetName(); + hybrid_model_.model_name_ = ge_root_model_->GetModelName(); GELOGI("[%s] Start to build hybrid model.", GetGraphName()); GE_CHK_STATUS_RET(InitRuntimeParams(), "[Invoke][InitRuntimeParams] failed, model_name_:[%s]", GetGraphName()); GE_CHK_STATUS_RET(RecoverGraphUnknownFlag(), @@ -277,7 +277,7 @@ Status HybridModelBuilder::ParseForceInfershapeNodes(const NodePtr &node, NodeIt auto op_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); // not care result, if no this attr, stand for the op does not need force infershape - (void)AttrUtils::GetBool(op_desc, kForceInfershape, node_item.is_need_force_infershape); + (void) AttrUtils::GetBool(op_desc, kForceInfershape, node_item.is_need_force_infershape); GELOGD("node [%s] is need do infershape, flag is %d", op_desc->GetName().c_str(), node_item.is_need_force_infershape); @@ -323,6 +323,18 @@ Status HybridModelBuilder::ParseDependentInputNodes(NodeItem &node_item, const s } } + for (const auto &src_node : ge_node->GetInControlNodes()) { + auto src_node_item = MutableNodeItem(src_node); + if ((src_node_item != nullptr) && (is_hccl_op || src_node_item->IsHcclOp())) { + GELOGD("[%s](%s) Add input control dependent node [%s](%s)", + ge_node->GetName().c_str(), + ge_node->GetType().c_str(), + src_node->GetName().c_str(), + src_node->GetType().c_str()); + dependent_for_execution.emplace(src_node); + } + } + // cond or branch need to be prepared before the execution of IF or CASE if (node_item.node_type == IF || node_item.node_type == STATELESSIF || node_item.node_type == CASE) { auto src_node = NodeUtils::GetInDataNodeByIndex(*ge_node, 0); // cond input @@ -1528,14 +1540,22 @@ Status HybridModelBuilder::IdentifyVariableOutputs(NodeItem &node_item) { in_data_anchor->GetIdx(), src_node->GetName().c_str(), src_op_type.c_str()); + uint32_t parent_index = 0; + if (GetParentNodeOutputIndex(*net_output_desc, in_data_anchor->GetIdx(), parent_index) != SUCCESS) { + continue; + } + GELOGD("Got parent output index = %u", parent_index); + if (src_op_type == DATA) { + int ref_i = 0; + (void)AttrUtils::GetInt(src_node->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, ref_i); + node_item.reuse_inputs.emplace(static_cast<int>(parent_index), ref_i); + GELOGD("[%s] output[%u] reuses input[%d]", node_item.NodeName().c_str(), parent_index, ref_i); + } if (src_op_type != CONSTANTOP && src_op_type != CONSTANT && src_op_type != VARIABLE) { continue; } - uint32_t parent_index = 0; - GE_CHK_STATUS_RET_NOLOG(GetParentNodeOutputIndex(*net_output_desc, in_data_anchor->GetIdx(), parent_index)); - GELOGD("Got parent output index = 
%u", parent_index); GE_CHECK_LE(parent_index, INT32_MAX); node_item.ref_outputs.emplace(static_cast(parent_index), src_node); if (src_op_type == CONSTANTOP || src_op_type == CONSTANT) { diff --git a/ge/hybrid/node_executor/aicore/aicore_node_executor.cc b/ge/hybrid/node_executor/aicore/aicore_node_executor.cc index a4fc4449..29ae831c 100755 --- a/ge/hybrid/node_executor/aicore/aicore_node_executor.cc +++ b/ge/hybrid/node_executor/aicore/aicore_node_executor.cc @@ -208,6 +208,8 @@ Status AiCoreNodeTask::ExecuteAsync(TaskContext &context, std::function REPORT_CALL_ERROR("E19999", "rtGetTaskIdAndStreamID failed, ret: 0x%X.", rt_ret); return RT_ERROR_TO_GE_STATUS(rt_ret); } + context.SetTaskId(task_id); + context.SetStreamId(stream_id); GELOGD("Aicore node[%s] task_id: %u, stream_id: %u.", context.GetNodeName(), task_id, stream_id); (void)context.SaveProfilingTaskDescInfo(task_id, stream_id, kTaskTypeAicore, (*it)->GetBlockDim()); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] End"); diff --git a/ge/hybrid/node_executor/aicore/aicore_op_task.cc b/ge/hybrid/node_executor/aicore/aicore_op_task.cc index 06340119..8bb871fb 100644 --- a/ge/hybrid/node_executor/aicore/aicore_op_task.cc +++ b/ge/hybrid/node_executor/aicore/aicore_op_task.cc @@ -75,7 +75,6 @@ Status AiCoreOpTask::Init(const OpDesc &op_desc, const domi::TaskDef &task_def) output_indices_to_skip_.push_back(i); } } - GELOGI("[TASK_INFO] %lu/%s.", log_id_, log_name_.c_str()); return SUCCESS; } @@ -228,19 +227,19 @@ Status AiCoreOpTask::InitWithKernelDef(const OpDesc &op_desc, const domi::TaskDe } const auto *args_offset_buffer = reinterpret_cast(context.args_offset().data()); - uint32_t offset = *args_offset_buffer; - if (offset > args_size_) { + offset_ = *args_offset_buffer; + if (offset_ > args_size_) { GELOGE(INTERNAL_ERROR, "[Check][Offset][%s] Arg offset out of range. offset = %u," - "arg size = %u , op:%s op_type:%s", GetName().c_str(), offset, args_size_, + "arg size = %u , op:%s op_type:%s", GetName().c_str(), offset_, args_size_, op_desc.GetName().c_str(), op_desc.GetType().c_str()); REPORT_INNER_ERROR("E19999", "[%s] Arg offset out of range. offset = %u, arg size = %u" - "op:%s op_type:%s", GetName().c_str(), offset, args_size_, + "op:%s op_type:%s", GetName().c_str(), offset_, args_size_, op_desc.GetName().c_str(), op_desc.GetType().c_str()); return INTERNAL_ERROR; } - arg_base_ = reinterpret_cast(args_.get() + offset); - max_arg_count_ = (args_size_ - offset) / sizeof(void *); + arg_base_ = reinterpret_cast(args_.get() + offset_); + max_arg_count_ = (args_size_ - offset_) / sizeof(void *); GELOGD("[%s] Done setting kernel args successfully. stub_func = %s, block_dim = %d," "arg base = %p, arg size = %u", op_desc.GetName().c_str(), stub_name_.c_str(), @@ -289,19 +288,19 @@ Status AiCoreOpTask::InitWithKernelDefWithHandle(const OpDesc &op_desc, const do } const auto *args_offset_buffer = reinterpret_cast(context.args_offset().data()); - uint32_t offset = *args_offset_buffer; - if (offset > args_size_) { + offset_ = *args_offset_buffer; + if (offset_ > args_size_) { GELOGE(INTERNAL_ERROR, "[Check][Offset][%s] Arg offset out of range. offset = %u, arg size = %u" - "op:%s op_type:%s", GetName().c_str(), offset, args_size_, + "op:%s op_type:%s", GetName().c_str(), offset_, args_size_, op_desc.GetName().c_str(), op_desc.GetType().c_str()); REPORT_INNER_ERROR("E19999", "[%s] Arg offset out of range. 
offset = %u, arg size = %u" - "op:%s op_type:%s", GetName().c_str(), offset, args_size_, + "op:%s op_type:%s", GetName().c_str(), offset_, args_size_, op_desc.GetName().c_str(), op_desc.GetType().c_str()); return INTERNAL_ERROR; } - arg_base_ = reinterpret_cast<uintptr_t *>(args_.get() + offset); - max_arg_count_ = (args_size_ - offset) / sizeof(void *); + arg_base_ = reinterpret_cast<uintptr_t *>(args_.get() + offset_); + max_arg_count_ = (args_size_ - offset_) / sizeof(void *); return SUCCESS; } @@ -428,14 +427,20 @@ Status AiCoreOpTask::UpdateArgs(TaskContext &task_context) { ++expected_arg_count; } if (expected_arg_count > max_arg_count_) { - GELOGE(INTERNAL_ERROR, - "[Check][arg_count][%s] Invalid arg memory, max arg count = %u, but expect = %zu", - GetName().c_str(), - max_arg_count_, - expected_arg_count); - REPORT_INNER_ERROR("E19999", "[%s] Invalid arg memory, max arg count = %u, but expect = %zu", - GetName().c_str(), max_arg_count_, expected_arg_count); - return INTERNAL_ERROR; + GELOGD("Need to reset size of args_ from %u to %zu.", max_arg_count_, expected_arg_count); + auto length = expected_arg_count * sizeof(uintptr_t) + offset_; + std::unique_ptr<uint8_t[]> new_args(new(std::nothrow) uint8_t[length]); + GE_CHECK_NOTNULL(new_args); + if (memcpy_s(new_args.get(), length, args_.get(), offset_) != EOK) { + GELOGE(ACL_ERROR_GE_MEMORY_OPERATE_FAILED, "[Update][new_args]failed, dst length is %zu, src length is %u.", + length, offset_); + REPORT_INNER_ERROR("E19999", "update kernel args failed of %s.", task_context.GetNodeName()); + return ACL_ERROR_GE_MEMORY_OPERATE_FAILED; + } + args_ = std::move(new_args); + max_arg_count_ = static_cast<uint32_t>(expected_arg_count); + args_size_ = static_cast<uint32_t>(length); + arg_base_ = reinterpret_cast<uintptr_t *>(args_.get() + offset_); } int index = 0; @@ -492,6 +497,7 @@ Status AiCoreOpTask::LaunchKernel(rtStream_t stream) { GE_CHK_RT_RET(rtKernelLaunch(stub_func_, block_dim_, args_.get(), args_size_, nullptr, stream)); GELOGD("AiCoreOpTask LaunchKernel End (task = %s, block_dim = %u).", stub_name_.c_str(), block_dim_); } + GELOGI("[TASK_INFO] %lu/%s", log_id_, log_name_.c_str()); return SUCCESS; } diff --git a/ge/hybrid/node_executor/aicore/aicore_op_task.h b/ge/hybrid/node_executor/aicore/aicore_op_task.h index fe18bfd0..8d7b7f1e 100755 --- a/ge/hybrid/node_executor/aicore/aicore_op_task.h +++ b/ge/hybrid/node_executor/aicore/aicore_op_task.h @@ -116,6 +116,7 @@ class AiCoreOpTask { bool is_dynamic_ = false; uint64_t log_id_ = 0; std::string log_name_; + uint32_t offset_ = 0; }; class AtomicAddrCleanOpTask : public AiCoreOpTask { diff --git a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc index 339e1ee4..c2ebf654 100755 --- a/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc +++ b/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc @@ -208,6 +208,8 @@ Status AicpuNodeTaskBase::ExecuteAsync(TaskContext &context, std::function done_callback) { +Status KnownNodeTask::ExecuteAsync(TaskContext &context, std::function done_callback) { RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeTaskExecuteAsync] Start"); GELOGD("[%s] KnownNodeTask::ExecuteAsync in.", context.GetNodeName()); if (davinci_model_->GetTaskList().empty()) { @@ -56,7 +56,9 @@ Status KnownNodeTask:: ExecuteAsync(TaskContext &context, std::functionGetRtModelHandle(), context.GetStream(), 0); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, - GELOGE(rt_ret, "rtModelExecute error, ret: 0x%X", rt_ret); return FAILED;); + 
REPORT_CALL_ERROR("E19999", "rtModelExecute error, ret:Ox%X", rt_ret); + GELOGE(rt_ret, "[Invoke][rtModelExecute] error, ret:Ox%X", rt_ret); + return FAILED;); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodertModelExecute] End"); GE_CHK_STATUS_RET_NOLOG(context.RegisterCallback(done_callback)); @@ -87,7 +89,7 @@ Status KnownNodeTask::UpdateArgs(TaskContext &context) { } GE_CHK_STATUS_RET(davinci_model_->UpdateKnownNodeArgs(inputs, outputs), - "known node task update known node args failed."); + "[Update][KnownNodeArgs] failed for %s.", context.GetNodeName()); GELOGD("[%s] KnownNodeExecutor::UpdateArgs success, task_size = %zu", context.GetNodeName(), davinci_model_->GetTaskList().size()); return SUCCESS; @@ -95,15 +97,15 @@ Status KnownNodeTask::UpdateArgs(TaskContext &context) { Status KnownNodeTask::Init(TaskContext &context) { // allocate output mem - GE_CHK_STATUS_RET(context.AllocateOutputs(), "known node task allocate output failed."); + GE_CHK_STATUS_RET(context.AllocateOutputs(), "[Allocate][Outputs] failed for %s.", context.GetNodeName()); // allocate mem base void *buffer = nullptr; if (davinci_model_->TotalMemSize() != 0) { RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeTask_AllocateWorkspace] Start"); - GE_CHK_STATUS_RET( - context.AllocateWorkspace(davinci_model_->TotalMemSize(), &buffer, davinci_model_->GetRuntimeParam().mem_base), - "known node task allocate workspace failed."); + GE_CHK_STATUS_RET(context.AllocateWorkspace(davinci_model_->TotalMemSize(), &buffer, + davinci_model_->GetRuntimeParam().mem_base), + "[Allocate][Workspace] failed for %s.", context.GetNodeName()); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeTask_AllocateWorkspace] End, size %zu", davinci_model_->TotalMemSize()); // update mem base @@ -112,8 +114,18 @@ Status KnownNodeTask::Init(TaskContext &context) { davinci_model_->GetRuntimeParam().mem_base, davinci_model_->GetRuntimeParam().mem_size); } GE_CHK_STATUS_RET(ModelManager::GetInstance()->DestroyAicpuKernel(davinci_model_->GetSessionId(), - davinci_model_->Id(), davinci_model_->SubModelId()), - "KnownNodeTask::Init destroy aicpu kernel failed."); + davinci_model_->Id(), + davinci_model_->SubModelId()), + "[Destroy][AicpuKernel] failed, session_id:%lu, model_id:%u, sub_model_id:%u", + davinci_model_->GetSessionId(), davinci_model_->Id(), davinci_model_->SubModelId()); + if (!load_flag_) { + auto execution_context = const_cast(context.GetExecutionContext()); + GE_CHECK_NOTNULL(execution_context); + auto &davinci_model = execution_context->davinci_model; + davinci_model.emplace_back(davinci_model_); + load_flag_ = true; + } + GELOGI("[%s] KnownNodeExecutor::Init success.", context.GetNodeName()); return SUCCESS; } @@ -121,7 +133,8 @@ Status KnownNodeTask::Init(TaskContext &context) { Status KnownNodeTask::InitDavinciModel(const HybridModel &model, TensorBuffer *weight_buffer) { GELOGD("[Init][DavinciModel] start"); davinci_model_->InitRuntimeParams(); - GE_CHK_STATUS_RET(davinci_model_->InitVariableMem(), "init variable mem failed"); + GE_CHK_STATUS_RET(davinci_model_->InitVariableMem(), + "[Init][VariableMem] failed"); int32_t device_id = 0; GE_CHK_RT_RET(rtGetDevice(&device_id)); davinci_model_->SetDeviceId(static_cast(device_id)); @@ -153,11 +166,13 @@ Status KnownNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) cons GELOGD("[%s] KnownNodeExecutor::PrepareTask in.", context.GetNodeName()); 
RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorPrepareTask] Start"); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorTaskInit] Start"); - GE_CHK_STATUS_RET(task.Init(context), "known node init davinci model failed."); + GE_CHK_STATUS_RET(task.Init(context), "[Invoke][Init] %s known node init davinci model failed.", + context.GetNodeName()); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorTaskInit] End"); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorUpdateArgs] Start"); - GE_CHK_STATUS_RET(task.UpdateArgs(context), "known node task update args failed."); + GE_CHK_STATUS_RET(task.UpdateArgs(context), "[Invoke][UpdateArgs] %s known node task update args failed.", + context.GetNodeName()); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorUpdateArgs] End"); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorPrepareTask] End"); GELOGD("[%s] KnownNodeExecutor::PrepareTask success.", context.GetNodeName()); @@ -188,7 +203,9 @@ Status KnownNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node davinci_model->SetSubModelId(node->GetOpDesc()->GetId()); GELOGD("KnownNodeExecutor::LoadTask node id %ld.", node->GetOpDesc()->GetId()); - GE_CHK_STATUS_RET(davinci_model->Assign(ge_model), "KnownNodeExecutor::LoadTask davincimodel assign failed."); + GE_CHK_STATUS_RET(davinci_model->Assign(ge_model), + "[Invoke][Assign]KnownNodeExecutor::LoadTask davincimodel assign failed for node:%s.", + node->GetName().c_str()); auto known_node_task = MakeShared(davinci_model); GE_CHECK_NOTNULL(known_node_task); @@ -201,8 +218,7 @@ Status KnownNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node Status KnownNodeExecutor::ExecuteTask(NodeTask &task, TaskContext &context, const std::function &callback) const { RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorExecuteTask] Start"); - GE_CHK_STATUS_RET(task.ExecuteAsync(context, callback), - "Failed to execute task. node = %s", + GE_CHK_STATUS_RET(task.ExecuteAsync(context, callback), "[Invoke][ExecuteAsync]Failed to execute task. 
node = %s", context.GetNodeItem().NodeName().c_str()); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[KnownNodeExecutorExecuteTask] End"); return SUCCESS; diff --git a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h index 26141b5a..629cb543 100644 --- a/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h +++ b/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.h @@ -42,6 +42,7 @@ class KnownNodeTask : public NodeTask { virtual Status DoInitDavinciModel(void *weight, size_t weight_size); private: std::shared_ptr davinci_model_ = nullptr; + bool load_flag_ = false; }; class KnownNodeExecutor : public NodeExecutor { diff --git a/ge/hybrid/node_executor/controlop/control_op_executor.cc b/ge/hybrid/node_executor/controlop/control_op_executor.cc index 4e7e71f1..df7da661 100644 --- a/ge/hybrid/node_executor/controlop/control_op_executor.cc +++ b/ge/hybrid/node_executor/controlop/control_op_executor.cc @@ -43,8 +43,7 @@ Status ControlOpNodeTask::ExecuteSubgraph(const GraphItem *subgraph, auto executor = MakeShared(subgraph, execution_context); GE_CHECK_NOTNULL(executor); GE_CHK_STATUS_RET(executor->ExecuteAsync(task_context), - "[%s] Failed to execute partitioned call.", - subgraph->GetName().c_str()); + "[Invoke][ExecuteAsync][%s] Failed to execute partitioned call.", subgraph->GetName().c_str()); auto callback = [executor, done_callback]() mutable { if (done_callback != nullptr) { @@ -127,7 +126,7 @@ Status IfOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::functi auto cond_tensor = task_context.GetInput(kIfCondIndex); GE_CHECK_NOTNULL(cond_tensor); GE_CHK_STATUS_RET(ToBool(*cond_tensor, data_type, cond_val), - "[%s] Failed to get cond value.", + "[Invoke][ToBool][%s] Failed to get cond value.", task_context.GetNodeName()); } else { // true if num elements is non-zero @@ -141,9 +140,7 @@ Status IfOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::functi auto subgraph = cond_val ? then_ : else_; GELOGD("[%s] Taking subgraph [%s] by cond = [%d]", task_context.GetNodeName(), subgraph->GetName().c_str(), cond_val); GE_CHK_STATUS_RET(ExecuteSubgraph(subgraph, task_context, done_callback), - "[%s] Failed to execute subgraph. cond = %d", - task_context.GetNodeName(), - cond_val); + "[Execute][Subgraph] failed for [%s]. cond = %d", task_context.GetNodeName(), cond_val); GELOGD("[%s] Done executing with cond = %d successfully.", task_context.GetNodeName(), cond_val); return SUCCESS; @@ -201,8 +198,7 @@ Status CaseOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::func } GE_CHK_STATUS_RET(ExecuteSubgraph(subgraph, task_context, done_callback), - "[%s] Failed to execute else-subgraph.", - task_context.GetNodeName()); + "[Execute][Subgraph] failed for [%s].", task_context.GetNodeName()); GELOGD("[%s] Done executing subgraph[%d] successfully.", task_context.GetNodeName(), branch_index); return SUCCESS; @@ -228,18 +224,18 @@ Status WhileOpNodeTask::Init(const NodePtr &node, const HybridModel &model) { Status WhileOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::function &done_callback) const { if (task_context.NumInputs() != task_context.NumOutputs()) { + REPORT_INNER_ERROR("E19999", + "[%s] Invalid while args. num_inputs = %d not equal num_outputs = %d", + task_context.GetNodeName(), task_context.NumInputs(), task_context.NumOutputs()); GELOGE(INTERNAL_ERROR, - "[%s] Invalid while args. 
num_inputs = %d, num_outputs = %d", - task_context.GetNodeName(), - task_context.NumInputs(), - task_context.NumOutputs()); + "[Check][Param:task_context][%s] Invalid while args. num_inputs = %d, num_outputs = %d", + task_context.GetNodeName(), task_context.NumInputs(), task_context.NumOutputs()); return INTERNAL_ERROR; } bool is_continue = false; GE_CHK_STATUS_RET(ExecuteCond(task_context, is_continue), - "[%s] Failed to execute cond-subgraph", - task_context.GetNodeName()); + "[Execute][Cond] failed for [%s]", task_context.GetNodeName()); if (!is_continue) { for (int i = 0; i < task_context.NumInputs(); ++i) { auto input_tensor = task_context.GetInput(i); @@ -269,9 +265,8 @@ Status WhileOpNodeTask::DoExecuteAsync(TaskContext &task_context, const std::fun ++iteration; GELOGD("[%s] Start to execute, iteration = %d", task_context.GetNodeName(), iteration); GE_CHK_STATUS_RET(ExecuteOneLoop(task_context, is_continue), - "[%s] Failed to execute iteration %d.", - task_context.GetNodeName(), - iteration); + "[Invoke][ExecuteOneLoop][%s] Failed to execute iteration %d.", + task_context.GetNodeName(), iteration); } GELOGD("[%s] Quit from loop. current iteration = %d", task_context.GetNodeName(), iteration); if (done_callback) { @@ -299,24 +294,27 @@ Status WhileOpNodeTask::ExecuteCond(TaskContext &task_context, bool &is_continue auto executor = MakeShared(cond_, execution_context, task_context.IsForceInferShape()); GE_CHECK_NOTNULL(executor); GELOGD("[%s] Start to execute cond-subgraph.", task_context.GetNodeName()); - GE_CHK_STATUS_RET(executor->ExecuteAsync(inputs, input_desc), "Failed to execute partitioned call."); + GE_CHK_STATUS_RET(executor->ExecuteAsync(inputs, input_desc), + "[Invoke][ExecuteAsync] %s Failed to execute partitioned call.", task_context.GetNodeName()); GELOGD("[%s] Done executing cond-subgraph successfully.", cond_->GetName().c_str()); GE_CHK_STATUS_RET_NOLOG(task_context.RegisterCallback([executor]() mutable { executor.reset(); })); // get cond output - GE_CHK_STATUS_RET(executor->Synchronize(), "[%s] Failed to sync cond-subgraph result.", cond_->GetName().c_str()); + GE_CHK_STATUS_RET(executor->Synchronize(), + "[Invoke][Synchronize][%s] Failed to sync cond-subgraph result.", cond_->GetName().c_str()); std::vector cond_outputs; std::vector cond_output_desc_list; GE_CHK_STATUS_RET(executor->GetOutputs(cond_outputs, cond_output_desc_list), - "[%s] Failed to get cond-output.", - cond_->GetName().c_str()); + "[Invoke][GetOutputs][%s] Failed to get cond-output.", cond_->GetName().c_str()); if (cond_outputs.size() != kCondOutputSize || cond_output_desc_list.size() != kCondOutputSize) { + REPORT_INNER_ERROR("E19999", "[%s] Number of cond outputs(%zu) or size of cond output desc(%zu)" + "not equal %zu, check invalid", task_context.GetNodeName(), cond_outputs.size(), + cond_output_desc_list.size(), kCondOutputSize); GELOGE(INTERNAL_ERROR, - "[%s] Number of cond outputs is invalid. 
number = %zu", - task_context.GetNodeName(), - cond_outputs.size()); + "[Check][Size][%s] Number of cond outputs(%zu) or Number of cond output desc(%zu) not equal %zu", + task_context.GetNodeName(), cond_outputs.size(), cond_output_desc_list.size(), kCondOutputSize); return INTERNAL_ERROR; } @@ -325,8 +323,7 @@ Status WhileOpNodeTask::ExecuteCond(TaskContext &task_context, bool &is_continue if (shape.IsScalar()) { auto data_type = cond_tensor_desc->GetDataType(); GE_CHK_STATUS_RET(ToBool(cond_outputs[0], data_type, is_continue), - "[%s] Failed to get cond value.", - task_context.GetNodeName()); + "[Invoke][ToBool][%s] Failed to get cond value.", task_context.GetNodeName()); } else { // true if num elements is non-zero is_continue = shape.GetShapeSize() > 0; @@ -367,17 +364,15 @@ Status WhileOpNodeTask::MoveOutputs2Inputs(TaskContext &task_context) { Status WhileOpNodeTask::ExecuteOneLoop(TaskContext &task_context, bool &is_continue) const { GELOGD("[%s] Start to execute body-subgraph.", task_context.GetNodeName()); GE_CHK_STATUS_RET(ExecuteSubgraph(body_, task_context, nullptr), - "[%s] Failed to execute cond-subgraph", task_context.GetNodeName()); + "[Execute][Subgraph] failed for [%s]", task_context.GetNodeName()); GELOGD("[%s] Done executing body-subgraph successfully.", task_context.GetNodeName()); // set outputs to inputs for next iteration GE_CHK_STATUS_RET(MoveOutputs2Inputs(task_context), - "[%s] Failed to move outputs to inputs", - task_context.GetNodeName()); + "[Move][Outputs2Inputs] failed for [%s]", task_context.GetNodeName()); GE_CHK_STATUS_RET(ExecuteCond(task_context, is_continue), - "[%s] Failed to execute cond-subgraph", - task_context.GetNodeName()); + "[Invoke][ExecuteCond][%s] Failed to execute cond-subgraph", task_context.GetNodeName()); if (!is_continue) { for (int i = 0; i < task_context.NumInputs(); ++i) { @@ -404,12 +399,14 @@ Status ControlOpNodeExecutor::LoadTask(const HybridModel &model, } else if (node_type == WHILE || node_type == STATELESSWHILE) { node_task.reset(new(std::nothrow) WhileOpNodeTask()); } else { - GELOGE(PARAM_INVALID, "[%s] Unsupported type: %s", node->GetName().c_str(), node_type.c_str()); + REPORT_INNER_ERROR("E19999", "[%s] Unsupported type: %s", node->GetName().c_str(), node_type.c_str()); + GELOGE(PARAM_INVALID, "[Check][NodeType][%s] Unsupported type: %s", node->GetName().c_str(), node_type.c_str()); return PARAM_INVALID; } GE_CHECK_NOTNULL(node_task); - GE_CHK_STATUS_RET(node_task->Init(node, model), "[%s] Failed to init ControlOpNodeTask.", node->GetName().c_str()); + GE_CHK_STATUS_RET(node_task->Init(node, model), + "[Invoke][Init][%s] Failed to init ControlOpNodeTask.", node->GetName().c_str()); task = std::move(node_task); return SUCCESS; diff --git a/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc b/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc index 9d92420e..43a4ca84 100755 --- a/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc +++ b/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc @@ -47,7 +47,9 @@ Status RefInputTask::UpdateArgs(TaskContext &) { Status RefInputTask::Execute(TaskContext &context) { auto iter = out_ref_input_index_.find(node_type_); if (iter == out_ref_input_index_.end()) { - GELOGE(UNSUPPORTED, "node %s type %s can not use RefInputTask.", + REPORT_INNER_ERROR("E19999", "node %s type %s can not use RefInputTask.", + node_name_.c_str(), node_type_.c_str()); + GELOGE(UNSUPPORTED, "[Find][Node]node %s type %s can not use RefInputTask.", node_name_.c_str(), 
node_type_.c_str()); return UNSUPPORTED; } @@ -65,7 +67,9 @@ Status RefInputTask::RefOneByOne(TaskContext &context) { int input_num = context.NumInputs(); int output_num = context.NumOutputs(); if (output_num > input_num) { - GELOGE(INTERNAL_ERROR, "node %s type %s has %d outputs but only %d inputs, can't ref one by one.", + REPORT_INNER_ERROR("E19999", "node %s type %s has %d outputs but only %d inputs, can't ref one by one.", + node_name_.c_str(), node_type_.c_str(), output_num, input_num); + GELOGE(INTERNAL_ERROR, "[Check][Size]node %s type %s has %d outputs but only %d inputs, can't ref one by one.", node_name_.c_str(), node_type_.c_str(), output_num, input_num); return INTERNAL_ERROR; } @@ -84,7 +88,9 @@ Status RefInputTask::RefByOrder(const std::vector &ref_order, TaskCont GELOGI("node %s type %s ref input by order begin.", node_name_.c_str(), node_type_.c_str()); int32_t output_num = context.NumOutputs(); if (ref_order.size() != static_cast(output_num)) { - GELOGE(INTERNAL_ERROR, "node %s type %s has %d outputs but only has %zu out ref index.", + REPORT_INNER_ERROR("E19999", "node %s type %s has %d outputs but only has %zu out ref index.", + node_name_.c_str(), node_type_.c_str(), output_num, ref_order.size()); + GELOGE(INTERNAL_ERROR, "[Check][Size]node %s type %s has %d outputs but only has %zu out ref index.", node_name_.c_str(), node_type_.c_str(), output_num, ref_order.size()); return INTERNAL_ERROR; } @@ -102,7 +108,7 @@ Status RefInputTask::RefByOrder(const std::vector &ref_order, TaskCont Status RefInputTask::ExecuteAsync(TaskContext &context, std::function done_callback) { RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[RefInputTaskExecuteAsync] Start"); - GE_CHK_STATUS_RET(Execute(context), "node:%s type:%s ref input task execute failed", + GE_CHK_STATUS_RET(Execute(context), "[Invoke][Execute]node:%s type:%s ref input task execute failed", node_name_.c_str(), node_type_.c_str()); if (done_callback != nullptr) { // host cpu no need register callback, call it directly. @@ -126,20 +132,26 @@ Status DependInputShapeTask::Execute(TaskContext &context) { std::string node_type = node_->GetType(); auto kernel = factory.Create(node_type); if (kernel == nullptr) { - GELOGE(UNSUPPORTED, "node %s type %s is not supported by host kernel.", + REPORT_CALL_ERROR("E19999", "create failed for node %s type %s is not supported by host kernel.", + node_->GetName().c_str(), node_type.c_str()); + GELOGE(UNSUPPORTED, "[Invoke][Create]node %s type %s is not supported by host kernel.", node_->GetName().c_str(), node_type.c_str()); return UNSUPPORTED; } std::vector outputs; Status compute_ret = kernel->Compute(node_, outputs); if (compute_ret != SUCCESS) { - GELOGE(compute_ret, "node %s type %s compute failed or not imply.", + REPORT_CALL_ERROR("E19999", "node %s type %s compute failed.", node_->GetName().c_str(), node_type.c_str()); + GELOGE(compute_ret, "[Invoke][Compute]node %s type %s compute failed or not imply.", node_->GetName().c_str(), node_type.c_str()); return compute_ret; } int32_t output_num = context.NumOutputs(); if (static_cast(output_num) != outputs.size()) { - GELOGE(INTERNAL_ERROR, "node %s type %s has %d output, but kernel compute only has %zu output.", + REPORT_INNER_ERROR("E19999", "node %s type %s has %d output," + "but kernel compute only has %zu output. 
check invalid", + node_->GetName().c_str(), node_type.c_str(), output_num, outputs.size()); + GELOGE(INTERNAL_ERROR, "[Check][Size]node %s type %s has %d output, but kernel compute only has %zu output.", node_->GetName().c_str(), node_type.c_str(), output_num, outputs.size()); return INTERNAL_ERROR; } @@ -155,7 +167,11 @@ Status DependInputShapeTask::Execute(TaskContext &context) { auto tensor_value = context.MutableOutput(i); GE_CHECK_NOTNULL(tensor_value); if (tensor_data.GetSize() > tensor_value->GetSize()) { - GELOGE(INTERNAL_ERROR, "node:%s type:%s [%d]th compute data size=%zu, but context data size=%zu.", + REPORT_INNER_ERROR("E19999", "node:%s type:%s [%d]th compute data size=%zu, but context data size=%zu." + "check invalid", + node_->GetName().c_str(), node_type.c_str(), i, + tensor_data.GetSize(), tensor_value->GetSize()); + GELOGE(INTERNAL_ERROR, "[Check][Size]node:%s type:%s [%d]th compute data size=%zu, but context data size=%zu.", node_->GetName().c_str(), node_type.c_str(), i, tensor_data.GetSize(), tensor_value->GetSize()); return INTERNAL_ERROR; } @@ -180,7 +196,7 @@ Status DependInputShapeTask::Execute(TaskContext &context) { Status DependInputShapeTask::ExecuteAsync(TaskContext &context, std::function done_callback) { RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[DependInputShapeTaskExecuteAsync] Start"); - GE_CHK_STATUS_RET(Execute(context), "node:%s type:%s depend input shape task execute failed", + GE_CHK_STATUS_RET(Execute(context), "[Invoke][Execute]node:%s type:%s depend input shape task execute failed", node_->GetName().c_str(), node_->GetType().c_str()); if (done_callback != nullptr) { // host cpu no need register callback, call it directly. @@ -213,7 +229,8 @@ Status GeLocalNodeExecutor::LoadTask(const HybridModel &model, node->GetName().c_str(), node_type.c_str()); task = MakeShared(node); if (task == nullptr) { - GELOGE(MEMALLOC_FAILED, "create RefInputTask for node %s failed.", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "Create RefInputTask failed for node %s.", node->GetName().c_str()); + GELOGE(MEMALLOC_FAILED, "[Create][RefInputTask] failed for node %s.", node->GetName().c_str()); return MEMALLOC_FAILED; } } else if (DependInputShapeTask::IsBelong(node_type)) { @@ -221,7 +238,9 @@ Status GeLocalNodeExecutor::LoadTask(const HybridModel &model, node->GetName().c_str(), node_type.c_str()); task = MakeShared(node); if (task == nullptr) { - GELOGE(MEMALLOC_FAILED, "create DependInputShapeTask for node %s type %s failed.", + REPORT_CALL_ERROR("E19999", "Create DependInputShapeTask failed for node %s type %s.", + node->GetName().c_str(), node_type.c_str()); + GELOGE(MEMALLOC_FAILED, "[Create][DependInputShapeTask]failed for node %s type %s.", node->GetName().c_str(), node_type.c_str()); return MEMALLOC_FAILED; } @@ -229,7 +248,8 @@ Status GeLocalNodeExecutor::LoadTask(const HybridModel &model, GELOGI("node %s type %s, use ConstantNodeTask.", node->GetName().c_str(), node_type.c_str()); auto tensor = model.GetTensor(node); if (tensor == nullptr) { - GELOGE(INTERNAL_ERROR, "Failed to get tensor by name: %s", node->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "GetTensor failed for name: %s", node->GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Get][Tensor] failed for name: %s", node->GetName().c_str()); return INTERNAL_ERROR; } @@ -251,7 +271,7 @@ Status ConstantNodeTask::UpdateArgs(TaskContext &context) { Status ConstantNodeTask::ExecuteAsync(TaskContext &context, std::function done_callback) { GELOGD("[%s] Start 
execute.", context.GetNodeName()); - GE_CHK_STATUS_RET(context.SetOutput(0, *tensor_), "[%s] Failed to set output.", context.GetNodeName()); + GE_CHK_STATUS_RET(context.SetOutput(0, *tensor_), "[Set][Output] failed for [%s].", context.GetNodeName()); if (done_callback) { GELOGD("[%s] Start invoke callback.", context.GetNodeName()); done_callback(); diff --git a/ge/hybrid/node_executor/hccl/hccl_node_executor.cc b/ge/hybrid/node_executor/hccl/hccl_node_executor.cc index 48b5fe9a..20684194 100644 --- a/ge/hybrid/node_executor/hccl/hccl_node_executor.cc +++ b/ge/hybrid/node_executor/hccl/hccl_node_executor.cc @@ -43,13 +43,15 @@ REGISTER_NODE_EXECUTOR_BUILDER(NodeExecutorManager::ExecutorType::HCCL, HcclNode Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function done_callback) { GELOGI("[%s] HcclNodeTask::ExecuteAsync in.", context.GetNodeName()); if (context.handle_ == nullptr) { - GELOGE(FAILED, "hccl handle is nullptr! "); + REPORT_INNER_ERROR("E19999", " %s invalid, hccl handle is nullptr!", context.GetNodeName()); + GELOGE(FAILED, "[Check][Param:context] %s hccl handle is nullptr!", context.GetNodeName()); return FAILED; } auto HcomExecEnqueueOperation = (HcclResult(*)(HcomOperation, std::function))dlsym( context.handle_, "HcomExecEnqueueOperation"); if (HcomExecEnqueueOperation == nullptr) { - GELOGE(FAILED, "Failed to invoke HcomExecEnqueueOperation hcom unknown node function."); + GELOGE(FAILED, "[Invoke][HcomExecEnqueueOperation] failed for %s hcom unknown node function.", + context.GetNodeName()); if (dlclose(context.handle_) != 0) { GELOGW("Failed to close handle %s", dlerror()); } @@ -83,24 +85,35 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do ge::DataType src_data_type = input_desc->GetDataType(); auto iter = kConstOpHcclDataType.find(static_cast(src_data_type)); if (iter == kConstOpHcclDataType.end()) { - GELOGE(PARAM_INVALID, "kConstOpHcclDataType find failed."); + REPORT_INNER_ERROR("E19999", "%s inputdesc0 datatype:%s not support.", + op_desc->GetName().c_str(), + TypeUtils::DataTypeToSerialString(src_data_type).c_str()); + GELOGE(PARAM_INVALID, "[Find][DataType]%s inputdesc0 datatype:%s not support.", + op_desc->GetName().c_str(), + TypeUtils::DataTypeToSerialString(src_data_type).c_str()); return PARAM_INVALID; } op_info.dataType = iter->second; HcclReduceOp op_type = HCCL_REDUCE_SUM; if (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HCOMREDUCESCATTER || op_desc->GetType() == HVDCALLBACKALLREDUCE || op_desc->GetType() == HCOMREDUCE) { - GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclOperationType(op_desc, op_type), "GetHcclOperationType failed"); + GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclOperationType(op_desc, op_type), + "[Get][HcclOperationType] failed for %s type:%s", op_desc->GetName().c_str(), + op_desc->GetType().c_str()); op_info.opType = op_type; } int64_t root_id = 0; if (op_desc->GetType() == HCOMBROADCAST) { - GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclRootId(op_desc, root_id), "GetHcclRootId failed"); + GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclRootId(op_desc, root_id), + "[Get][HcclRootId] failed for %s type:%s", op_desc->GetName().c_str(), + op_desc->GetType().c_str()); } op_info.root = root_id; auto callback = [op_desc, done_callback](HcclResult status) { if (status != HCCL_SUCCESS) { - GELOGE(HCCL_E_INTERNAL, "node %s call HcomExecEnqueueOperation failed, ret: 0x%X", + REPORT_CALL_ERROR("E19999", "call HcomExecEnqueueOperation failed for node %s, ret: 0x%X", + op_desc->GetName().c_str(), status); + 
GELOGE(HCCL_E_INTERNAL, "[Call][HcomExecEnqueueOperation] failed for node %s, ret: 0x%X", op_desc->GetName().c_str(), status); } @@ -110,14 +123,18 @@ Status HcclNodeTask::ExecuteAsync(TaskContext &context, std::function do int32_t count = 0; GE_CHK_STATUS_RET(HcomOmeUtil::GetHcomCount(op_desc, static_cast(op_info.dataType), op_desc->GetType() == HCOMALLGATHER, count), - "GetHcomCount failed"); + "[Get][HcomCount] failed for %s type:%s", op_desc->GetName().c_str(), + op_desc->GetType().c_str()); GELOGI("[%s] HcclNodeTask::ExecuteAsync hccl_type %s, count %d, data_type %d, op_type %d, root %d.", context.GetNodeName(), op_info.hcclType.c_str(), count, op_info.dataType, op_info.opType, op_info.root); op_info.count = count; HcclResult hccl_ret = HcomExecEnqueueOperation(op_info, callback); if (hccl_ret != HCCL_SUCCESS) { - GELOGE(HCCL_E_INTERNAL, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret); + REPORT_CALL_ERROR("E19999", "Call HcomExecEnqueueOperation failed for node:%s(%s), ret: 0x%X", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), hccl_ret); + GELOGE(HCCL_E_INTERNAL, "[Call][HcomExecEnqueueOperation] failed for node:%s(%s), ret: 0x%X", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), hccl_ret); return HCCL_E_INTERNAL; } @@ -173,13 +190,23 @@ Status RdmaNodeTask::ExtractTensor(TaskContext &context, vectorGetTensor(offset_index_.first, offset_index_.second, offset_tensor)) if (static_cast(offset_tensor.GetSize() / GetSizeByDataType(data_type)) != row_num) { - GELOGE(PARAM_INVALID, "num of offset and remote addr mismatch, offset size=%zu, remote_addr size=%ld, dtype=%s", + REPORT_INNER_ERROR("E19999", "num of offset and remote addr mismatch, check invalid" + "offset size=%zu, remote_addr size=%ld, dtype=%s", offset_tensor.GetSize(), row_num, + TypeUtils::DataTypeToSerialString(data_type).c_str()); + GELOGE(PARAM_INVALID, "[Check][Size]num of offset and remote addr mismatch," + "offset size=%zu, remote_addr size=%ld, dtype=%s", offset_tensor.GetSize(), row_num, TypeUtils::DataTypeToSerialString(data_type).c_str()); return PARAM_INVALID; } @@ -244,7 +275,9 @@ Status RdmaNodeTask::ExtractTensor(TaskContext &context, vector(reinterpret_cast(tv->MutableData())); auto device_len = tv->GetSize() / row_num; if (device_len <= 0 || device_len > data[kVarTableIdxLen]) { - GELOGE(FAILED, "Local embedding length is out of range, expect %ld, but %ld exactly.", + REPORT_INNER_ERROR("E19999", "Local embedding length is out of range, expect %ld, but %ld exactly.", + data[kVarTableIdxLen], device_len); + GELOGE(FAILED, "[Check][Size]Local embedding length is out of range, expect %ld, but %ld exactly.", data[kVarTableIdxLen], device_len); return FAILED; } @@ -267,7 +300,8 @@ Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function do (HcclResult(*)(const string &, const vector &, std::function))dlsym(context.handle_, "HcomExecEnqueueRemoteAccess"); if (HcomExecEnqueueRemoteAccess == nullptr) { - GELOGE(FAILED, "Failed to invoke HcomExecEnqueueRemoteAccess hcom unknown node function."); + GELOGE(FAILED, "[Invoke][HcomExecEnqueueRemoteAccess] failed for node:%s(%s) hcom unknown node function.", + context.GetNodeName(), context.GetNodeItem().NodeType().c_str()); if (dlclose(context.handle_) != 0) { GELOGW("Failed to close handle %s", dlerror()); } @@ -283,7 +317,8 @@ Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function do TaskContext *p_ctx = &context; auto callback = [p_ctx, done_callback](HcclResult status) { if (status != HCCL_SUCCESS) { - 
GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", status); + GELOGE(HCCL_E_INTERNAL, "[Call][HcomExcutorInitialize] failed for node:%s(%s), ret: 0x%X", + p_ctx->GetNodeName(), p_ctx->GetNodeItem().NodeType().c_str(), status); p_ctx->SetStatus(FAILED); } done_callback(); @@ -296,7 +331,8 @@ Status RdmaNodeTask::ExecuteAsync(TaskContext &context, std::function do } HcclResult hccl_ret = HcomExecEnqueueRemoteAccess(context.GetNodeItem().NodeType(), addr_infos, callback); if (hccl_ret != HCCL_SUCCESS) { - GELOGE(HCCL_E_INTERNAL, "Call HcomExcutorInitialize failed, ret: 0x%X", hccl_ret); + GELOGE(HCCL_E_INTERNAL, "[Call][HcomExecEnqueueRemoteAccess] failed for node:%s(%s), ret: 0x%X", + context.GetNodeName(), context.GetNodeItem().NodeType().c_str(), hccl_ret); return HCCL_E_INTERNAL; } @@ -314,13 +350,17 @@ Status HcclNodeTask::Init(TaskContext &context) { Status HcclNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const { GELOGI("[%s] HcclNodeExecutor::PrepareTask in.", context.GetNodeName()); - GE_CHK_STATUS_RET(task.Init(context), "hccl node load hccl so failed."); + GE_CHK_STATUS_RET(task.Init(context), "[Invoke][Init]hccl node %s(%s) load hccl so failed.", + context.GetNodeName(), context.GetNodeItem().NodeType().c_str()); // allocate output mem, output mem or remote read will be calculated when node execute. if (kRdmaReadTypes.count(context.GetNodeItem().NodeType()) == 0) { - GE_CHK_STATUS_RET(context.AllocateOutputs(), "hccl node task allocate output failed."); + GE_CHK_STATUS_RET(context.AllocateOutputs(), + "[Invoke][AllocateOutputs]hccl node %s(%s) task allocate output failed.", + context.GetNodeName(), context.GetNodeItem().NodeType().c_str()); } - GE_CHK_STATUS_RET(task.UpdateArgs(context), "hccl node task update args failed."); + GE_CHK_STATUS_RET(task.UpdateArgs(context), "[Update][Args] failed for hccl node %s(%s).", + context.GetNodeName(), context.GetNodeItem().NodeType().c_str()); GELOGI("[%s] HcclNodeExecutor::PrepareTask success.", context.GetNodeName()); return SUCCESS; } @@ -341,8 +381,9 @@ Status HcclNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node, Status HcclNodeExecutor::ExecuteTask(NodeTask &task, TaskContext &context, const std::function &callback) const { context.handle_ = handle_; - GE_CHK_STATUS_RET(task.ExecuteAsync(context, callback), "Failed to execute task. node = %s", - context.GetNodeItem().NodeName().c_str()); + GE_CHK_STATUS_RET(task.ExecuteAsync(context, callback), + "[Invoke][ExecuteAsync] failed to execute task. node:%s(%s)", + context.GetNodeItem().NodeName().c_str(), context.GetNodeItem().NodeType().c_str()); return SUCCESS; } @@ -359,12 +400,13 @@ Status HcclNodeExecutor::Initialize() { GELOGI("FileName:%s, Path:%s.", file_name.c_str(), canonical_path.c_str()); handle_ = dlopen(canonical_path.c_str(), RTLD_NOW | RTLD_GLOBAL); if (handle_ == nullptr) { - GELOGE(GE_PLGMGR_SO_NOT_EXIST, "Failed in dlopen %s! ", dlerror()); + REPORT_CALL_ERROR("E19999", "Open SoFile %s failed, error:%s! ", canonical_path.c_str(), dlerror()); + GELOGE(GE_PLGMGR_SO_NOT_EXIST, "[Open][SoFile] %s failed, error:%s! 
", canonical_path.c_str(), dlerror()); return FAILED; } auto HcomExecInitialize = (HcclResult(*)())dlsym(handle_, "HcomExecInitialize"); if (HcomExecInitialize == nullptr) { - GELOGE(FAILED, "Failed to invoke HcomExecInitialize hcom unknown node function."); + GELOGE(FAILED, "[Invoke][HcomExecInitialize] Failed for hcom unknown node function."); return FAILED; } HcclResult hccl_ret = HcomExecInitialize(); @@ -373,7 +415,7 @@ Status HcclNodeExecutor::Initialize() { } else if (hccl_ret == HCCL_SUCCESS) { GELOGI("Hcom executor initialize success."); } else { - GELOGE(FAILED, "Call HcomExecInitialize failed, ret: 0x%X", hccl_ret); + GELOGE(FAILED, "[Call][HcomExecInitialize] failed, ret: 0x%X", hccl_ret); return FAILED; } return SUCCESS; @@ -382,12 +424,12 @@ Status HcclNodeExecutor::Initialize() { Status HcclNodeExecutor::Finalize() { auto HcomExecFinalize = (HcclResult(*)())dlsym(handle_, "HcomExecFinalize"); if (HcomExecFinalize == nullptr) { - GELOGE(FAILED, "Failed to invoke HcomExecFinalize hcom unknown node function."); + GELOGE(FAILED, "[Invoke][HcomExecFinalize] failed for hcom unknown node function."); return FAILED; } HcclResult hccl_ret = HcomExecFinalize(); if (hccl_ret != HCCL_SUCCESS) { - GELOGE(FAILED, "Call HcomExecFinalize failed, ret: 0x%X", hccl_ret); + GELOGE(FAILED, "[Call][HcomExecFinalize] failed, ret: 0x%X", hccl_ret); return FAILED; } // dlclose file handle diff --git a/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc b/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc index 0cc635e4..6e8a1eb9 100755 --- a/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc +++ b/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc @@ -33,7 +33,7 @@ Status HostNodeTaskBase::UpdateArgs(TaskContext &) { Status HostNodeTaskBase::ExecuteAsync(TaskContext &context, std::function done_callback) { GELOGD("[%s] Start execute.", context.GetNodeName()); - GE_CHK_STATUS_RET(Execute(context), "node:%s type:%s, task execute failed.", + GE_CHK_STATUS_RET(Execute(context), "[Invoke][Execute] failed for node:%s type:%s.", node_->GetName().c_str(), node_->GetType().c_str()) if (done_callback) { GELOGD("[%s] Start invoke callback.", context.GetNodeName()); @@ -70,7 +70,8 @@ Status CpuKernelNodeTask::Execute(TaskContext &context) { AllocationAttr attr; attr.SetMemType(HOST_DDR); if (context.AllocateOutput(i, output_desc, nullptr, &attr) != SUCCESS) { - GELOGE(FAILED, "node:%s Failed to allocate output %d", context.GetNodeName(), i); + REPORT_CALL_ERROR("E19999", "node:%s Failed to allocate output %d", context.GetNodeName(), i); + GELOGE(FAILED, "[Invoke][AllocateOutput]node:%s Failed to allocate output %d", context.GetNodeName(), i); return FAILED; } auto tensor = context.GetOutput(i); @@ -92,14 +93,18 @@ Status HostCpuNodeTask::Execute(TaskContext &context) { RunContext run_context; auto host_kernel = hybrid::host_cpu::KernelFactory::Instance().CreateKernel(node_); if (host_kernel == nullptr) { - GELOGE(UNSUPPORTED, "node %s type %s is not supported by host kernel.", + REPORT_CALL_ERROR("E19999", "CreateKernel failed for node %s type %s is not supported by host kernel.", + node_->GetName().c_str(), node_->GetType().c_str()); + GELOGE(UNSUPPORTED, "[Create][Kernel]node %s type %s is not supported by host kernel.", node_->GetName().c_str(), node_->GetType().c_str()); return UNSUPPORTED; } Status compute_ret = host_kernel->Compute(context); if (compute_ret != SUCCESS) { - GELOGE(compute_ret, "node %s type %s compute failed or not imply.", + REPORT_CALL_ERROR("E19999", "node 
%s type %s compute failed.", + node_->GetName().c_str(), node_->GetType().c_str()); + GELOGE(compute_ret, "[Invoke][Compute]node %s type %s compute failed or not imply.", node_->GetName().c_str(), node_->GetType().c_str()); return compute_ret; } @@ -131,7 +136,10 @@ Status HostCpuNodeExecutor::LoadTask(const HybridModel &model, const NodePtr &no task = MakeShared(node); GE_CHECK_NOTNULL(task); } else { - GELOGE(UNSUPPORTED, "node %s type %s is not support in HostCpuNodeExecutor now.", name.c_str(), type.c_str()); + REPORT_INNER_ERROR("E19999", "Create NodeTask failed for node %s type %s.", + name.c_str(), type.c_str()); + GELOGE(UNSUPPORTED, "[Create][NodeTask]node %s type %s is not support in HostCpuNodeExecutor now.", + name.c_str(), type.c_str()); return UNSUPPORTED; } return SUCCESS; diff --git a/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc b/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc index d54195d6..370bb286 100644 --- a/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc +++ b/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc @@ -34,7 +34,9 @@ Status AssignKernel::Compute(TaskContext& context) { const auto value_tensor = context.GetInput(kAssignValueInputIndex); GE_CHECK_NOTNULL(value_tensor); if (value_tensor->GetSize() > ref_tensor->GetSize()) { - GELOGE(INTERNAL_ERROR, "[%s] value_input_size=%zu, but ref_input_size=%zu.", + REPORT_INNER_ERROR("E19999", "[%s] value_input_size=%zu bigger than ref_input_size=%zu. check invalid", + node_->GetName().c_str(), value_tensor->GetSize(), ref_tensor->GetSize()); + GELOGE(INTERNAL_ERROR, "[Check][Size][%s] value_input_size=%zu, but ref_input_size=%zu.", node_->GetName().c_str(), value_tensor->GetSize(), ref_tensor->GetSize()); return INTERNAL_ERROR; } @@ -46,7 +48,7 @@ Status AssignKernel::Compute(TaskContext& context) { value_tensor->GetSize(), RT_MEMCPY_HOST_TO_HOST)); } GE_CHK_STATUS_RET(context.SetOutput(kAssignRefOutputIndex, *ref_tensor), - "[%s] Failed to set output.", context.GetNodeName()); + "[Set][Output] failed for[%s].", context.GetNodeName()); GELOGD("[%s] compute success.", node_->GetName().c_str()); return SUCCESS; diff --git a/ge/hybrid/node_executor/host_cpu/kernel/data_kernel.cc b/ge/hybrid/node_executor/host_cpu/kernel/data_kernel.cc index e34f601a..8bf50096 100644 --- a/ge/hybrid/node_executor/host_cpu/kernel/data_kernel.cc +++ b/ge/hybrid/node_executor/host_cpu/kernel/data_kernel.cc @@ -30,7 +30,8 @@ namespace host_cpu { Status DataKernel::Compute(TaskContext& context) { auto input = context.MutableInput(kDataInputIndex); GE_CHECK_NOTNULL(input); - GE_CHK_STATUS_RET(context.SetOutput(kDataOutputIndex, *input), "[%s] Failed to set output.", context.GetNodeName()) + GE_CHK_STATUS_RET(context.SetOutput(kDataOutputIndex, *input), + "[Set][Output] failed for [%s].", context.GetNodeName()) GELOGD("[%s] compute success.", node_->GetName().c_str()); return SUCCESS; } diff --git a/ge/hybrid/node_executor/host_cpu/kernel/random_uniform_kernel.cc b/ge/hybrid/node_executor/host_cpu/kernel/random_uniform_kernel.cc index 52d48821..17692f36 100755 --- a/ge/hybrid/node_executor/host_cpu/kernel/random_uniform_kernel.cc +++ b/ge/hybrid/node_executor/host_cpu/kernel/random_uniform_kernel.cc @@ -36,31 +36,41 @@ Status RandomUniformKernel::Compute(TaskContext& context) { (void)AttrUtils::GetInt(node_->GetOpDesc(), "seed2", seed2); DataType data_type = DT_FLOAT; if (!AttrUtils::GetDataType(node_->GetOpDesc(), kAttrDtype, data_type)) { - GELOGE(PARAM_INVALID, "[%s] get attr dtype failed.", 
node_->GetName().c_str()); + REPORT_CALL_ERROR("E19999", "GetDataType failed for [%s].", node_->GetName().c_str()); + GELOGE(PARAM_INVALID, "[Get][DataType] failed for [%s].", node_->GetName().c_str()); return PARAM_INVALID; } switch (data_type) { case DT_FLOAT16: if (GenerateFP16(node_->GetOpDesc(), seed, seed2, context) != SUCCESS) { - GELOGE(FAILED, "Generate random_distribution failed, data_type=DT_FLOAT"); + GELOGE(FAILED, "[Invoke][GenerateFP16]Generate random_distribution failed for %s, data_type=DT_FLOAT16", + node_->GetName().c_str()); return FAILED; } break; case DT_FLOAT: if (Generate(node_->GetOpDesc(), seed, seed2, context) != SUCCESS) { - GELOGE(FAILED, "Generate random_distribution failed, data_type=DT_FLOAT"); + GELOGE(FAILED, "[Invoke][Generate]Generate random_distribution failed for %s, data_type=DT_FLOAT", + node_->GetName().c_str()); return FAILED; } break; case DT_DOUBLE: if (Generate(node_->GetOpDesc(), seed, seed2, context) != SUCCESS) { - GELOGE(FAILED, "Generate random_distribution failed, data_type=DT_DOUBLE"); + GELOGE(FAILED, "[Invoke][Generate]Generate random_distribution failed for %s, data_type=DT_DOUBLE", + node_->GetName().c_str()); return FAILED; } break; default: - GELOGE(UNSUPPORTED, "Supported DataType is DT_FLOAT16 / DT_FLOAT / DT_DOUBLE, but data_type=%s", - TypeUtils::DataTypeToSerialString(data_type).c_str()); + REPORT_INNER_ERROR("E19999", "[Check][DataType]Supported DataType is DT_FLOAT16 / DT_FLOAT / DT_DOUBLE," + "but data_type=%s, node:%s", + TypeUtils::DataTypeToSerialString(data_type).c_str(), + node_->GetName().c_str()); + GELOGE(UNSUPPORTED, "[Check][DataType]Supported DataType is DT_FLOAT16 / DT_FLOAT / DT_DOUBLE," + "but data_type=%s, node:%s", + TypeUtils::DataTypeToSerialString(data_type).c_str(), + node_->GetName().c_str()); return UNSUPPORTED; } @@ -79,7 +89,7 @@ Status RandomUniformKernel::Generate(const ge::OpDescPtr &op_desc_ptr, int64_t s auto tensor_size = data_num * sizeof(T); TensorValue tensor; GE_CHK_STATUS_RET(context.AllocateTensor(tensor_size, tensor, &attr), - "[%s] Failed to allocate output of size %zu", + "[Invoke][AllocateTensor][%s] Failed to allocate output of size %zu", context.GetNodeName(), tensor_size); @@ -101,7 +111,7 @@ Status RandomUniformKernel::Generate(const ge::OpDescPtr &op_desc_ptr, int64_t s *(buf + i) = distribution(gen); } - GE_CHK_STATUS_RET(context.SetOutput(0, tensor), "[%s] Failed to set output.", context.GetNodeName()); + GE_CHK_STATUS_RET(context.SetOutput(0, tensor), "[Set][Output] failed for [%s].", context.GetNodeName()); return SUCCESS; } @@ -115,7 +125,7 @@ Status RandomUniformKernel::GenerateFP16(const ge::OpDescPtr &op_desc_ptr, int64 auto tensor_size = data_num * sizeof(fp16_t); TensorValue tensor; GE_CHK_STATUS_RET(context.AllocateTensor(tensor_size, tensor, &attr), - "[%s] Failed to allocate output of size %zu", + "[Invoke][AllocateTensor][%s] Failed to allocate output of size %zu", context.GetNodeName(), tensor_size); @@ -137,7 +147,7 @@ Status RandomUniformKernel::GenerateFP16(const ge::OpDescPtr &op_desc_ptr, int64 *(buf + i) = static_cast(distribution(gen)); } - GE_CHK_STATUS_RET(context.SetOutput(0, tensor), "[%s] Failed to set output.", context.GetNodeName()); + GE_CHK_STATUS_RET(context.SetOutput(0, tensor), "[Set][Output]failed for [%s].", context.GetNodeName()); return SUCCESS; } diff --git a/ge/hybrid/node_executor/host_cpu/kernel/variable_kernel.cc b/ge/hybrid/node_executor/host_cpu/kernel/variable_kernel.cc index 16738c2a..902a07c2 100644 --- 
a/ge/hybrid/node_executor/host_cpu/kernel/variable_kernel.cc +++ b/ge/hybrid/node_executor/host_cpu/kernel/variable_kernel.cc @@ -25,11 +25,12 @@ namespace host_cpu { Status VariableKernel::Compute(TaskContext& context) { auto tensor = context.GetVariable(node_->GetName()); if (tensor == nullptr) { - GELOGE(PARAM_INVALID, "tensor is NULL."); + REPORT_INNER_ERROR("E19999", "Get Variable from task context for node:%s failed.", context.GetNodeName()); + GELOGE(PARAM_INVALID, "[Check][Param]Get Variable from task context for node:%s failed.", context.GetNodeName()); return PARAM_INVALID; } // Constant & Variable Op has and only has one output - GE_CHK_STATUS_RET(context.SetOutput(0, *tensor), "[%s] Failed to set output.", context.GetNodeName()); + GE_CHK_STATUS_RET(context.SetOutput(0, *tensor), "[Set][Output] failed for [%s].", context.GetNodeName()); GELOGD("[%s] compute success.", node_->GetName().c_str()); return SUCCESS; } diff --git a/ge/hybrid/node_executor/host_cpu/kernel_factory.cc b/ge/hybrid/node_executor/host_cpu/kernel_factory.cc index aabae999..7d3ef703 100644 --- a/ge/hybrid/node_executor/host_cpu/kernel_factory.cc +++ b/ge/hybrid/node_executor/host_cpu/kernel_factory.cc @@ -34,7 +34,10 @@ std::shared_ptr KernelFactory::CreateKernel(const NodePtr &node) { if (iter != kernel_creator_map_.end()) { return iter->second(node); } - GELOGE(FAILED, "Not supported, type = %s, name = %s", node->GetType().c_str(), node->GetName().c_str()); + REPORT_INNER_ERROR("E19999", "Not supported because kernel_creator_map_ not contain type:%s, name = %s", + node->GetType().c_str(), node->GetName().c_str()); + GELOGE(FAILED, "[Find][NodeType]Not supported because kernel_creator_map_ not contain type = %s, name = %s", + node->GetType().c_str(), node->GetName().c_str()); return nullptr; } diff --git a/ge/hybrid/node_executor/node_executor.cc b/ge/hybrid/node_executor/node_executor.cc index e74256f2..d5d868ab 100755 --- a/ge/hybrid/node_executor/node_executor.cc +++ b/ge/hybrid/node_executor/node_executor.cc @@ -45,8 +45,7 @@ Status NodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const { Status NodeExecutor::ExecuteTask(NodeTask &task, TaskContext &context, const std::function &callback) const { HYBRID_CHK_STATUS_RET(task.ExecuteAsync(context, callback), - "Failed to execute task. node = %s", - context.GetNodeItem().NodeName().c_str()); + "[Execute][Task] failed. node = %s", context.GetNodeItem().NodeName().c_str()); return SUCCESS; } @@ -106,7 +105,10 @@ NodeExecutorManager::ExecutorType NodeExecutorManager::ResolveExecutorType(Node const auto &lib_name = op_desc->GetOpKernelLibName(); auto it = engine_mapping_.find(lib_name); if (it == engine_mapping_.end()) { - GELOGE(UNSUPPORTED, "KernelLib not supported. 
node = %s, lib_name = %s", node.GetName().c_str(), lib_name.c_str()); + REPORT_INNER_ERROR("E19999", "Failed to get ExecutorType by lib_name:%s, node:%s", + lib_name.c_str(), node.GetName().c_str()); + GELOGE(UNSUPPORTED, "[Find][ExecutorType]Failed to get ExecutorType by lib_name:%s, node:%s", + lib_name.c_str(), node.GetName().c_str()); return ExecutorType::RESERVED; } @@ -117,7 +119,10 @@ Status NodeExecutorManager::GetExecutor(Node &node, const NodeExecutor **executo auto executor_type = ResolveExecutorType(node); const auto it = executors_.find(executor_type); if (it == executors_.end()) { - GELOGE(INTERNAL_ERROR, "Failed to get executor by type: %d.", static_cast<int>(executor_type)); + REPORT_INNER_ERROR("E19999", "Failed to get executor by type: %d.", + static_cast<int>(executor_type)); + GELOGE(INTERNAL_ERROR, "[Check][ExecutorType]Failed to get executor by type: %d.", + static_cast<int>(executor_type)); return INTERNAL_ERROR; } @@ -155,16 +160,16 @@ Status NodeExecutorManager::CalcOpRunningParam(Node &node) const { GeShape output_shape = output_tensor.GetShape(); int64_t output_mem_size = 0; GE_CHK_STATUS_RET(TensorUtils::CalcTensorMemSize(output_shape, format, data_type, output_mem_size), - "hccl calc tensor mem size failed."); + "[Calc][TensorMemSize] failed, node:%s.", node.GetName().c_str()); GE_CHK_STATUS_RET(CheckInt64AddOverflow(output_mem_size, MEMORY_ALIGN_RATIO * MEMORY_ALIGN_SIZE - 1), - "[%s] Invalid output mem size: %ld", + "[Check][Overflow][%s] Invalid output mem size: %ld", node.GetName().c_str(), output_mem_size); output_mem_size = ((output_mem_size + MEMORY_ALIGN_RATIO * MEMORY_ALIGN_SIZE - 1) / MEMORY_ALIGN_SIZE) * MEMORY_ALIGN_SIZE; TensorUtils::SetSize(output_tensor, output_mem_size); GE_CHK_STATUS_RET(op_desc->UpdateOutputDesc(static_cast<uint32_t>(i), output_tensor), - "hccl update output size failed."); + "[Update][OutputDesc] failed, node:%s.", node.GetName().c_str()); GELOGD("%s output desc[%zu], dim_size: %zu, mem_size: %ld.", node.GetName().c_str(), i, output_tensor.GetShape().GetDimNum(), output_mem_size); } @@ -189,14 +194,17 @@ Status NodeExecutorManager::InitializeExecutors() { GE_CHECK_NOTNULL(build_fn); auto executor = std::unique_ptr<NodeExecutor>(build_fn()); if (executor == nullptr) { - GELOGE(INTERNAL_ERROR, "Failed to create executor for engine type = %d", static_cast<int>(engine_type)); + REPORT_CALL_ERROR("E19999", "Create NodeExecutor failed for engine type = %d", + static_cast<int>(engine_type)); + GELOGE(INTERNAL_ERROR, "[Create][NodeExecutor] failed for engine type = %d", static_cast<int>(engine_type)); return INTERNAL_ERROR; } GELOGD("Executor of engine type = %d was created successfully", static_cast<int>(engine_type)); auto ret = executor->Initialize(); if (ret != SUCCESS) { - GELOGE(ret, "Failed to initialize NodeExecutor of type = %d, clear executors", static_cast<int>(engine_type)); + REPORT_CALL_ERROR("E19999", "Initialize NodeExecutor failed for type = %d", static_cast<int>(engine_type)); + GELOGE(ret, "[Initialize][NodeExecutor] failed for type = %d", static_cast<int>(engine_type)); for (auto &executor_it : executors_) { executor_it.second->Finalize(); } diff --git a/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.cc b/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.cc index f01cb21e..28a5dea1 100755 --- a/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.cc +++ b/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.cc @@ -38,15 +38,14 @@ Status PartitionedCallNodeTask::Init(TaskContext &context) { Status
PartitionedCallNodeTask::ExecuteAsync(TaskContext &context, std::function done_callback) { GE_CHK_STATUS_RET(subgraph_executor_->ExecuteAsync(context), - "[%s] Failed to set inputs", graph_item_->GetName().c_str()); + "[Invoke][ExecuteAsync] failed for[%s]", graph_item_->GetName().c_str()); auto callback = [=]() { Callback(done_callback); }; GE_CHK_STATUS_RET(context.RegisterCallback(callback), - "[%s] Failed to register callback", - graph_item_->GetName().c_str()); + "[Register][Callback] failed for [%s]", graph_item_->GetName().c_str()); GELOGD("[%s] Done executing subgraph successfully.", graph_item_->GetName().c_str()); return SUCCESS; } @@ -83,7 +82,7 @@ Status PartitionedCallNodeExecutor::LoadTask(const ge::hybrid::HybridModel &mode Status PartitionedCallNodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const { RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[PartitionedCallPrepareTask] Start"); - GE_CHK_STATUS_RET(task.Init(context), "[%s] Failed to init task.", context.GetNodeName()); + GE_CHK_STATUS_RET(task.Init(context), "[Init][Task] failed for [%s].", context.GetNodeName()); RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[PartitionedCallPrepareTask] End"); return SUCCESS; } diff --git a/ge/hybrid/node_executor/task_context.cc b/ge/hybrid/node_executor/task_context.cc index 4e1b367b..db8fe9fe 100644 --- a/ge/hybrid/node_executor/task_context.cc +++ b/ge/hybrid/node_executor/task_context.cc @@ -63,17 +63,22 @@ std::unique_ptr TaskContext::Create(NodeState *node_state, node_item.output_start, node_item.num_outputs); if (node_item.input_start < 0 || node_item.output_start < 0) { + REPORT_INNER_ERROR("E19999", "NodeItem:%s(%s) not property initialized." + "input_start:%d or output_start:%d less than 0", + node_item.NodeName().c_str(), node_item.NodeType().c_str(), + node_item.input_start, node_item.output_start); GELOGE(INTERNAL_ERROR, - "NodeItem not property initialized. input_start = %d, output_start = %d", - node_item.input_start, - node_item.output_start); + "[Check][Param]NodeItem:%s(%s) not property initialized. input_start = %d, output_start = %d", + node_item.NodeName().c_str(), node_item.NodeType().c_str(), + node_item.input_start, node_item.output_start); return nullptr; } auto task_context = std::unique_ptr( new(std::nothrow)TaskContext(execution_context, node_state, subgraph_context)); if (task_context == nullptr) { - GELOGE(MEMALLOC_FAILED, "[%s] Failed to create instance of TaskContext.", node_item.NodeName().c_str()); + REPORT_CALL_ERROR("E19999", "Create TaskContext failed for [%s].", node_item.NodeName().c_str()); + GELOGE(MEMALLOC_FAILED, "[Create][TaskContext] failed for [%s].", node_item.NodeName().c_str()); return nullptr; } @@ -94,7 +99,12 @@ int TaskContext::NumOutputs() const { TensorValue *TaskContext::MutableInput(int index) { if (index < 0 || index >= node_item_->num_inputs) { - GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_inputs = %d", index, node_item_->num_inputs); + REPORT_INNER_ERROR("E19999", "Index out of range, check invalid. index = %d, num_inputs = %d, node:%s(%s)", + index, node_item_->num_inputs, + node_item_->NodeName().c_str(), node_item_->NodeType().c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]Index out of range. 
index = %d, num_inputs = %d, node:%s(%s)", + index, node_item_->num_inputs, + node_item_->NodeName().c_str(), node_item_->NodeType().c_str()); return nullptr; } @@ -103,7 +113,12 @@ TensorValue *TaskContext::MutableInput(int index) { const TensorValue *TaskContext::GetOutput(int index) const { if (index < 0 || index >= node_item_->num_outputs) { - GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_outputs = %d", index, node_item_->num_outputs); + REPORT_INNER_ERROR("E19999", "Index out of range, check invalid. index = %d, num_outputs = %d, node:%s(%s)", + index, node_item_->num_outputs, + node_item_->NodeName().c_str(), node_item_->NodeType().c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]Index out of range. index = %d, num_outputs = %d, node:%s(%s)", + index, node_item_->num_outputs, + node_item_->NodeName().c_str(), node_item_->NodeType().c_str()); return nullptr; } @@ -112,7 +127,12 @@ const TensorValue *TaskContext::GetOutput(int index) const { TensorValue *TaskContext::MutableOutput(int index) { if (index < 0 || index >= node_item_->num_outputs) { - GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_outputs = %d", index, node_item_->num_outputs); + REPORT_INNER_ERROR("E19999", "Index out of range, check invalid. index = %d, num_outputs = %d, node:%s(%s)", + index, node_item_->num_outputs, + node_item_->NodeName().c_str(), node_item_->NodeType().c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]Index out of range. index = %d, num_outputs = %d, node:%s(%s)", + index, node_item_->num_outputs, + node_item_->NodeName().c_str(), node_item_->NodeType().c_str()); return nullptr; } @@ -125,7 +145,10 @@ std::size_t TaskContext::NumWorkspaces() const { void *TaskContext::MutableWorkspace(int index) { if (index < 0 || static_cast(index) >= workspaces_.size()) { - GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_workspaces = %d", index, node_item_->num_outputs); + REPORT_INNER_ERROR("E19999", "Index:%d out of range, check invalid. number:%zu of workspaces_, node:%s(%s)", + index, workspaces_.size(), node_item_->NodeName().c_str(), node_item_->NodeType().c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]Index:%d out of range. number:%zu of workspaces_, node:%s(%s)", + index, workspaces_.size(), node_item_->NodeName().c_str(), node_item_->NodeType().c_str()); return nullptr; } @@ -134,7 +157,11 @@ void *TaskContext::MutableWorkspace(int index) { const TensorValue *TaskContext::GetInput(int index) const { if (index < 0 || index >= node_item_->num_inputs) { - GELOGE(PARAM_INVALID, "Index out of range. index = %d, num_inputs = %d", index, node_item_->num_inputs); + REPORT_INNER_ERROR("E19999", "Index:%d out of range, check invalid. num_inputs:%d node:%s(%s)", + index, node_item_->num_inputs, node_item_->NodeName().c_str(), + node_item_->NodeType().c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]Index:%d out of range. 
num_inputs:%d node:%s(%s)", + index, node_item_->num_inputs, node_item_->NodeName().c_str(), node_item_->NodeType().c_str()); return nullptr; } @@ -146,7 +173,10 @@ Status TaskContext::AllocateWorkspaces() { for (auto size : workspace_sizes) { void *workspace = execution_context_->allocator->Allocate(size); if (workspace == nullptr) { - GELOGE(MEMALLOC_FAILED, "Failed to allocate workspace of size: %ld", size); + REPORT_CALL_ERROR("E19999", "node:%s(%s) Allocate workspace failed, size: %ld", + node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), size); + GELOGE(MEMALLOC_FAILED, "[Allocate][workspace] failed for node:%s(%s), size: %ld", + node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), size); return MEMALLOC_FAILED; } @@ -162,7 +192,8 @@ Status TaskContext::RegisterCallback(const std::function &callback_fun) } auto ret = execution_context_->callback_manager->RegisterCallback(GetStream(), callback_fun); if (ret != SUCCESS) { - GELOGE(ret, "[%s] Failed to register callback", GetNodeName()); + REPORT_CALL_ERROR("E19999", "RegisterCallback failed for [%s]", GetNodeName()); + GELOGE(ret, "[Register][Callback] failed for [%s]", GetNodeName()); execution_context_->callback_manager->Destroy(); return ret; } @@ -187,7 +218,8 @@ string TaskContext::TensorDesc2String(const GeTensorDesc &desc) { Status TaskContext::AllocateTensor(const GeTensorDesc &tensor_desc, TensorValue &tensor, AllocationAttr *attr) { int64_t size = 0; if (ge::TensorUtils::GetSize(tensor_desc, size) != GRAPH_SUCCESS) { - GELOGE(INTERNAL_ERROR, "Failed to get tensor size"); + REPORT_CALL_ERROR("E19999", "Get TensorSize failed, tensor:%s", tensor_desc.GetName().c_str()); + GELOGE(INTERNAL_ERROR, "[Get][TensorSize] failed, tensor:%s", tensor_desc.GetName().c_str()); return INTERNAL_ERROR; } @@ -211,7 +243,12 @@ Status TaskContext::AllocateOutput(int index, TensorDesc2String(tensor_desc).c_str()); if (index < 0 || index >= node_item_->num_outputs) { - GELOGE(PARAM_INVALID, "output index out of range. num_output = %d, index = %d", node_item_->num_outputs, index); + REPORT_INNER_ERROR("E19999", "%s(%s) output index out of range check invalid. num_output = %d, index = %d", + node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), + node_item_->num_outputs, index); + GELOGE(PARAM_INVALID, "[Check][Param] %s(%s) output index out of range. num_output = %d, index = %d", + node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), + node_item_->num_outputs, index); return PARAM_INVALID; } @@ -289,7 +326,10 @@ Status TaskContext::AllocateOutputs(AllocationAttr *attr) { Status TaskContext::AllocateTensor(size_t size, TensorValue &tensor, AllocationAttr *attr) { auto buffer = TensorBuffer::Create(execution_context_->allocator, size, attr); if (buffer == nullptr) { - GELOGE(MEMALLOC_FAILED, "Failed to allocate buffer of size: %zu", size); + REPORT_CALL_ERROR("E19999", "%s(%s) Allocate buffer failed, size: %zu", + node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), size); + GELOGE(MEMALLOC_FAILED, "[Allocate][buffer] failed for %s(%s), size: %zu", + node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), size); return MEMALLOC_FAILED; } @@ -303,7 +343,12 @@ const NodeItem &TaskContext::GetNodeItem() const { Status TaskContext::SetOutput(int index, const TensorValue &tensor) { if (index < 0 || index >= node_item_->num_outputs) { - GELOGE(PARAM_INVALID, "output index out of range. 
num_output = %d, index = %d", node_item_->num_outputs, index); + REPORT_INNER_ERROR("E19999", "%s(%s) output index out of range check invalid. num_output = %d, index = %d", + node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), + node_item_->num_outputs, index); + GELOGE(PARAM_INVALID, "[Check][Param]%s(%s) output index out of range. num_output = %d, index = %d", + node_item_->NodeName().c_str(), node_item_->NodeType().c_str(), + node_item_->num_outputs, index); return PARAM_INVALID; } @@ -368,7 +413,8 @@ Status TaskContext::AllocateWorkspace(size_t size, void **buffer, void *ori_addr } if (*buffer == nullptr) { - GELOGE(MEMALLOC_FAILED, "Failed to allocate workspace of size = %zu", size); + REPORT_CALL_ERROR("E19999", "Allocate Workspace failed, size = %zu", size); + GELOGE(MEMALLOC_FAILED, "[Allocate][Workspace] failed, size = %zu", size); return MEMALLOC_FAILED; } @@ -400,11 +446,11 @@ Status TaskContext::PropagateOutputs() { input_offset); if (subgraph_context_->all_inputs_.size() <= static_cast(input_offset)) { - GELOGE(INTERNAL_ERROR, - "[%s] input index out of range. index = %d, total input num = %zu", - GetNodeName(), - input_offset, - subgraph_context_->all_inputs_.size()); + REPORT_INNER_ERROR("E19999", + "[%s] input index out of range check invalid. index = %d, total input num = %zu", + GetNodeName(), input_offset, subgraph_context_->all_inputs_.size()); + GELOGE(INTERNAL_ERROR, "[Check][Size][%s] input index out of range. index = %d, total input num = %zu", + GetNodeName(), input_offset, subgraph_context_->all_inputs_.size()); return INTERNAL_ERROR; } @@ -515,7 +561,8 @@ const DumpProperties &TaskContext::GetDumpProperties() const { } bool TaskContext::NeedCallback() { - return node_item_->has_observer || IsDumpEnabled() || execution_context_->profiling_level > 0; + return node_item_->has_observer || IsDumpEnabled() || execution_context_->profiling_level > 0 || + !execution_context_->model->IsSingleOp(); } Status TaskContext::Synchronize() { diff --git a/ge/init/gelib.cc b/ge/init/gelib.cc index ab7fbb29..39a18fd1 100644 --- a/ge/init/gelib.cc +++ b/ge/init/gelib.cc @@ -42,7 +42,6 @@ #include "graph/manager/graph_mem_allocator.h" #include "graph/manager/host_mem_manager.h" #include "graph/manager/graph_var_manager.h" -#include "omm/csa_interact.h" #include "runtime/kernel.h" #include "opskernel_manager/ops_kernel_builder_manager.h" #include "external/runtime/rt_error_codes.h" @@ -73,7 +72,7 @@ Status GELib::Initialize(const map &options) { return GE_CLI_INIT_FAILED; } - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kSystemInit); + ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kSystemInit); map new_options; Status ret = instancePtr_->SetRTSocVersion(options, new_options); if (ret != SUCCESS) { @@ -116,7 +115,7 @@ Status GELib::InnerInitialize(const map &options) { return SUCCESS; } - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kSystemInit); + ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kSystemInit); GELOGI("GE System initial."); GE_TIMESTAMP_START(SystemInitialize); Status initSystemStatus = SystemInitialize(options); @@ -127,7 +126,7 @@ Status GELib::InnerInitialize(const map &options) { return initSystemStatus; } - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kEngineInit); + ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kEngineInit); GELOGI("engineManager initial."); 
GE_TIMESTAMP_START(EngineInitialize); Status initEmStatus = engineManager_.Initialize(options); @@ -139,7 +138,7 @@ Status GELib::InnerInitialize(const map &options) { return initEmStatus; } - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kOpsKernelInit); + ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOpsKernelInit); GELOGI("opsManager initial."); GE_TIMESTAMP_START(OpsManagerInitialize); Status initOpsStatus = opsManager_.Initialize(options); @@ -151,7 +150,7 @@ Status GELib::InnerInitialize(const map &options) { return initOpsStatus; } - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kOpsKernelBuilderInit); + ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOpsKernelBuilderInit); GELOGI("opsBuilderManager initial."); GE_TIMESTAMP_START(OpsKernelBuilderManagerInitialize); Status initOpsBuilderStatus = OpsKernelBuilderManager::Instance().Initialize(options); @@ -163,7 +162,7 @@ Status GELib::InnerInitialize(const map &options) { return initOpsBuilderStatus; } - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOther); GELOGI("sessionManager initial."); GE_TIMESTAMP_START(SessionManagerInitialize); Status initSmStatus = sessionManager_.Initialize(options); @@ -376,10 +375,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status GELib::InitSystemWithOpt } GE_CHK_STATUS_RET(HostMemManager::Instance().Initialize()); - // Update CSA file - CsaInteract::GetInstance().Init(options.device_id, GetContext().TraceId()); - Status ret = CsaInteract::GetInstance().WriteJobState(JOBSTATE_RUNNING, JOBSUBSTATE_ENV_INIT); - GE_LOGE_IF(ret != SUCCESS, "[Write][JobState] failed, ret:%u ", ret); // set device id GELOGI("set logical device id:%u", options.device_id); @@ -408,10 +403,6 @@ Status GELib::SystemShutdownWithOptions(const Options &options) { GE_CHK_RT(rtDeviceReset(options.device_id)); - // Update CSA file - Status ret = CsaInteract::GetInstance().WriteJobState(JOBSTATE_SUCCEED); - GE_LOGE_IF(ret != SUCCESS, "[Write][JobState] failed, ret:%u ", ret); - is_system_inited = false; is_shutdown = true; GELOGI("%s finalize GELib success.", mode.c_str()); @@ -447,7 +438,7 @@ string GELib::GetPath() { return PluginManager::GetPath(); } // Finalize all modules Status GELib::Finalize() { - ErrorManager::GetInstance().SetStage(ErrorMessage::kFinalize, ErrorMessage::kFinalize); + ErrorManager::GetInstance().SetStage(error_message::kFinalize, error_message::kFinalize); GELOGI("finalization start"); // Finalization is not allowed before initialization if (!init_flag_) { diff --git a/ge/ir_build/ge_ir_build.cc b/ge/ir_build/ge_ir_build.cc index 336102d4..c9dfac07 100644 --- a/ge/ir_build/ge_ir_build.cc +++ b/ge/ir_build/ge_ir_build.cc @@ -32,7 +32,7 @@ #include "graph/utils/type_utils.h" #include "graph/ge_global_options.h" #include "init/gelib.h" -#include "ir_build/atc_ir_common.h" +#include "ir_build/option_utils.h" #include "model/ge_model.h" #include "graph/shape_refiner.h" #include "graph/opsproto_manager.h" @@ -202,12 +202,12 @@ graphStatus aclgrphBuildInitializeImpl(std::map &globa } graphStatus aclgrphBuildInitialize(std::map global_options) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOther); return 
aclgrphBuildInitializeImpl(global_options); } graphStatus aclgrphBuildInitialize(std::map &global_options) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOther); std::map tmp_global_options; for (auto &option : global_options) { if (option.first.GetString() == nullptr || option.second.GetString() == nullptr) { @@ -222,7 +222,7 @@ graphStatus aclgrphBuildInitialize(std::map &global_ } void aclgrphBuildFinalize() { - ErrorManager::GetInstance().SetStage(ErrorMessage::kFinalize, ErrorMessage::kFinalize); + ErrorManager::GetInstance().SetStage(error_message::kFinalize, error_message::kFinalize); if (ge::GELib::GetInstance() != nullptr && ge::GELib::GetInstance()->InitFlag()) { (void)ge::GELib::GetInstance()->Finalize(); return; @@ -299,10 +299,19 @@ graphStatus Impl::UpdateDataOpAttr(const Graph &graph) { GE_CHK_BOOL_EXEC(ParseInputShape(input_shape, shape_map, user_shape_map, true), return GRAPH_PARAM_INVALID, "[Parse][InputShape] failed!"); } - std::map>> shape_range_map; + std::map>> name_shape_range_map; + std::vector>> index_shape_range_map; if (!input_shape_range.empty()) { - GE_CHK_BOOL_EXEC(ParseInputShapeRange(input_shape_range, shape_range_map), - return GRAPH_PARAM_INVALID, "[Parse][InputShapeRange] failed."); + Status ret = GRAPH_PARAM_INVALID; + if (input_shape_range.find(":") != string::npos) { + ret = ParseInputShapeRange(input_shape_range, name_shape_range_map); + } else { + ret = ParseInputShapeRange(input_shape_range, index_shape_range_map); + } + if (ret != SUCCESS) { + GELOGE(GRAPH_PARAM_INVALID, "[Parse][InputShapeRange] parse shape range[%s] failed.", input_shape_range.c_str()); + return GRAPH_PARAM_INVALID; + } } auto compute_graph = ge::GraphUtils::GetComputeGraph(graph); GE_CHECK_NOTNULL(compute_graph); @@ -315,10 +324,14 @@ graphStatus Impl::UpdateDataOpAttr(const Graph &graph) { GELOGE(GRAPH_FAILED, "[Update][DataOpShape] fail for op:%s.", op->GetName().c_str()); return GRAPH_FAILED; } - if (UpdateDataOpShapeRange(op, shape_range_map) != SUCCESS) { + if (UpdateDataOpShapeRange(op, name_shape_range_map) != SUCCESS) { GELOGE(GRAPH_FAILED, "[Update][DataOpShapeRange] fail for op:%s.", op->GetName().c_str()); return GRAPH_FAILED; - } + } + if (UpdateDataOpShapeRange(op, index_shape_range_map) != SUCCESS) { + GELOGE(GRAPH_FAILED, "[Update][DataOpShapeRange] fail for op:%s.", op->GetName().c_str()); + return GRAPH_FAILED; + } } } @@ -574,7 +587,7 @@ graphStatus Impl::InitDomiOmgContext(const string &input_shape, const string &in } if (!ParseInputShape(input_shape, omg_context_.input_dims, omg_context_.user_input_dims, is_dynamic_input)) { - GELOGE(GRAPH_PARAM_INVALID, "[Parse][InputShape:input_shape] Failed, shape: %s", input_shape.c_str()); + GELOGE(GRAPH_PARAM_INVALID, "[Parse][InputShape:InputShape] Failed, shape: %s", input_shape.c_str()); return GRAPH_PARAM_INVALID; } return GRAPH_SUCCESS; @@ -582,7 +595,7 @@ graphStatus Impl::InitDomiOmgContext(const string &input_shape, const string &in graphStatus aclgrphBuildModel(const ge::Graph &graph, const std::map &build_options, ModelBufferData &model) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); GELOGD("Enter aclmdlBuildModel process!"); Impl builder; return builder.BuildModel(graph, build_options, model); @@ -590,7 +603,7 @@ graphStatus aclgrphBuildModel(const 
ge::Graph &graph, const std::map &build_options, ModelBufferData &model) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); GELOGD("Enter aclmdlBuildModel process!"); std::map tmp_build_options; for (auto &option : build_options) { @@ -608,7 +621,7 @@ graphStatus aclgrphBuildModel(const ge::Graph &graph, const std::map PATH_MAX || len != strlen(file) || strlen(file) == 0) { @@ -703,7 +716,7 @@ graphStatus aclgrphDumpGraph(const ge::Graph &graph, const char *file, const siz graphStatus aclgrphGenerateForOp(const AscendString &op_type, const vector &inputs, const vector &outputs, Graph &graph) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); auto op_type_str = std::string(op_type.GetString()); auto op_name = op_type_str + "_" + std::to_string(ge::GetCurrentTimestamp()); auto op_desc = ge::MakeShared(op_name, op_type_str); @@ -763,7 +776,7 @@ static std::string AttrTypeToSerialString(aclgrphAttrType attr_type) { } graphStatus aclgrphSetOpAttr(Graph &graph, aclgrphAttrType attr_type, const char *cfg_path) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); auto compute_graph = GraphUtils::GetComputeGraph(graph); GE_CHECK_NOTNULL(compute_graph); if (cfg_path == nullptr) { diff --git a/ge/ir_build/atc_ir_common.cc b/ge/ir_build/option_utils.cc similarity index 77% rename from ge/ir_build/atc_ir_common.cc rename to ge/ir_build/option_utils.cc index 6ce6ce7b..1be996b2 100755 --- a/ge/ir_build/atc_ir_common.cc +++ b/ge/ir_build/option_utils.cc @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "atc_ir_common.h" +#include "option_utils.h" #include "common/util/error_manager/error_manager.h" #include "external/ge/ge_api_types.h" #include "framework/common/string_util.h" @@ -22,6 +22,7 @@ #include "graph/compute_graph.h" #include "graph/utils/type_utils.h" #include "graph/utils/tensor_utils.h" +#include "graph/debug/ge_attr_define.h" using std::pair; using std::string; @@ -55,12 +56,15 @@ const char *const kDigitError = "is not digit"; const char *const kCompressWeightError = "it must be appointed when appoint parameter[--optypelist_for_implmode]"; const char *const kSelectImplmodeError = "only support high_performance, high_precision"; const char *const kDynamicBatchSizeError = "It can only contains digit, \",\", \" \""; +const char *const kDynamicImageSizeError = "It can only contains digit, \",\", \" \" and \";\""; const char *const kKeepDtypeError = "file not found"; const char *const kInputShapeRangeInvalid = "format of shape range is invalid"; +const char *const kInputShapeRangeSizeInvalid = " shape range size less than 2 is invalid"; const char *const kShapeRangeValueConvertError = "transfer from string to int64 error"; const char *const kInputShapeRangeSample1 = "\"input_name1:[n1~n2,c1,h1,w1]\""; const char *const kInputShapeRangeSample2 = "\"[1~20]\""; const char *const kInputShapeRangeSample3 = "\"[1~20,3,3~6,-1]\""; +const char *const kInputShapeRangeSample4 = "\"[1~20,3,3~6,-1],[1~20,3,3~6,-1]\""; vector SplitInputShape(const std::string &input_shape) { vector shape_pair_vec; @@ -71,6 +75,67 @@ vector SplitInputShape(const std::string &input_shape) { } return shape_pair_vec; } + +static bool StringToLongNoThrow(const string &str, long &val) { + try { + val = std::stol(str); + return true; + } catch (const std::invalid_argument) { + REPORT_INPUT_ERROR("E10048", std::vector({"shape_range", "reason", "sample"}), + std::vector({str, kShapeRangeValueConvertError, kInputShapeRangeSample3})); + GELOGE(PARAM_INVALID, "[Parse][Parameter] str:%s to long failed, reason: %s, correct sample is %s.", + str.c_str(), kShapeRangeValueConvertError, kInputShapeRangeSample3); + } catch (const std::out_of_range) { + REPORT_INPUT_ERROR("E10048", std::vector({"shape_range", "reason", "sample"}), + std::vector({str, kShapeRangeValueConvertError, kInputShapeRangeSample3})); + GELOGE(PARAM_INVALID, "[Parse][Parameter] str:%s to long failed, reason: %s, correct sample is %s.", + str.c_str(), kShapeRangeValueConvertError, kInputShapeRangeSample3); + } + return false; +} + +static bool ParseShapeRangePair(const string &shape_range, + const vector &range_pair_set, + std::pair &range_pair) { + if (range_pair_set.size() == 1) { + long range_value = 0; + if (!StringToLongNoThrow(range_pair_set.at(0), range_value)) { + return false; + } + if (range_value < 0) { + range_pair = std::make_pair(1, range_value); + } else { + range_pair = std::make_pair(range_value, range_value); + } + } else if (range_pair_set.size() == kRangePairSize) { + // unknown dim, should get range. 
+ long range_left = 0; + if (!StringToLongNoThrow(range_pair_set.at(0), range_left)) { + return false; + } + long range_right = 0; + if (!StringToLongNoThrow(range_pair_set.at(1), range_right)) { + return false; + } + if ((range_left < 0) || (range_right < 0)) { + REPORT_INPUT_ERROR("E10048", std::vector({"shape_range", "reason", "sample"}), + std::vector({shape_range, kInputShapeRangeInvalid, kInputShapeRangeSample3})); + GELOGE(PARAM_INVALID, + "[Parse][InputParameter] [--input_shape_range]'s shape range[%s] failed," + "reason: %s, correct sample is %s.", + shape_range.c_str(), kInputShapeRangeInvalid, kInputShapeRangeSample3); + return false; + } + range_pair = std::make_pair(range_left, range_right); + } else { + REPORT_INPUT_ERROR("E10048", std::vector({"shape_range", "reason", "sample"}), + std::vector({shape_range, kInputShapeRangeInvalid, kInputShapeRangeSample3})); + GELOGE(PARAM_INVALID, "[Parse][Parameter]shape_range:%s invalid, reason: %s, correct sample is %s.", + shape_range.c_str(), kInputShapeRangeInvalid, kInputShapeRangeSample3); + return false; + } + return true; +} } // namespace Status CheckInputFormat(const string &input_format) { @@ -93,7 +158,7 @@ bool CheckDynamicBatchSizeInputShapeValid(map> shape_map vector shape = iter->second; if (shape.empty()) { ErrorManager::GetInstance().ATCReportErrMessage("E10012"); - GELOGE(ge::PARAM_INVALID, + GELOGE(ge::PARAM_INVALID, "[Check][DynamicBatchSizeInputShape] shape size can not be less than 1 when set --dynamic_batch_size."); return false; } @@ -110,7 +175,7 @@ bool CheckDynamicBatchSizeInputShapeValid(map> shape_map if (size == 0) { ErrorManager::GetInstance().ATCReportErrMessage("E10031"); - GELOGE(ge::PARAM_INVALID, + GELOGE(ge::PARAM_INVALID, "[Check][DynamicBatchSizeInputShape]At least one batch n must be equal to -1 when set dynamic_batch_size."); return false; } @@ -170,6 +235,16 @@ bool CheckDynamicImagesizeInputShapeValid(map> shape_map } EraseEndSemicolon(dynamic_image_size); + for (char c : dynamic_image_size) { + bool is_char_valid = isdigit(c) || (c == ',') || (c == ' ') || (c == ';'); + if (!is_char_valid) { + ErrorManager::GetInstance().ATCReportErrMessage( + "E10033", {"value", "reason"}, {dynamic_image_size, kDynamicImageSizeError}); + GELOGE(ge::PARAM_INVALID, "[Check][DynamicImageSizeInputShape] --dynamic_image_size:%s is invalid. 
reason: %s", + dynamic_image_size.c_str(), kDynamicImageSizeError); + return false; + } + } // Different parameter sets are split string by ';' std::vector split_set = StringUtils::Split(dynamic_image_size, ';'); // Different dimensions are split by ',' @@ -244,7 +319,7 @@ bool CheckAndParseDynamicDims(int32_t dynamic_dim_num, std::string &dynamic_dims if (split_set.size() > kMaxDynamicDimNum) { ErrorManager::GetInstance().ATCReportErrMessage( "E10042", {"parameter", "reason"}, {"dynamic_dims", "dynamic_dims's num of parameter set can not exceed 100"}); - GELOGE(ge::PARAM_INVALID, + GELOGE(ge::PARAM_INVALID, "[CheckAndParse][DynamicDims]dynamic_dims's num of parameter set can not exceed %zu.", kMaxDynamicDimNum); return false; } @@ -265,7 +340,7 @@ bool CheckAndParseDynamicDims(int32_t dynamic_dim_num, std::string &dynamic_dims ErrorManager::GetInstance().ATCReportErrMessage( "E10001", {"parameter", "value", "reason"}, {"--dynamic_dims's parameter", dim.c_str(), "must be positive integer"}); - GELOGE(ge::PARAM_INVALID, + GELOGE(ge::PARAM_INVALID, "[CheckAndParse][DynamicDims]--dynamic_dims:%s parameter must be positive integer.", dynamic_dims.c_str()); return false; @@ -276,24 +351,6 @@ bool CheckAndParseDynamicDims(int32_t dynamic_dim_num, std::string &dynamic_dims return true; } -bool StringToLongNoThrow(const string &str, long &val) { - try { - val = std::stol(str); - return true; - } catch (const std::invalid_argument) { - ErrorManager::GetInstance().ATCReportErrMessage("E10048", {"shape_range", "reason", "sample"}, - {str, kShapeRangeValueConvertError, kInputShapeRangeSample3}); - GELOGE(PARAM_INVALID, "[Parse][Parameter] str:%s invalid, reason: %s, correct sample is %s.", - str.c_str(), kShapeRangeValueConvertError, kInputShapeRangeSample3); - } catch (const std::out_of_range) { - ErrorManager::GetInstance().ATCReportErrMessage("E10048", {"shape_range", "reason", "sample"}, - {str, kShapeRangeValueConvertError, kInputShapeRangeSample3}); - GELOGE(PARAM_INVALID, "[Parse][Parameter] str:%s invalid, reason: %s, correct sample is %s.", - str.c_str(), kShapeRangeValueConvertError, kInputShapeRangeSample3); - } - return false; -} - bool ParseSingleShapeRange(std::string &shape_range, vector> &shape_range_vec) { vector square_brackets; for (auto ch : shape_range) { @@ -320,41 +377,8 @@ bool ParseSingleShapeRange(std::string &shape_range, vector range_pair_set = ge::StringUtils::Split(range_pair_str, '~'); pair range_pair; - if (range_pair_set.size() == 1) { - long range_value = 0; - if (!StringToLongNoThrow(range_pair_set.at(0), range_value)) { - return false; - } - if (range_value < 0) { - range_pair = std::make_pair(1, range_value); - } else { - range_pair = std::make_pair(range_value, range_value); - } - } else if (range_pair_set.size() == kRangePairSize) { - // unknown dim, should get range. 
- long range_left = 0; - if (!StringToLongNoThrow(range_pair_set.at(0), range_left)) { - return false; - } - long range_right = 0; - if (!StringToLongNoThrow(range_pair_set.at(1), range_right)) { - return false; - } - if (range_left < 0 || (range_right < 0)) { - ErrorManager::GetInstance().ATCReportErrMessage("E10048", {"shape_range", "reason", "sample"}, - {shape_range, kInputShapeRangeInvalid, kInputShapeRangeSample3}); - GELOGE(PARAM_INVALID, - "[Parse][InputParameter] [--input_shape_range]'s shape range[%s] failed," - "reason: %s, correct sample is %s.", - shape_range.c_str(), kInputShapeRangeInvalid, kInputShapeRangeSample3); - return false; - } - range_pair = std::make_pair(range_left, range_right); - } else { - ErrorManager::GetInstance().ATCReportErrMessage("E10048", {"shape_range", "reason", "sample"}, - {shape_range, kInputShapeRangeInvalid, kInputShapeRangeSample3}); - GELOGE(PARAM_INVALID, "[Parse][Parameter]shape_range:%s invalid, reason: %s, correct sample is %s.", - shape_range.c_str(), kInputShapeRangeInvalid, kInputShapeRangeSample3); + if (!ParseShapeRangePair(shape_range, range_pair_set, range_pair)) { + GELOGE(PARAM_INVALID, "[Parse][RangePair] parse range pair failed."); return false; } shape_range_vec.emplace_back(range_pair); @@ -362,8 +386,13 @@ bool ParseSingleShapeRange(std::string &shape_range, vector>> &shape_range_map) { +/** + * Parser shape_range from string to map + * shape_range from option normally is "input1:[1~20,3,3~6,-1];input2:[1~20,3,3~6,-1]" + * @param shape_range + */ +Status ParseInputShapeRange(const std::string &shape_range, + std::map>> &shape_range_map) { GELOGD("Input shape range %s", shape_range.c_str()); vector shape_range_vec = StringUtils::Split(shape_range, ';'); @@ -373,27 +402,84 @@ bool ParseInputShapeRange(const std::string &shape_range, if (shape_range_pair_vec.size() != DEFAULT_SHAPE_RANGE_PAIR_SIZE) { ErrorManager::GetInstance().ATCReportErrMessage("E10048", {"shape_range", "reason", "sample"}, {shape_range, kSplitError1, kInputShapeRangeSample1}); - GELOGE(PARAM_INVALID, "[Parse][Parameter]--input shape_range:%s invalid, reason: %s, correct sample is %s.", + GELOGE(PARAM_INVALID, "[Parse][Parameter]--input shape_range:%s invalid, reason: %s, correct sample is %s.", shape_range.c_str(), kSplitError1, kInputShapeRangeSample1); - return false; + return PARAM_INVALID; } if (shape_range_pair_vec[1].empty()) { ErrorManager::GetInstance().ATCReportErrMessage("E10048", {"shape", "reason", "sample"}, {shape_range, kEmptyError, kInputShapeRangeSample1}); GELOGE(PARAM_INVALID, "[Parse][Parameter]shape_range:%s invalid,reason: %s, correct sample is %s.", shape_range.c_str(), kEmptyError, kInputShapeRangeSample1); - return false; + return PARAM_INVALID; } string shape_range_str = shape_range_pair_vec[1]; vector> shape_range_val; if (!ParseSingleShapeRange(shape_range_str, shape_range_val)) { GELOGE(PARAM_INVALID, "[Parse][Parameter] shape_range_str: %s invalid.", shape_range_str.c_str()); - return false; + return PARAM_INVALID; } shape_range_map.emplace(make_pair(StringUtils::Trim(shape_range_pair_vec[0]), shape_range_val)); } - return true; + return SUCCESS; +} + +/** + * Parser shape_range from string to vector + * shape_range from option normally is "[1~20,3,3~6,-1],[1~20,3,3~6,-1]" + * @param shape_range + */ +Status ParseInputShapeRange(const std::string &shape_range, + std::vector>> &range) { + GELOGD("Input shape range %s", shape_range.c_str()); + + if (shape_range.size() < 2) { + REPORT_INPUT_ERROR("E10048", 
std::vector({"shape_range", "reason", "sample"}), + std::vector({shape_range, kInputShapeRangeSizeInvalid, kInputShapeRangeSample4})); + GELOGE(PARAM_INVALID, "[Parse][ShapeRange] str:%s invalid, reason: %s, correct sample is %s.", + shape_range.c_str(), kInputShapeRangeSizeInvalid, kInputShapeRangeSample4); + return PARAM_INVALID; + } + // different shape_range of single input are split by ']' + vector shape_range_set = ge::StringUtils::Split(shape_range, ']'); + if (shape_range_set.empty()) { + REPORT_INPUT_ERROR("E10048", std::vector({"shape_range", "reason", "sample"}), + std::vector({shape_range, kInputShapeRangeInvalid, kInputShapeRangeSample4})); + GELOGE(PARAM_INVALID, "[Parse][ShapeRange] str:%s invalid, reason: %s, correct sample is %s.", + shape_range.c_str(), kInputShapeRangeInvalid, kInputShapeRangeSample4); + return PARAM_INVALID; + } + for (auto &shape_range_str : shape_range_set) { + if (shape_range_str.size() < 3) { + // shape_range_str should be "[2~3,1" + // or ",[2~3,1". because we should trim '[' or ',[' + // so shape_range_str.size() < 3 is invalid + continue; + } + // trim start bytes, after that, single input should be "1~20,3,3~6,-1" + if (ge::StringUtils::StartWith(shape_range_str, "[")) { + shape_range_str = shape_range_str.substr(1, shape_range_str.size()); + } + if (ge::StringUtils::StartWith(shape_range_str, ",")) { + shape_range_str = shape_range_str.substr(2, shape_range_str.size()); + } + + // parse shape_range of single input. eg. "1~20,3,3~6,-1" + std::vector> range_of_single_input; + vector dim_range_set = ge::StringUtils::Split(shape_range_str, ','); + for (const auto &range_pair_str : dim_range_set) { + vector range_pair_set = ge::StringUtils::Split(range_pair_str, '~'); + pair range_pair; + if (!ParseShapeRangePair(shape_range_str, range_pair_set, range_pair)) { + GELOGE(PARAM_INVALID, "[Parse][RangePair] Parse range pair failed."); + return PARAM_INVALID; + } + range_of_single_input.emplace_back(range_pair); + } + range.emplace_back(range_of_single_input); + } + return SUCCESS; } Status CheckDynamicInputParamValid(string &dynamic_batch_size, string &dynamic_image_size, string &dynamic_dims, @@ -409,11 +495,13 @@ Status CheckDynamicInputParamValid(string &dynamic_batch_size, string &dynamic_i } if (param_size == 0) { - if (!input_shape_range.empty()) { - std::map>> shape_range_map; - if (!ParseInputShapeRange(input_shape_range, shape_range_map)) { - GELOGE(ge::PARAM_INVALID, "[Parse][InputShapeRange] failed, range: %s", input_shape_range.c_str()); - return ge::PARAM_INVALID; + if (input_shape_range.find(":") != string::npos) { + if (!input_shape_range.empty()) { + std::map>> shape_range_map; + if (ParseInputShapeRange(input_shape_range, shape_range_map) != SUCCESS) { + GELOGE(ge::PARAM_INVALID, "[Parse][InputShapeRange] failed, range: %s", input_shape_range.c_str()); + return ge::PARAM_INVALID; + } } } return ge::SUCCESS; @@ -608,6 +696,11 @@ Status CheckKeepTypeParamValid(const std::string &keep_dtype) { int CheckLogParamValidAndSetLogLevel(const std::string log) { int ret = -1; + char *npu_collect_path = std::getenv("NPU_COLLECT_PATH"); + if (npu_collect_path != nullptr && log == "null") { + return 0; + } + if (log == "default") { ret = 0; } else if (log == "null") { @@ -733,10 +826,10 @@ Status UpdateDataOpShape(const OpDescPtr &op, map> &shap } Status UpdateDataOpShapeRange(const OpDescPtr &op, - map>> &shape_range_map) { + const map>> &name_shape_range_map) { GE_CHECK_NOTNULL(op); - if (shape_range_map.empty()) { - GELOGI("Shape range map of data 
op [%s] is empty.", op->GetName().c_str()); + if (name_shape_range_map.empty()) { + GELOGI("Shape range name map of data op [%s] is empty.", op->GetName().c_str()); return SUCCESS; } @@ -746,8 +839,8 @@ Status UpdateDataOpShapeRange(const OpDescPtr &op, GE_CHECK_NOTNULL(tensor_output); string data_op_name = op->GetName(); auto origin_shape = tensor_input->GetShape(); - auto iter = shape_range_map.find(data_op_name); - if (iter != shape_range_map.end()) { + auto iter = name_shape_range_map.find(data_op_name); + if (iter != name_shape_range_map.end()) { auto cur_shape_range = iter->second; if (TensorUtils::CheckShapeByShapeRange(origin_shape, cur_shape_range) != SUCCESS) { GELOGE(PARAM_INVALID, "[Check][OpDescPtr] Check shape by shape range failed for op:%s.", data_op_name.c_str()); @@ -772,6 +865,56 @@ Status UpdateDataOpShapeRange(const OpDescPtr &op, return SUCCESS; } +Status UpdateDataOpShapeRange(const OpDescPtr &op, + const vector>> &index_shape_range_map) { + GE_CHECK_NOTNULL(op); + if (index_shape_range_map.empty()) { + GELOGI("Shape range index map of data op [%s] is empty.", op->GetName().c_str()); + return SUCCESS; + } + + GeAttrValue::INT index = 0; + if (!AttrUtils::GetInt(op, ATTR_NAME_INDEX, index)) { + GELOGW("[%s] Get index from data attr failed.", op->GetName().c_str()); + return SUCCESS; + } + + if ((index < 0) || (static_cast(index) >= index_shape_range_map.size())) { + std::string situation = "data op index[" + std::to_string(index) + "]"; + std::string reason = "it must less than user_input size[" + std::to_string(index_shape_range_map.size()) + "]"; + REPORT_INPUT_ERROR("E19025", std::vector({"situation", "reason"}), + std::vector({situation, reason})); + GELOGE(PARAM_INVALID, "user_input size = %zu, graph data op index = %ld.", index_shape_range_map.size(), index); + return FAILED; + } + + auto tensor_input = op->MutableInputDesc(0); + auto tensor_output = op->MutableOutputDesc(0); + GE_CHECK_NOTNULL(tensor_input); + GE_CHECK_NOTNULL(tensor_output); + string data_op_name = op->GetName(); + auto origin_shape = tensor_input->GetShape(); + auto cur_shape_range = index_shape_range_map[index]; + if (TensorUtils::CheckShapeByShapeRange(origin_shape, cur_shape_range) != SUCCESS) { + GELOGE(PARAM_INVALID, "[Check][OpDescPtr] Check shape by shape range failed for op:%s.", data_op_name.c_str()); + return PARAM_INVALID; + } + for (size_t idx = 0; idx < cur_shape_range.size(); ++idx) { + auto left_range = cur_shape_range[idx].first; + auto right_range = cur_shape_range[idx].second; + if (left_range != right_range) { + origin_shape.SetDim(idx, UNKNOWN_DIM); + } + } + tensor_input->SetShape(origin_shape); + tensor_input->SetShapeRange(cur_shape_range); + tensor_output->SetShape(origin_shape); + tensor_output->SetShapeRange(cur_shape_range); + GELOGI("Update input [%s] shape range info success.", data_op_name.c_str()); + + return SUCCESS; +} + static Status CheckInputShapeRangeNode(const ComputeGraphPtr &compute_graph, const map>> &shape_range_map) { for (const auto &it : shape_range_map) { @@ -802,7 +945,7 @@ Status UpdateDynamicInputShapeRange(const ge::ComputeGraphPtr &compute_graph, co GE_CHECK_NOTNULL(compute_graph); map>> shape_range_map; - if (!ParseInputShapeRange(input_shape_range, shape_range_map)) { + if (ParseInputShapeRange(input_shape_range, shape_range_map) != SUCCESS) { GELOGE(PARAM_INVALID, "[Parse][InputShapeRange] input_shape_range:%s invalid.", input_shape_range.c_str()); return PARAM_INVALID; } diff --git a/ge/ir_build/atc_ir_common.h 
b/ge/ir_build/option_utils.h similarity index 87% rename from ge/ir_build/atc_ir_common.h rename to ge/ir_build/option_utils.h index 6ff40547..44504e35 100644 --- a/ge/ir_build/atc_ir_common.h +++ b/ge/ir_build/option_utils.h @@ -64,8 +64,10 @@ Status CheckDynamicInputParamValid(std::string &dynamic_batch_size, std::string bool ParseInputShape(const std::string &input_shape, std::map<string, std::vector<int64_t>> &shape_map, std::vector<std::pair<string, std::vector<int64_t>>> &user_shape_map, bool is_dynamic_input = false); -bool ParseInputShapeRange(const std::string &shape_range, - std::map<string, std::vector<std::pair<int64_t, int64_t>>> &shape_range_map); +Status ParseInputShapeRange(const std::string &shape_range, + std::map<string, std::vector<std::pair<int64_t, int64_t>>> &shape_range_map); +Status ParseInputShapeRange(const std::string &shape_range, + std::vector<std::vector<std::pair<int64_t, int64_t>>> &range); Status CheckOutputTypeParamValid(const std::string output_type); Status CheckBufferOptimizeParamValid(const std::string buffer_optimize); @@ -80,8 +82,10 @@ Status CheckKeepTypeParamValid(const std::string &keep_dtype); void PrintOptionMap(std::map<std::string, std::string> &options, std::string tips); void EraseEndSemicolon(std::string &param); Status UpdateDataOpShape(const OpDescPtr &op, std::map<string, std::vector<int64_t>> &shape_map); +Status UpdateDataOpShapeRange( + const OpDescPtr &op, const std::map<string, std::vector<std::pair<int64_t, int64_t>>> &name_shape_range_map); Status UpdateDataOpShapeRange(const OpDescPtr &op, - std::map<string, std::vector<std::pair<int64_t, int64_t>>> &shape_range_map); + const std::vector<std::vector<std::pair<int64_t, int64_t>>> &index_shape_range_map); Status UpdateDynamicInputShapeRange(const ge::ComputeGraphPtr &compute_graph, const string &input_shape_range); } #endif // FRAMEWORK_DOMI_ATC_IR_COMMON_H_ diff --git a/ge/model/ge_model.cc b/ge/model/ge_model.cc index acaeff0d..bcccc6f8 100755 --- a/ge/model/ge_model.cc +++ b/ge/model/ge_model.cc @@ -85,4 +85,14 @@ ProtoAttrMapHelper GeModel::MutableAttrMap() { return attrs_; } ConstProtoAttrMapHelper GeModel::GetAttrMap() const { return ConstProtoAttrMapHelper(attrs_.GetProtoOwner(), attrs_.GetProtoMsg()); } + +Status GeModel::GetSessionId(uint32_t model_id, uint64_t &session_id) const { + auto it = model_id_to_session_id_map_.find(model_id); + if (it != model_id_to_session_id_map_.end()) { + session_id = it->second; + return SUCCESS; + } + GELOGW("No session id was found with model id [%u].", model_id); + return INTERNAL_ERROR; +} } // namespace ge diff --git a/ge/model/ge_model.h b/ge/model/ge_model.h index 5676c3b6..08db8cc3 100755 --- a/ge/model/ge_model.h +++ b/ge/model/ge_model.h @@ -71,6 +71,11 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeModel : public AttrHolder void SetModelId(uint32_t model_id) { model_id_ = model_id; } uint32_t GetModelId() const { return model_id_; } + Status GetSessionId(uint32_t model_id, uint64_t &session_id) const; + void InsertSessionMap(uint32_t model_id, uint64_t session_id) { + model_id_to_session_id_map_.insert({model_id, session_id}); + } + protected: ConstProtoAttrMapHelper GetAttrMap() const override; @@ -90,6 +95,7 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY GeModel : public AttrHolder std::string platform_version_; uint8_t platform_type_ = {0}; uint32_t model_id_ = INVALID_MODEL_ID; + std::map<uint32_t, uint64_t> model_id_to_session_id_map_; }; } // namespace ge using GeModelPtr = std::shared_ptr<ge::GeModel>; diff --git a/ge/model/ge_root_model.h b/ge/model/ge_root_model.h index aa5a4d47..b8ff7b7a 100755 --- a/ge/model/ge_root_model.h +++ b/ge/model/ge_root_model.h @@ -32,15 +32,38 @@ class GeRootModel { return subgraph_instance_name_to_model_; }; - const ComputeGraphPtr &GetRootGraph() const { return root_graph_; }; - void SetModelId(uint32_t model_id) { model_id_ = model_id; } + const ComputeGraphPtr &GetRootGraph() const {
return root_graph_; } + void SetModelId(uint32_t model_id) { + model_id_ = model_id; + // cached for removal + model_ids_.emplace_back(model_id); + } uint32_t GetModelId() const { return model_id_; } + + void SetModelName(const std::string &model_name) { model_name_ = model_name; } + + const std::string &GetModelName() const { return model_name_; } + + std::vector<uint32_t> GetAllModelId() const { return model_ids_; } + + void ClearAllModelId() { model_ids_.clear(); } + Status CheckIsUnknownShape(bool &is_dynamic_shape); + void SetRootGraph(ComputeGraphPtr graph) { root_graph_ = graph; } + + void SetTrainFlag(bool flag) { train_flag_ = flag; } + + bool GetTrainFlag() const { return train_flag_; } + private: ComputeGraphPtr root_graph_ = nullptr; std::map<std::string, GeModelPtr> subgraph_instance_name_to_model_; uint32_t model_id_ = 0; + // In multithread online scenario, the same graph can own different davinci_models for concurrency + std::vector<uint32_t> model_ids_; + bool train_flag_ = false; + std::string model_name_; }; } // namespace ge using GeRootModelPtr = std::shared_ptr<GeRootModel>; diff --git a/ge/offline/CMakeLists.txt b/ge/offline/CMakeLists.txt index 87589859..2a0f0ff0 100644 --- a/ge/offline/CMakeLists.txt +++ b/ge/offline/CMakeLists.txt @@ -11,7 +11,7 @@ set(SRC_LIST "main.cc" "single_op_parser.cc" "../session/omg.cc" - "../ir_build/atc_ir_common.cc" + "../ir_build/option_utils.cc" ) ############ atc_atc.bin ############ diff --git a/ge/offline/main.cc b/ge/offline/main.cc index 54a1d8fb..6603a3f5 100755 --- a/ge/offline/main.cc +++ b/ge/offline/main.cc @@ -36,7 +36,7 @@ #include "graph/utils/graph_utils.h" #include "graph/utils/type_utils.h" #include "init/gelib.h" -#include "ir_build/atc_ir_common.h" +#include "ir_build/option_utils.h" #include "omg/omg.h" #include "omg/parser/parser_factory.h" #include "omg/parser/parser_inner_ctx.h" @@ -220,6 +220,8 @@ DEFINE_string(performance_mode, "", "Optional; express high compile performance "normal: no need to compile, used saved .o files directly;" "high: need to recompile, high execute performance mode."); +DEFINE_string(device_id, "0", "Optional; user device id"); + class GFlagUtils { public: /** @@ -579,7 +581,7 @@ class GFlagUtils { if (fileName.size() > static_cast<size_t>(PATH_MAX)) { ErrorManager::GetInstance().ATCReportErrMessage( "E10021", {"parameter", "size"}, {"output", std::to_string(PATH_MAX)}); - GELOGE(ge::FAILED, + GELOGE(ge::FAILED, "[Check][Path]Input parameter[--output]'s path is too long, it must be less than %d", PATH_MAX); return false; } @@ -638,7 +640,7 @@ static bool CheckInputFormat() { // only support NCHW ND ErrorManager::GetInstance().ATCReportErrMessage( "E10001", {"parameter", "value", "reason"}, {"--input_format", FLAGS_input_format, kCaffeFormatSupport}); - GELOGE(ge::FAILED, "[Check][InputFormat]Invalid value for --input_format[%s], %s.", + GELOGE(ge::FAILED, "[Check][InputFormat]Invalid value for --input_format[%s], %s.", FLAGS_input_format.c_str(), kCaffeFormatSupport); return false; } else if ((FLAGS_framework == static_cast<int32_t>(domi::TENSORFLOW))) { // tf @@ -648,7 +650,7 @@ static bool CheckInputFormat() { // only support NCHW NHWC ND NCDHW NDHWC ErrorManager::GetInstance().ATCReportErrMessage( "E10001", {"parameter", "value", "reason"}, {"--input_format", FLAGS_input_format, kTFFormatSupport}); - GELOGE(ge::FAILED, "[Check][InputFormat]Invalid value for --input_format[%s], %s.", + GELOGE(ge::FAILED, "[Check][InputFormat]Invalid value for --input_format[%s], %s.", FLAGS_input_format.c_str(), kTFFormatSupport); return false; } else if (FLAGS_framework
== static_cast(domi::ONNX)) { @@ -658,7 +660,7 @@ static bool CheckInputFormat() { // only support NCHW ND ErrorManager::GetInstance().ATCReportErrMessage( "E10001", {"parameter", "value", "reason"}, {"--input_format", FLAGS_input_format, kONNXFormatSupport}); - GELOGE(ge::FAILED, "[Check][InputFormat]Invalid value for --input_format[%s], %s.", + GELOGE(ge::FAILED, "[Check][InputFormat]Invalid value for --input_format[%s], %s.", FLAGS_input_format.c_str(), kONNXFormatSupport); return false; } @@ -903,7 +905,7 @@ static Status ConvertModelToJson(int fwk_type, const string &model_file, const s ErrorManager::GetInstance().ATCReportErrMessage( "E10001", {"parameter", "value", "reason"}, {"--framework", std::to_string(fwk_type), kModelToJsonSupport}); - GELOGE(ge::FAILED, "[Convert][ModelToJson]Invalid value for --framework[%d], %s.", + GELOGE(ge::FAILED, "[Convert][ModelToJson]Invalid value for --framework[%d], %s.", fwk_type, kModelToJsonSupport); ret = ge::FAILED; } @@ -969,7 +971,7 @@ domi::Status GenerateModel(std::map &options, std::string output ge::Graph graph; std::vector inputs; if (FLAGS_framework == domi::MINDSPORE) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); // load model from file ge::Model load_model = ge::Model("loadmodel", "version2"); auto ret1 = load_model.LoadFromFile(FLAGS_model); @@ -1010,12 +1012,12 @@ domi::Status GenerateModel(std::map &options, std::string output atc_params.insert(std::pair(string(ge::OUTPUT_DATATYPE), FLAGS_output_type)); atc_params.insert(std::pair("output", output)); - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kParser); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kParser); Status ret = ParseGraph(graph, atc_params, FLAGS_model.c_str(), FLAGS_weight.c_str(), (domi::FrameworkType)FLAGS_framework, FLAGS_op_name_map.c_str(), FLAGS_target.c_str(), (ge::RunMode)FLAGS_mode, is_dynamic_input); - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); // in ONLY_PRE_CHECK mode, pre-checking report has already saved in ParseGraph if (FLAGS_mode == ge::ONLY_PRE_CHECK) { (void)ge_generator.Finalize(); @@ -1084,6 +1086,7 @@ static void SetEnvForSingleOp(std::map &options) { options.emplace(ge::MDL_BANK_PATH_FLAG, FLAGS_mdl_bank_path); options.emplace(ge::OP_BANK_PATH_FLAG, FLAGS_op_bank_path); options.emplace(ge::PERFORMANCE_MODE, FLAGS_performance_mode); + options.emplace(ge::TUNE_DEVICE_IDS, FLAGS_device_id); } domi::Status GenerateSingleOp(const std::string& json_file_path) { @@ -1114,7 +1117,7 @@ domi::Status GenerateSingleOp(const std::string& json_file_path) { return domi::FAILED; } - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kParser); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kParser); vector build_params; if (ge::SingleOpParser::ParseSingleOpList(json_file_path, build_params) != ge::SUCCESS) { DOMI_LOGE("parse single op json file failed"); @@ -1176,6 +1179,7 @@ domi::Status GenerateOmModel() { options.insert(std::pair(string(ge::OUTPUT_NODE_NAME), FLAGS_out_nodes)); options.insert(std::pair(string(ge::INSERT_OP_FILE), FLAGS_insert_op_conf)); options.insert(std::pair(string(ge::PRECISION_MODE), FLAGS_precision_mode)); + 
options.insert(std::pair(string(ge::TUNE_DEVICE_IDS), FLAGS_device_id)); options.insert(std::pair(string(ge::RUN_FLAG), to_string(0))); options.insert(std::pair(string(ge::TRAIN_FLAG), to_string(0))); @@ -1249,7 +1253,7 @@ domi::Status GenerateOmModel() { return domi::FAILED; } - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); if (FLAGS_display_model_info == "1") { GELOGI("need to display model info."); return ge::ConvertOm(FLAGS_output.c_str(), "", false); @@ -1259,7 +1263,7 @@ domi::Status GenerateOmModel() { } domi::Status ConvertModelToJson() { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); Status ret = GFlagUtils::CheckConverJsonParamFlags(); GE_CHK_BOOL_EXEC(ret == domi::SUCCESS, return domi::FAILED, "[CheckConver][JsonParamFlags] failed!"); @@ -1270,7 +1274,7 @@ domi::Status ConvertModelToJson() { } domi::Status DisplayModelInfo() { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); // No model path passed in GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(FLAGS_om == "", ErrorManager::GetInstance().ATCReportErrMessage("E10004", {"parameter"}, {"om"}); @@ -1319,7 +1323,7 @@ bool CheckRet(domi::Status ret) { } domi::Status ConvertPbtxtToJson() { - ErrorManager::GetInstance().SetStage(ErrorMessage::kModelCompile, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kModelCompile, error_message::kOther); Status ret = GFlagUtils::CheckConverJsonParamFlags(); if (ret != domi::SUCCESS) { GELOGE(ge::FAILED, "[CheckConver][JsonParamFlags] failed!"); @@ -1409,7 +1413,7 @@ bool CheckMemInfo() { } int main(int argc, char* argv[]) { - ErrorManager::GetInstance().SetStage(ErrorMessage::kInitialize, ErrorMessage::kOther); + ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOther); Status ret = domi::SUCCESS; std::cout << "ATC start working now, please wait for a moment." 
<< std::endl; @@ -1450,7 +1454,7 @@ int main(int argc, char* argv[]) { } } while (0); - ErrorManager::GetInstance().SetStage(ErrorMessage::kFinalize, ErrorMessage::kFinalize); + ErrorManager::GetInstance().SetStage(error_message::kFinalize, error_message::kFinalize); if (!CheckRet(ret)) { std::cout << "ATC run failed, Please check the detail log, Try \'atc --help\' for more information" << std::endl; int result = ErrorManager::GetInstance().OutputErrMessage(STDOUT_FILENO); diff --git a/ge/offline/module.mk b/ge/offline/module.mk index 5c7a919c..27c5863a 100755 --- a/ge/offline/module.mk +++ b/ge/offline/module.mk @@ -12,7 +12,7 @@ LOCAL_SRC_FILES := \ main.cc \ single_op_parser.cc \ ../session/omg.cc \ - ../ir_build/atc_ir_common.cc \ + ../ir_build/option_utils.cc \ LOCAL_C_INCLUDES := \ $(LOCAL_PATH)/../ ./ \ @@ -65,7 +65,7 @@ LOCAL_SRC_FILES := \ main.cc \ single_op_parser.cc \ ../session/omg.cc \ - ../ir_build/atc_ir_common.cc \ + ../ir_build/option_utils.cc \ LOCAL_C_INCLUDES := \ $(LOCAL_PATH)/../ ./ \ @@ -118,7 +118,7 @@ LOCAL_SRC_FILES := \ main.cc \ single_op_parser.cc \ ../session/omg.cc \ - ../ir_build/atc_ir_common.cc \ + ../ir_build/option_utils.cc \ LOCAL_C_INCLUDES := \ $(LOCAL_PATH)/../ ./ \ diff --git a/ge/offline/proto/ge_ir.proto b/ge/offline/proto/ge_ir.proto index 12989a54..c0ef3071 100644 --- a/ge/offline/proto/ge_ir.proto +++ b/ge/offline/proto/ge_ir.proto @@ -31,6 +31,8 @@ enum DataType DT_STRING_REF = 24; // string_ref type DT_DUAL = 25; /**< dual output type */ DT_VARIANT = 26; // variant type + DT_BF16 = 27; // bf16 type + DT_INT4 = 28; // int4 type } message AttrDef diff --git a/ge/omm/csa_interact.cc b/ge/omm/csa_interact.cc deleted file mode 100644 index 15bca075..00000000 --- a/ge/omm/csa_interact.cc +++ /dev/null @@ -1,265 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "omm/csa_interact.h" - -#include "framework/common/debug/ge_log.h" -#include "framework/common/debug/log.h" -#include "framework/common/util.h" -#include "graph/ge_context.h" -#include "graph/manager/graph_var_manager.h" -#include "graph/utils/tensor_utils.h" -#include "mmpa/mmpa_api.h" -#include "nlohmann/json.hpp" - -namespace ge { -namespace { -const char FMK_STATUS_FILE_DIR_ENV[] = "FMK_STATUS_FILE_DIR"; -const char JOBSTATE_FILE_NAME[] = "jobstateupdate_framework"; -const char HCOM_DETECT_FILE_NAME[] = "hcom_detection_result"; -const char FILE_SEPARATE[] = "/"; -} // namespace - -/// -/// @brief Obtain CsaInteract instance -/// @return CsaInteract instance -/// -CsaInteract &CsaInteract::GetInstance() { - static CsaInteract instance; - return instance; -} - -/// -/// @brief CsaInteract instance initialization -/// @param [in] dev_index device index -/// @param [in] job_id job id -/// @return void -/// -void CsaInteract::Init(int32_t dev_index, int64_t job_id) { - if (!is_init_) { - dev_index_ = dev_index; - job_id_ = job_id; - - char file_dir_env[MMPA_MAX_PATH] = { 0x00 }; - INT32 res = mmGetEnv(FMK_STATUS_FILE_DIR_ENV, file_dir_env, MMPA_MAX_PATH); - string csa_path_prefix; - if (res == EN_OK) { - csa_path_prefix = file_dir_env; - } - if (!csa_path_prefix.empty()) { - job_state_file_ = csa_path_prefix + std::to_string(dev_index_) + FILE_SEPARATE + JOBSTATE_FILE_NAME; - hcom_detect_file_ = csa_path_prefix + std::to_string(dev_index_) + FILE_SEPARATE + HCOM_DETECT_FILE_NAME; - } - is_init_ = true; - } -} - -/// -/// @brief Update job state file -/// @param [in] job_state job state -/// @param [in] job_sub_state detailed job state -/// @param [in] module_ret_errcode sub module training failure error code -/// @param [in] error_module error module identified by FMK -/// @return Status -/// -Status CsaInteract::WriteJobState(JobState job_state, JobSubState job_sub_state, uint32_t module_ret_errcode, - ErrorModule error_module) { - if (!is_init_) { - GELOGE(INTERNAL_ERROR, "[Init][CsaInteract] obj has not init, can't WriteJobState"); - REPORT_INNER_ERROR("E19999", "WriteJobState failed before init. "); - return INTERNAL_ERROR; - } - if ((curr_state_ == JOBSTATE_FAILED) || (curr_state_ == JOBSTATE_KILLED)) { - return SUCCESS; - } - - if (job_state_file_.empty()) { - return SUCCESS; - } - - std::string content; - try { - nlohmann::json content_json; - content_json["job_id"] = job_id_; - content_json["jobstate"] = job_state; - // Only the running or running failure state has a job sub state - if ((job_state == JOBSTATE_RUNNING) || (job_state == JOBSTATE_FAILED)) { - content_json["job_sub_state"] = job_sub_state; - } - content_json["time"] = CurrentTimeInStr(); - // Write error code only if run failed - if (job_state == JOBSTATE_FAILED) { - content_json["errorcode"] = module_ret_errcode; - content_json["errmodule"] = error_module; - } - - content = content_json.dump(); - } catch (const nlohmann::json::exception &e) { - GELOGE(INTERNAL_ERROR, "[Create][JsonObject] exception:%s job_state:%u job_sub_state:%u.", - e.what(), job_state, job_sub_state); - REPORT_INNER_ERROR("E19999", "Create json object failed. 
exception:%s job_state:%u job_sub_state:%u.", - e.what(), job_state, job_sub_state); - return INTERNAL_ERROR; - } - - if (WriteFile(job_state_file_, content) != SUCCESS) { - // The error log subfunction has been printed and will not print again - return INTERNAL_ERROR; - } - - curr_state_ = job_state; - return SUCCESS; -} - -/// -/// @brief Update error code in the job state file -/// @param [in] module_ret_errcode sub module training failure error code -/// @param [in] error_module error module identified by FMK -/// @param [in] job_sub_state detailed job state -/// @return void -/// -void CsaInteract::WriteErrorCode(uint32_t module_ret_errcode, ErrorModule error_module, JobSubState job_sub_state) { - // The error log subfunction has been printed and will not print again - Status ret = WriteJobState(JOBSTATE_FAILED, job_sub_state, module_ret_errcode, error_module); - if (ret != SUCCESS) { - GELOGW("write error code fail. ret_code: %u, status: %u", module_ret_errcode, job_sub_state); - } -} - -/// -/// @brief Record errors that occurred durning the training -/// @param [in] module_ret_errcode sub module training failure error code -/// @param [in] error_module error module identified by FMK -/// @param [in] job_sub_state detailed job state -/// @return void -/// -void CsaInteract::StoreInternalErrorCode(uint32_t module_ret_errcode, ErrorModule error_module, - JobSubState job_sub_state) { - is_have_internal_error_ = true; - - csa_error_code_.module_ret_errcode = module_ret_errcode; - csa_error_code_.error_module = error_module; - csa_error_code_.job_sub_state = job_sub_state; -} - -/// -/// @brief Update training error code in the job state file -/// @return void -/// -void CsaInteract::WriteInternalErrorCode() { - if (is_have_internal_error_) { - WriteErrorCode(csa_error_code_.module_ret_errcode, csa_error_code_.error_module, csa_error_code_.job_sub_state); - } -} - -/// -/// @brief Update network connectivity detect file -/// @param [in] content network connectivity content -/// @return Status -/// -Status CsaInteract::WriteHcomDetection(const std::string &content) { - if (!is_init_) { - GELOGE(INTERNAL_ERROR, "[Init][CsaInteract] obj has not init, can't WriteJobState"); - REPORT_INNER_ERROR("E19999", "WriteHcomDetection failed before init."); - return INTERNAL_ERROR; - } - - if (hcom_detect_file_.empty()) { - return SUCCESS; - } - - return WriteFile(hcom_detect_file_, content); -} - -/// -/// @ingroup WriteFile -/// @brief Write the content into the file. If the file does not exist, create the file -/// @param [in] file_name: File name to be written -/// @param [in] content: Contents to be written -/// @return Status -/// -Status CsaInteract::WriteFile(const std::string &file_name, const std::string &content) { - // if file path is not exist, then make path - INT32 flags = M_WRONLY | O_TRUNC | M_CREAT; - int32_t fd = mmOpen2(file_name.c_str(), flags, M_IRUSR | M_IWUSR | M_UMASK_GRPREAD); - if (fd == EN_ERROR) { - if (MakePath(file_name) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "[Create][File Path] errno is %d", errno); - REPORT_CALL_ERROR("E19999", "MakePath failed. errno is %d", errno); - return INTERNAL_ERROR; - } - fd = mmOpen2(file_name.c_str(), flags, M_IRUSR | M_IWUSR | M_UMASK_GRPREAD); - if (fd == EN_ERROR) { - GELOGE(INTERNAL_ERROR, "[Open][File] errno is %d file_name: %s", errno, file_name.c_str()); - REPORT_CALL_ERROR("E19999", "mmOpen2 failed. 
errno is %d file_name: %s", errno, file_name.c_str()); - return INTERNAL_ERROR; - } - } - - mmSsize_t ret = mmWrite(fd, reinterpret_cast(const_cast(content.c_str())), content.length()); - if (ret == EN_ERROR) { - GELOGE(INTERNAL_ERROR, "[Write][File] errno is %d", errno); - REPORT_CALL_ERROR("E19999", "mmWrite failed. errno is %d", errno); - ret = mmClose(fd); - if (ret == EN_ERROR) { - GELOGE(INTERNAL_ERROR, "[Close][File] error is %d", errno); - REPORT_CALL_ERROR("E19999", "mmClose failed. error is %d", errno); - } - return INTERNAL_ERROR; - } - ret = mmClose(fd); - if (ret == EN_ERROR) { - GELOGE(INTERNAL_ERROR, "[Close][File] error is %d", errno); - REPORT_CALL_ERROR("E19999", "mmClose failed. error is %d", errno); - return INTERNAL_ERROR; - } - - return SUCCESS; -} - -/// -/// @ingroup MakePath -/// @brief Verify whether the file path exists, if not, recursively create the folder -/// @param [in] file_name: File name to be verified -/// @return Status -/// -Status CsaInteract::MakePath(const std::string &file_name) { - std::size_t found = file_name.find_last_of("/"); - if (found == std::string::npos) { - return PARAM_INVALID; - } - - std::string file_path = file_name.substr(0, found + 1); - if (mmAccess(file_path.c_str()) == EN_OK) { - return SUCCESS; - } - - found = file_path.find_first_of("/"); - while (found != std::string::npos) { - std::string pre_path = file_path.substr(0, found + 1); - if (mmAccess(pre_path.c_str()) != EN_OK) { - if (mmMkdir(pre_path.c_str(), M_IRWXU) != EN_OK) { - GELOGE(INTERNAL_ERROR, "[Create][FileDir] fail, errno is %d, pre_path:%s", errno, pre_path.c_str()); - REPORT_CALL_ERROR("E19999", "mmMkdir failed. errno is %d pre_path:%s", errno, pre_path.c_str()); - return INTERNAL_ERROR; - } - } - found = file_path.find_first_of("/", found + 1); - } - - return SUCCESS; -} -} // namespace ge diff --git a/ge/omm/csa_interact.h b/ge/omm/csa_interact.h deleted file mode 100644 index 0a609e09..00000000 --- a/ge/omm/csa_interact.h +++ /dev/null @@ -1,183 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef GE_OMM_CSA_INTERACT_H_ -#define GE_OMM_CSA_INTERACT_H_ - -#include - -#include "framework/common/ge_inner_error_codes.h" - -namespace ge { -enum JobState { - JOBSTATE_WAITING = 1, - JOBSTATE_RUNNING, - JOBSTATE_KILLING, - JOBSTATE_SUCCEED, - JOBSTATE_FAILED, - JOBSTATE_KILLED, - JOBSTATE_UNKOWN -}; - -enum JobSubState { - JOBSUBSTATE_ENV_INIT = 201, - JOBSUBSTATE_ENV_FIN, - JOBSUBSTATE_RESOUCE_ALLOC, - JOBSUBSTATE_MODEL_COMPILE, - JOBSUBSTATE_GRAPH_PREPARE, - JOBSUBSTATE_GRAPH_SPLIT, - JOBSUBSTATE_GRAPH_OPTIMIZE, - JOBSUBSTATE_GRAPH_BUILD, - JOBSUBSTATE_GRAPH_LOAD, - JOBSUBSTATE_GRAPH_EXEC, - JOBSUBSTATE_GRAPH_UNLOAD, - JOBSUBSTATE_OTHER -}; - -enum ErrorModule { - ERROR_MODULE_DRIVER = 0x01, - ERROR_MODULE_RUNTIME = 0x04, - ERROR_MODULE_CCE = 0x06, - ERROR_MODULE_FMK = 0x08, - ERROR_MODULE_HCCL = 0x12 -}; - -struct CsaErrorCode { - CsaErrorCode() - : module_ret_errcode(0), - error_module(ERROR_MODULE_FMK), - job_sub_state(JOBSUBSTATE_OTHER) {} - ~CsaErrorCode() {} - uint32_t module_ret_errcode; - ErrorModule error_module; - JobSubState job_sub_state; -}; -class CsaInteract { - public: - /// - /// @brief Obtain CsaInteract instance - /// @return CsaInteract instance - /// - static CsaInteract& GetInstance(); - - /// - /// @brief CsaInteract instance initialization - /// @param [in] dev_index device index - /// @param [in] job_id job id - /// @return void - /// - void Init(int32_t dev_index, int64_t job_id); - - /// - /// @brief Update job state file - /// @param [in] job_state job state - /// @param [in] job_sub_state detailed job state - /// @param [in] module_ret_errcode sub module training failure error code - /// @param [in] error_module error module identified by FMK - /// @return Status - /// - Status WriteJobState(JobState job_state, - JobSubState job_sub_state = JOBSUBSTATE_OTHER, - uint32_t module_ret_errcode = SUCCESS, - ErrorModule error_module = ERROR_MODULE_FMK); - - /// - /// @brief Update error code in the job state file - /// @param [in] module_ret_errcode sub module training failure error code - /// @param [in] error_module error module identified by FMK - /// @param [in] job_sub_state detailed job state - /// @return void - /// - void WriteErrorCode(uint32_t module_ret_errcode, ErrorModule error_module, - JobSubState job_sub_state); - - /// - /// @brief Record errors that occurred durning the training - /// @param [in] module_ret_errcode sub module training failure error code - /// @param [in] error_module error module identified by FMK - /// @param [in] job_sub_state detailed job state - /// @return void - /// - void StoreInternalErrorCode(uint32_t module_ret_errcode, - ErrorModule error_module, - JobSubState job_sub_state); - - /// - /// @brief Update training error code in the job state file - /// @return void - /// - void WriteInternalErrorCode(); - - /// - /// @brief Update network connectivity detect file - /// @param [in] content network connectivity content - /// @return Status - /// - Status WriteHcomDetection(const std::string& content); - - private: - CsaInteract() - : dev_index_(0), - job_id_(0), - is_init_(false), - curr_state_(JOBSTATE_UNKOWN), - is_have_internal_error_(false) {} - - ~CsaInteract() {} - - CsaInteract(const CsaInteract&) = delete; - CsaInteract(CsaInteract&&) = delete; - CsaInteract& operator=(const CsaInteract&) = delete; - CsaInteract& operator=(CsaInteract&&) = delete; - - /// - /// @ingroup WriteFile - /// @brief Write the content into the file. 
If the file does not exist, create the file - /// @param [in] file_name: File name to be written - /// @param [in] content: Contents to be written - /// @return Status - /// - Status WriteFile(const std::string& file_name, const std::string& content); - - /// - /// @ingroup MakePath - /// @brief Verify whether the file path exists, if not, recursively create the folder - /// @param [in] file_name: File name to be verified - /// @return Status - /// - Status MakePath(const std::string& file_name); - - // device index - int32_t dev_index_; - // job id - int64_t job_id_; - // is initialization complete - bool is_init_; - // current job state - JobState curr_state_; - // job state file - std::string job_state_file_; - // network connectivity detect file - std::string hcom_detect_file_; - // identification of internal errors that occurred during the training - bool is_have_internal_error_; - // error code information - CsaErrorCode csa_error_code_; -}; -} // namespace ge - -#endif // GE_OMM_CSA_INTERACT_H_ - diff --git a/ge/proto/dump_task.proto b/ge/proto/dump_task.proto index ee1c6f47..a2411ddb 100644 --- a/ge/proto/dump_task.proto +++ b/ge/proto/dump_task.proto @@ -1,5 +1,5 @@ syntax = "proto3"; -package toolkit.dumpdata; +package toolkit.dump; enum OutputDataType { DT_UNDEFINED = 0; diff --git a/ge/proto/ge_ir.proto b/ge/proto/ge_ir.proto index 12989a54..c0ef3071 100644 --- a/ge/proto/ge_ir.proto +++ b/ge/proto/ge_ir.proto @@ -31,6 +31,8 @@ enum DataType DT_STRING_REF = 24; // string_ref type DT_DUAL = 25; /**< dual output type */ DT_VARIANT = 26; // variant type + DT_BF16 = 27; // bf16 type + DT_INT4 = 28; // int4 type } message AttrDef diff --git a/ge/proto/op_mapping_info.proto b/ge/proto/op_mapping.proto similarity index 97% rename from ge/proto/op_mapping_info.proto rename to ge/proto/op_mapping.proto index 7fb6f84b..d626eb49 100644 --- a/ge/proto/op_mapping_info.proto +++ b/ge/proto/op_mapping.proto @@ -1,5 +1,5 @@ syntax = "proto3"; -package aicpu.dump; +package toolkit.aicpu.dump; message Shape { repeated uint64 dim = 1; diff --git a/ge/session/omg.cc b/ge/session/omg.cc index 961bc8c7..ca5043b1 100755 --- a/ge/session/omg.cc +++ b/ge/session/omg.cc @@ -38,7 +38,7 @@ #include "graph/debug/ge_attr_define.h" #include "graph/optimize/common/params.h" #include "graph/utils/type_utils.h" -#include "ir_build/atc_ir_common.h" +#include "ir_build/option_utils.h" #include "omg/omg_inner_types.h" #include "omg/parser/model_parser.h" #include "omg/parser/parser_factory.h" @@ -86,7 +86,8 @@ static bool CheckInputTrueOrFalse(const std::string &s, const std::string &atc_p return true; } else { ErrorManager::GetInstance().ATCReportErrMessage("E10005", {"parameter", "value"}, {atc_param, s}); - GELOGE(PARAM_INVALID, "Input parameter[--%s]'s value[%s] must be true or false.", atc_param.c_str(), s.c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]Input parameter[--%s]'s value[%s] must be true or false.", + atc_param.c_str(), s.c_str()); return false; } } @@ -110,9 +111,8 @@ static Status CheckInputShapeNode(const ComputeGraphPtr &graph, bool is_dynamic_ GE_CHECK_NOTNULL(tensor_desc); for (auto dim : tensor_desc->GetShape().GetDims()) { if (dim < 0) { - GELOGE(PARAM_INVALID, - "Input op [%s] shape %ld is negative, maybe you should set input_shape to specify its shape", - node->GetName().c_str(), dim); + GELOGE(PARAM_INVALID, "[Check][Param]Input op [%s] shape %ld is negative, " + "maybe you should set input_shape to specify its shape", node->GetName().c_str(), dim); const string reason = "maybe you 
should set input_shape to specify its shape"; ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"}, {node->GetName(), to_string(dim), reason}); @@ -127,12 +127,14 @@ static Status CheckInputShapeNode(const ComputeGraphPtr &graph, bool is_dynamic_ ge::NodePtr node = graph->FindNode(node_name); if (node == nullptr) { ErrorManager::GetInstance().ATCReportErrMessage("E10016", {"parameter", "opname"}, {"input_shape", node_name}); - GELOGE(PARAM_INVALID, "Input parameter[--input_shape]'s opname[%s] is not exist in model", node_name.c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]Input parameter[--input_shape]'s opname[%s] is not exist in model", + node_name.c_str()); return PARAM_INVALID; } if (node->GetType() != DATA) { ErrorManager::GetInstance().ATCReportErrMessage("E10017", {"parameter", "opname"}, {"input_shape", node_name}); - GELOGE(PARAM_INVALID, "Input parameter[--input_shape]'s opname[%s] is not a input opname", node_name.c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]Input parameter[--input_shape]'s opname[%s] is not a input opname", + node_name.c_str()); return PARAM_INVALID; } } @@ -160,8 +162,8 @@ static Status CheckInputFp16Nodes(const ComputeGraphPtr &graph, const string &in for (auto &s : adjust_fp16_format_vec) { StringUtils::Trim(s); if (!CheckInputTrueOrFalse(s, "is_input_adjust_hw_layout")) { - GELOGE(PARAM_INVALID, "Invalid Param, is_input_adjust_hw_layout only support true/false: but is [%s]", - is_input_adjust_hw_layout.c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]Invalid Param, is_input_adjust_hw_layout only support true/false:" + "but is [%s]", is_input_adjust_hw_layout.c_str()); return PARAM_INVALID; } } @@ -176,7 +178,7 @@ static Status CheckInputFp16Nodes(const ComputeGraphPtr &graph, const string &in if (node == nullptr) { ErrorManager::GetInstance().ATCReportErrMessage("E10016", {"parameter", "opname"}, {"input_fp16_nodes", input_fp16_nodes_vec[i]}); - GELOGE(PARAM_INVALID, "Input parameter[--input_fp16_nodes]'s opname[%s] is not exist in model", + GELOGE(PARAM_INVALID, "[Check][Param]Input parameter[--input_fp16_nodes]'s opname[%s] is not exist in model", input_fp16_nodes_vec[i].c_str()); return PARAM_INVALID; } @@ -185,7 +187,7 @@ static Status CheckInputFp16Nodes(const ComputeGraphPtr &graph, const string &in if (op_desc->GetType() != DATA) { ErrorManager::GetInstance().ATCReportErrMessage("E10017", {"parameter", "opname"}, {"input_fp16_nodes", input_fp16_nodes_vec[i]}); - GELOGE(PARAM_INVALID, "Input parameter[--input_fp16_nodes]'s opname[%s] is not a input opname", + GELOGE(PARAM_INVALID, "[Check][Param]Input parameter[--input_fp16_nodes]'s opname[%s] is not a input opname", input_fp16_nodes_vec[i].c_str()); return PARAM_INVALID; } @@ -205,8 +207,8 @@ static Status ParseOutputFp16NodesFormat(const string &is_output_fp16) { for (auto &is_fp16 : node_format_vec) { StringUtils::Trim(is_fp16); if (!CheckInputTrueOrFalse(is_fp16, "is_output_adjust_hw_layout")) { - GELOGE(PARAM_INVALID, "Invalid Param, is_output_adjust_hw_layout only support true/false: but is [%s]", - is_output_fp16.c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]Invalid Param, is_output_adjust_hw_layout " + "only support true/false: but is [%s]", is_output_fp16.c_str()); return PARAM_INVALID; } if (is_fp16 == "false") { @@ -263,7 +265,8 @@ void FindParserSo(const string &path, vector &file_list, string &caffe_p Status SetOutFormatAndDataTypeAttr(ge::OpDescPtr op_desc, const ge::Format format, const ge::DataType data_type) { if (op_desc == nullptr) { - 
GELOGE(domi::FAILED, "Input op desc invalid."); + REPORT_INNER_ERROR("E19999", "param op_desc is nullptr, check invalid."); + GELOGE(domi::FAILED, "[Check][Param]Input op desc invalid."); return domi::FAILED; } (void)ge::AttrUtils::SetInt(op_desc, ATTR_NAME_NET_OUTPUT_FORMAT, format); @@ -274,7 +277,7 @@ Status SetOutFormatAndDataTypeAttr(ge::OpDescPtr op_desc, const ge::Format forma bool CheckDigitStr(std::string &str) { for (char c : str) { if (!isdigit(c)) { - GELOGE(domi::FAILED, "value[%s] is not positive integer", str.c_str()); + GELOGE(domi::FAILED, "[Check][Param]value[%s] is not positive integer", str.c_str()); return false; } } @@ -284,18 +287,18 @@ bool CheckDigitStr(std::string &str) { Status StringToInt(std::string &str, int32_t &value) { try { if (!CheckDigitStr(str)) { - GELOGE(PARAM_INVALID, "Invalid of digit string: %s ", str.c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]Invalid of digit string: %s ", str.c_str()); ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"}, {"--output_type", str, "is not positive integer"}); return PARAM_INVALID; } value = stoi(str); } catch (std::invalid_argument &) { - GELOGE(PARAM_INVALID, "Invalid of digit string: %s, catch invalid_argument.", str.c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]Invalid of digit string: %s, catch invalid_argument.", str.c_str()); ErrorManager::GetInstance().ATCReportErrMessage("E10014", {"parameter", "value"}, {"--output_type", str}); return PARAM_INVALID; } catch (std::out_of_range &) { - GELOGE(PARAM_INVALID, "Invalid of digit string: %s, catch out_of_range.", str.c_str()); + GELOGE(PARAM_INVALID, "[Check][Param]Invalid of digit string: %s, catch out_of_range.", str.c_str()); ErrorManager::GetInstance().ATCReportErrMessage("E10013", {"parameter", "value"}, {"--output_type", str}); return PARAM_INVALID; } @@ -314,7 +317,8 @@ Status VerifyOutputTypeAndOutNodes(std::vector &out_type_vec) { if (out_nodes_info.find(out_type_vec[i]) == out_nodes_info.end()) { ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"}, {"--output_type", out_type_vec[i], kOutputTypeError}); - GELOGE(domi::FAILED, "Invalid value for --output_type[%s], %s.", out_type_vec[i].c_str(), kOutputTypeError); + GELOGE(domi::FAILED, "[Check][Param]Invalid value for --output_type[%s], %s.", + out_type_vec[i].c_str(), kOutputTypeError); return domi::FAILED; } } @@ -326,7 +330,8 @@ Status CheckOutPutDataTypeSupport(const std::string &output_type) { if (it == output_type_str_to_datatype.end()) { ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"}, {"--output_type", output_type, kOutputTypeSupport}); - GELOGE(PARAM_INVALID, "Invalid value for --output_type[%s], %s.", output_type.c_str(), kOutputTypeSupport); + GELOGE(PARAM_INVALID, "[Check][Param]Invalid value for --output_type[%s], %s.", + output_type.c_str(), kOutputTypeSupport); return domi::FAILED; } return domi::SUCCESS; @@ -344,7 +349,7 @@ Status ParseOutputType(const std::string &output_type, std::mapsecond; @@ -383,7 +390,7 @@ Status CheckOutNode(ge::OpDescPtr op_desc, int32_t index) { int32_t out_size = op_desc->GetOutputsSize(); if (index < 0 || index >= out_size) { GELOGE(domi::FAILED, - "out_node [%s] output index:%d must be smaller " + "[Check][Param]out_node [%s] output index:%d must be smaller " "than node output size:%d and can not be negative!", op_desc->GetName().c_str(), index, out_size); std::string fail_reason = "output index:" + to_string(index) + " must be smaller 
than output size:" + @@ -403,7 +410,7 @@ Status GetDefaultOutInfo(ge::ComputeGraphPtr &compute_graph, if (out_node == nullptr) { ErrorManager::GetInstance().ATCReportErrMessage("E10016", {"parameter", "opname"}, {"out_nodes", default_out_nodes[i].first}); - GELOGE(domi::FAILED, "Can not find src node (%s) in graph.", default_out_nodes[i].first.c_str()); + GELOGE(domi::FAILED, "[Check][Param]Can not find src node (%s) in graph.", default_out_nodes[i].first.c_str()); return domi::FAILED; } output_nodes_info.push_back(std::make_pair(out_node, default_out_nodes[i].second)); @@ -432,7 +439,7 @@ Status SetOutputNodeInfo(ge::Graph &graph, const std::string &output_type, const std::map> output_node_dt_map; if (!output_type.empty()) { if (ParseOutputType(output_type, output_node_dt_map) != SUCCESS) { - GELOGE(domi::FAILED, "Parse output_type failed."); + GELOGE(domi::FAILED, "[Parse][output_type] failed."); return domi::FAILED; } } @@ -443,13 +450,13 @@ Status SetOutputNodeInfo(ge::Graph &graph, const std::string &output_type, const if (out_node == nullptr) { ErrorManager::GetInstance().ATCReportErrMessage("E10016", {"parameter", "opname"}, {"out_nodes", user_out_nodes[i].first}); - GELOGE(domi::FAILED, "Can not find src node (%s) in graph.", user_out_nodes[i].first.c_str()); + GELOGE(domi::FAILED, "[Check][Param]Can not find src node (%s) in graph.", user_out_nodes[i].first.c_str()); return domi::FAILED; } auto op_desc = out_node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); if (CheckOutNode(op_desc, user_out_nodes[i].second) != SUCCESS) { - GELOGE(domi::FAILED, "Check out node (%s) fail.", user_out_nodes[i].first.c_str()); + GELOGE(domi::FAILED, "[Check][OutNode] (%s) fail.", user_out_nodes[i].first.c_str()); return domi::FAILED; } @@ -475,7 +482,7 @@ Status SetOutputNodeInfo(ge::Graph &graph, const std::string &output_type, const // default output node (leaf) if (user_out_nodes.empty()) { if (GetDefaultOutInfo(compute_graph, output_nodes_info) != SUCCESS) { - GELOGE(domi::FAILED, "Get default output info failed."); + GELOGE(domi::FAILED, "[Get][DefaultOutInfo] failed."); return domi::FAILED; } } @@ -513,7 +520,8 @@ void GetOutputNodesNameAndIndex(std::vector> &ou Status GetOutputLeaf(NodePtr node, std::vector> &output_nodes_info) { ge::OpDescPtr tmpDescPtr = node->GetOpDesc(); if (tmpDescPtr == nullptr) { - GELOGE(domi::FAILED, "Get outnode op desc fail."); + REPORT_INNER_ERROR("E19999", "param node has no opdesc."); + GELOGE(domi::FAILED, "[Check][Param]Get outnode op desc fail."); return domi::FAILED; } size_t size = tmpDescPtr->GetOutputsSize(); @@ -527,7 +535,8 @@ Status GetOutputLeaf(NodePtr node, std::vector> for (auto in_anchor : in_anchors) { auto out_anchor = in_anchor->GetPeerOutAnchor(); if (out_anchor == nullptr) { - GELOGE(domi::FAILED, "Get leaf node op desc fail."); + REPORT_INNER_ERROR("E19999", "GetPeerOutAnchor return nullptr, node:%s.", node->GetName().c_str()); + GELOGE(domi::FAILED, "[Invoke][GetPeerOutAnchor]Get leaf node op desc fail."); return domi::FAILED; } auto out_node = out_anchor->GetOwnerNode(); @@ -557,8 +566,10 @@ Status InitDomiOmgContext(const string &input_shape, const string &input_format, if (iter != ge::input_format_str_to_geformat.end()) { domi::GetContext().format = iter->second; } else { - GELOGE(PARAM_INVALID, "Input format %s not support , expect ND/NCHW/NHWC/CHWN/NC1HWC0/NHWC1C0.", - input_format.c_str()); + REPORT_INNER_ERROR("E19999", "param input_format:%s is not support, " + "expect ND/NCHW/NHWC/CHWN/NC1HWC0/NHWC1C0.", input_format.c_str()); + 
GELOGE(PARAM_INVALID, "[Check][Param]Input format %s not support, " + "expect ND/NCHW/NHWC/CHWN/NC1HWC0/NHWC1C0.", input_format.c_str()); return PARAM_INVALID; } } @@ -572,9 +583,9 @@ Status InitDomiOmgContext(const string &input_shape, const string &input_format, map> &shape_map = domi::GetContext().input_dims; if (!ge::ParseInputShape(input_shape, domi::GetContext().input_dims, domi::GetContext().user_input_dims, - is_dynamic_input) || - shape_map.empty()) { - GELOGE(PARAM_INVALID, "Failed to parse input shape: %s", input_shape.c_str()); + is_dynamic_input) || shape_map.empty()) { + REPORT_CALL_ERROR("E19999", "ParseInputShape failed for %s", input_shape.c_str()); + GELOGE(PARAM_INVALID, "[Parse][InputShape] %s failed.", input_shape.c_str()); return PARAM_INVALID; } @@ -601,7 +612,7 @@ Status ParseOutNodes(const string &out_nodes) { "E10001", {"parameter", "value", "reason"}, {"--out_nodes", node, "the correct format is \"node_name1:0;node_name1:1;node_name2:0\""}); GELOGE(PARAM_INVALID, - "The input format of --out_nodes is invalid, the correct format is " + "[Parse][Param]The input format of --out_nodes is invalid, the correct format is " "\"node_name1:0;node_name1:1;node_name2:0\", while the actual input is %s.", node.c_str()); return PARAM_INVALID; @@ -609,15 +620,16 @@ Status ParseOutNodes(const string &out_nodes) { if (!domi::GetContext().user_out_nodes_top_vec.empty()) { ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"}, {"--out_nodes", out_nodes, "is not all index or top_name"}); - GELOGE(PARAM_INVALID, - "This out_nodes str must be all index or top_name, while the actual input is %s", out_nodes.c_str()); + GELOGE(PARAM_INVALID, "[Parse][Param]This out_nodes str must be all index or top_name, " + "while the actual input is %s", out_nodes.c_str()); return PARAM_INVALID; } // stoi: The method may throw an exception: invalid_argument/out_of_range if (!CheckDigitStr(key_value_v[1])) { ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"}, {"--out_nodes", out_nodes, "is not positive integer"}); - GELOGE(PARAM_INVALID, "This str must be digit string, while the actual input is %s", out_nodes.c_str()); + GELOGE(PARAM_INVALID, "[Parse][Param]This str must be digit string, while the actual input is %s", + out_nodes.c_str()); return PARAM_INVALID; } @@ -635,11 +647,11 @@ Status ParseOutNodes(const string &out_nodes) { } } } catch (std::invalid_argument &) { - GELOGE(PARAM_INVALID, "Invalid of out_nodes: %s ", out_nodes.c_str()); + GELOGE(PARAM_INVALID, "[Parse][Param]Invalid of out_nodes: %s ", out_nodes.c_str()); ErrorManager::GetInstance().ATCReportErrMessage("E10014", {"parameter", "value"}, {"--out_nodes", out_nodes}); return PARAM_INVALID; } catch (std::out_of_range &) { - GELOGE(PARAM_INVALID, "Invalid of out_nodes: %s ", out_nodes.c_str()); + GELOGE(PARAM_INVALID, "[Parse][Param]Invalid of out_nodes: %s ", out_nodes.c_str()); ErrorManager::GetInstance().ATCReportErrMessage("E10013", {"parameter", "value"}, {"--out_nodes", out_nodes}); return PARAM_INVALID; } @@ -657,7 +669,8 @@ static Status CheckOpNameMap(const ComputeGraphPtr &graph, const std::string &op for (const NodePtr &node : graph->GetAllNodes()) { auto op_desc = node->GetOpDesc(); if (op_desc == nullptr) { - GELOGE(PARAM_INVALID, "Invalid parameter for opDesc."); + REPORT_INNER_ERROR("E19999", "param graph's node has no opdesc."); + GELOGE(PARAM_INVALID, "[Check][Param]Invalid parameter for opDesc."); return PARAM_INVALID; } 
graphNodeTypes[op_desc->GetType()] = ""; @@ -666,7 +679,7 @@ static Status CheckOpNameMap(const ComputeGraphPtr &graph, const std::string &op if (propertiesMap.empty()) { ErrorManager::GetInstance().ATCReportErrMessage( "E10003", {"parameter", "value", "reason"}, {"op_name_map", op_conf, "the file content is empty"}); - GELOGE(PARAM_INVALID, "op_name_map file content is empty, please check file!"); + GELOGE(PARAM_INVALID, "[Check][Param]op_name_map file content is empty, please check file!"); return PARAM_INVALID; } for (auto iter = propertiesMap.begin(); iter != propertiesMap.end(); iter++) { @@ -674,7 +687,8 @@ static Status CheckOpNameMap(const ComputeGraphPtr &graph, const std::string &op ErrorManager::GetInstance().ATCReportErrMessage( "E10003", {"parameter", "value", "reason"}, {"op_name_map", op_conf, "type[" + iter->second + "] is not found in model"}); - GELOGE(PARAM_INVALID, "Invalid parameter for op_name_map."); return PARAM_INVALID;); + GELOGE(PARAM_INVALID, "[Find][NodeType]Invalid parameter for op_name_map."); + return PARAM_INVALID;); } return SUCCESS; } @@ -711,15 +725,16 @@ FMK_FUNC_HOST_VISIBILITY Status ParseGraph(ge::Graph &graph, const std::mapCreateModelParser(type); - GE_CHK_BOOL_RET_STATUS(model_parser != nullptr, FAILED, "ATC create model parser ret fail, type:%d.", type); - + if (model_parser == nullptr) { + REPORT_INNER_ERROR("E19999", "CreateModelParser failed, type:%d", type); + GELOGE(FAILED, "[Create][ModelParser] ret fail, type:%d.", type); + return FAILED; + } UpdateParserCtxWithOmgCtx(); Status ret = model_parser->Parse(model_file, graph); UpdateOmgCtxWithParserCtx(); @@ -749,7 +768,8 @@ FMK_FUNC_HOST_VISIBILITY Status ParseGraph(ge::Graph &graph, const std::map(model.model_data); model.model_data = nullptr; @@ -941,7 +963,7 @@ FMK_FUNC_HOST_VISIBILITY Status ConvertOm(const char *model_file, const char *js status = omFileLoadHelper.GetModelPartition(MODEL_DEF, ir_part); if (status != ge::GRAPH_SUCCESS) { ErrorManager::GetInstance().ATCReportErrMessage("E19021", {"reason"}, {"Get model part failed"}); - GELOGE(ge::FAILED, "Get model part failed."); + GELOGE(ge::FAILED, "[Get][ModelPartition] failed."); if (model.model_data != nullptr) { delete[] reinterpret_cast(model.model_data); model.model_data = nullptr; @@ -967,13 +989,13 @@ FMK_FUNC_HOST_VISIBILITY Status ConvertOm(const char *model_file, const char *js } else { ret = INTERNAL_ERROR; ErrorManager::GetInstance().ATCReportErrMessage("E19021", {"reason"}, {"ReadProtoFromArray failed"}); - GELOGE(ret, "ReadProtoFromArray failed."); + GELOGE(ret, "[Read][Proto]From Array failed."); } } else { ErrorManager::GetInstance().ATCReportErrMessage("E10003", {"parameter", "value", "reason"}, {"om", model_file, "invalid om file"}); GELOGE(ACL_ERROR_GE_PARAM_INVALID, - "ParseModelContent failed because of invalid om file. Please check --om param."); + "[Parse][ModelContent] failed because of invalid om file. 
Please check --om param."); } if (model.model_data != nullptr) { @@ -984,7 +1006,7 @@ FMK_FUNC_HOST_VISIBILITY Status ConvertOm(const char *model_file, const char *js } catch (const std::exception &e) { ErrorManager::GetInstance().ATCReportErrMessage("E19021", {"reason"}, {"Convert om model to json failed, exception message[" + std::string(e.what()) + "]"}); - GELOGE(FAILED, "Convert om model to json failed, exception message : %s.", e.what()); + GELOGE(FAILED, "[Save][Model]Convert om model to json failed, exception message : %s.", e.what()); return FAILED; } } @@ -1003,7 +1025,8 @@ FMK_FUNC_HOST_VISIBILITY Status ConvertPbtxtToJson(const char *model_file, const }; if (ret != SUCCESS) { free_model_data(&model.model_data); - GELOGE(ret, "LoadFromFile failed."); + REPORT_CALL_ERROR("E19999", "LoadFromFile failed."); + GELOGE(ret, "[Invoke][LoadFromFile] failed."); return ret; } @@ -1015,7 +1038,7 @@ FMK_FUNC_HOST_VISIBILITY Status ConvertPbtxtToJson(const char *model_file, const if (!flag) { free_model_data(&model.model_data); ErrorManager::GetInstance().ATCReportErrMessage("E19021", {"reason"}, {"ParseFromString failed"}); - GELOGE(FAILED, "ParseFromString failed."); + GELOGE(FAILED, "[Invoke][ParseFromString] failed."); return FAILED; } GetGroupName(model_def); @@ -1024,7 +1047,8 @@ FMK_FUNC_HOST_VISIBILITY Status ConvertPbtxtToJson(const char *model_file, const ret = ModelSaver::SaveJsonToFile(json_file, j); if (ret != SUCCESS) { free_model_data(&model.model_data); - GELOGE(ret, "Save json to file fail."); + REPORT_CALL_ERROR("E19999", "SaveJsonToFile failed."); + GELOGE(ret, "[Save][Json] to file fail."); return ret; } free_model_data(&model.model_data); @@ -1033,12 +1057,12 @@ FMK_FUNC_HOST_VISIBILITY Status ConvertPbtxtToJson(const char *model_file, const free_model_data(&model.model_data); ErrorManager::GetInstance().ATCReportErrMessage("E19021", {"reason"}, {"ParseFromString failed, exception message[" + std::string(e.what()) + "]"}); - GELOGE(FAILED, "ParseFromString failed. exception message : %s", e.what()); + GELOGE(FAILED, "[Invoke][ParseFromString] failed. 
exception message : %s", e.what()); return FAILED; } catch (const std::exception &e) { ErrorManager::GetInstance().ATCReportErrMessage("E19021", {"reason"}, {"Convert pbtxt to json failed, exception message[" + std::string(e.what()) + "]"}); - GELOGE(FAILED, "Convert pbtxt to json failed, exception message : %s.", e.what()); + GELOGE(FAILED, "[Save][pbtxt]Convert pbtxt to json failed, exception message : %s.", e.what()); return FAILED; } } @@ -1047,16 +1071,19 @@ FMK_FUNC_HOST_VISIBILITY Status ConvertFwkModelToJson(const domi::FrameworkType const char *json_file) { if (framework == domi::CAFFE || framework == domi::TENSORFLOW || framework == domi::ONNX) { auto model_parser = ModelParserFactory::Instance()->CreateModelParser(framework); - GE_CHK_BOOL_RET_STATUS(model_parser != nullptr, FAILED, "ATC create model parser ret fail, framework:%d.", - framework); + if (model_parser == nullptr) { + REPORT_INNER_ERROR("E19999", "CreateModelParser failed, framework:%d.", framework); + GELOGE(FAILED, "[Create][ModelParser] ret fail, framework:%d.", framework); + return FAILED; + } return model_parser->ToJson(model_file, json_file); } ErrorManager::GetInstance().ATCReportErrMessage( "E10001", {"parameter", "value", "reason"}, {"--framework", std::to_string(framework), "only support 0(Caffe) 3(TensorFlow) 5(Onnx)"}); - GELOGE(PARAM_INVALID, "Input parameter[--framework] is mandatory and it's value must be: 0(Caffe) 3(TensorFlow) " - "or 5(Onnx)."); + GELOGE(PARAM_INVALID, "[Check][Param]Input parameter[--framework] is mandatory " + "and it's value must be: 0(Caffe) 3(TensorFlow) or 5(Onnx)."); return PARAM_INVALID; } @@ -1072,7 +1099,8 @@ FMK_FUNC_HOST_VISIBILITY Status DumpInfershapeJson(const ge::Graph &graph, const if (buffer.GetData() != nullptr) { std::string str(reinterpret_cast(buffer.GetData()), buffer.GetSize()); if (!ge_proto.ParseFromString(str)) { - GELOGE(GRAPH_FAILED, "parse from string failed."); + REPORT_CALL_ERROR("E19999", "ParseFromString failed."); + GELOGE(GRAPH_FAILED, "[Invoke][ParseFromString] failed."); return FAILED; } diff --git a/ge/single_op/single_op_manager.cc b/ge/single_op/single_op_manager.cc index 667e987b..180b50c1 100644 --- a/ge/single_op/single_op_manager.cc +++ b/ge/single_op/single_op_manager.cc @@ -67,6 +67,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOpManager::Release std::lock_guard lock(mutex_); auto it = stream_resources_.find(resource_id); if (it == stream_resources_.end()) { + MemManager::Instance().CachingInstance(RT_MEMORY_HBM).TryFreeBlocks(); return SUCCESS; } delete it->second; diff --git a/inc/external/ge/ge_api_types.h b/inc/external/ge/ge_api_types.h index 5ae5f036..cef3fc42 100644 --- a/inc/external/ge/ge_api_types.h +++ b/inc/external/ge/ge_api_types.h @@ -166,6 +166,8 @@ const std::string COMPRESS_FLAG = "ge.compressFlag"; const std::string PRECISION_MODE = "ge.exec.precision_mode"; +const std::string TUNE_DEVICE_IDS = "ge.exec.tuneDeviceIds"; + // Configure single op flag for FE // its value should be "0" or "1", default value is "0" const std::string SINGLE_OP_FLAG = "ge.exec.single_op"; @@ -359,6 +361,7 @@ using RunAsyncCallback = std::function ir_builder_suppported_options = {INPUT_FORMAT, DYNAMIC_DIMS, INSERT_OP_FILE, PRECISION_MODE, + TUNE_DEVICE_IDS, EXEC_DISABLE_REUSED_MEMORY, AUTO_TUNE_MODE, OUTPUT_TYPE, @@ -434,6 +439,7 @@ const std::set global_options = {CORE_TYPE, ENABLE_COMPRESS_WEIGHT, COMPRESS_WEIGHT_CONF, PRECISION_MODE, + TUNE_DEVICE_IDS, EXEC_DISABLE_REUSED_MEMORY, AUTO_TUNE_MODE, ENABLE_SINGLE_STREAM, 
diff --git a/inc/framework/common/util.h b/inc/framework/common/util.h index 92cb8397..bd84d0ac 100644 --- a/inc/framework/common/util.h +++ b/inc/framework/common/util.h @@ -113,14 +113,13 @@ } while (0) // Check if the parameter is null. If yes, return PARAM_INVALID and record the error -#define GE_CHECK_NOTNULL(val) \ - do { \ - if (val == nullptr) { \ - REPORT_INNER_ERROR("E19999", "Param:%s is nullptr, check invalid when %s", \ - #val, __FUNCTION__); \ - DOMI_LOGE("[Check][Param:%s]null is invalid when %s.", #val, __FUNCTION__); \ - return ge::PARAM_INVALID; \ - } \ +#define GE_CHECK_NOTNULL(val) \ + do { \ + if (val == nullptr) { \ + REPORT_INNER_ERROR("E19999", "Param:%s is nullptr, check invalid", #val); \ + DOMI_LOGE("[Check][Param:%s]null is invalid.", #val); \ + return ge::PARAM_INVALID; \ + } \ } while (0) // Check if the parameter is null. If yes, just return and record the error diff --git a/metadef b/metadef index 1e88df1d..1c41e02f 160000 --- a/metadef +++ b/metadef @@ -1 +1 @@ -Subproject commit 1e88df1d6bfe60faae0aa9fa2d87f273b793aeb0 +Subproject commit 1c41e02f73b6e8f95369e052ee4de285145fb34f diff --git a/parser b/parser index df9abef6..dda9def8 160000 --- a/parser +++ b/parser @@ -1 +1 @@ -Subproject commit df9abef65f902f37ca664f6dda4c60727dac2aca +Subproject commit dda9def8bdb5177054a3abc132bf376172c3f01f diff --git a/tests/depends/error_manager/src/error_manager_stub.cc b/tests/depends/error_manager/src/error_manager_stub.cc index 5f62c91b..7ed8dbcb 100644 --- a/tests/depends/error_manager/src/error_manager_stub.cc +++ b/tests/depends/error_manager/src/error_manager_stub.cc @@ -16,16 +16,16 @@ #include "common/util/error_manager/error_manager.h" -using namespace ErrorMessage; +using namespace error_message; -namespace ErrorMessage { +thread_local Context ErrorManager::error_context_ = {0, "", "", ""}; + +namespace error_message { int FormatErrorMessage(char *str_dst, size_t dst_max, const char *format, ...) 
{ return 1; } } -thread_local Context ErrorManager::error_context_ = {0, "", "", ""}; - ErrorManager &ErrorManager::GetInstance() { static ErrorManager instance; return instance; @@ -98,11 +98,11 @@ thread_local Context ErrorManager::error_context_ = {0, "", "", ""}; const std::string &ErrorManager::GetLogHeader() { return error_context_.log_header; } - struct Context &ErrorManager::GetErrorContext() { - struct Context error_context; + struct error_message::Context &ErrorManager::GetErrorManagerContext() { + struct error_message::Context error_context; return error_context; } -void ErrorManager::SetErrorContext(struct Context error_context) {} +void ErrorManager::SetErrorContext(struct error_message::Context error_context) {} void ErrorManager::SetStage(const std::string &first_stage, const std::string &second_stage) {} diff --git a/tests/ut/common/graph/testcase/ge_graph/ge_def_type_unittest.cc b/tests/ut/common/graph/testcase/ge_graph/ge_def_type_unittest.cc index 10d6dc86..7dc124fe 100644 --- a/tests/ut/common/graph/testcase/ge_graph/ge_def_type_unittest.cc +++ b/tests/ut/common/graph/testcase/ge_graph/ge_def_type_unittest.cc @@ -32,14 +32,6 @@ class UtestGeTestDefType : public testing::Test { void TearDown() {} }; -TEST_F(UtestGeTestDefType, base) { - CompressInfo com1; - com1.set_blockrow(1); - int32_t a = com1.blockrow; - EXPECT_EQ(a, 1); - -} - TEST_F(UtestGeTestDefType, quant) { OpDescPtr desc_ptr1 = std::make_shared("name1", "type1"); EXPECT_EQ(desc_ptr1->AddInputDesc("x", GeTensorDesc(GeShape({1, 16, 16, 16}), FORMAT_NCHW)), GRAPH_SUCCESS); @@ -48,28 +40,4 @@ TEST_F(UtestGeTestDefType, quant) { EXPECT_EQ(OpDescUtils::HasQuantizeFactorParams(desc_ptr1), false); EXPECT_EQ(OpDescUtils::HasQuantizeFactorParams(*desc_ptr1), false); - QuantizeFactorParams q1; - EXPECT_EQ(q1.has_quantize_param(), false); - QuantizeFactor *qf1 = q1.mutable_quantize_param(); - EXPECT_EQ(q1.has_quantize_param(), true); - - string s1 = "value1"; - q1.quantize_param.set_scale_value(s1.data(), s1.size()); - EXPECT_EQ(OpDescUtils::SetQuantizeFactorParams(desc_ptr1, q1), GRAPH_SUCCESS); - QuantizeFactorParams q2; - EXPECT_EQ(OpDescUtils::GetQuantizeFactorParams(desc_ptr1, q2), GRAPH_SUCCESS); - string s2((char *)q2.quantize_param.scale_value.GetData(), q2.quantize_param.scale_value.GetSize()); - EXPECT_EQ(s2, "value1"); - - float f[2] = {1, 2}; - string s(static_cast(static_cast(f)), 2 * sizeof(float)); - q1.quantize_param.set_scale_value(f, 2 * sizeof(float)); - EXPECT_EQ(OpDescUtils::SetQuantizeFactorParams(*desc_ptr1, q1), GRAPH_SUCCESS); - QuantizeFactorParams q3; - EXPECT_EQ(OpDescUtils::GetQuantizeFactorParams(*desc_ptr1, q3), GRAPH_SUCCESS); - Buffer &b = q3.quantize_param.scale_value; - float f1[2]; - memcpy(f1, b.GetData(), b.GetSize()); - EXPECT_EQ(f1[0], 1); - EXPECT_EQ(f1[1], 2); } diff --git a/tests/ut/common/graph/testcase/ge_graph/ge_tensor_unittest.cc b/tests/ut/common/graph/testcase/ge_graph/ge_tensor_unittest.cc index 5c75bd01..aa43ac99 100644 --- a/tests/ut/common/graph/testcase/ge_graph/ge_tensor_unittest.cc +++ b/tests/ut/common/graph/testcase/ge_graph/ge_tensor_unittest.cc @@ -209,18 +209,6 @@ TEST_F(UtestGeTensor, test_tensor_desc_invalid_null) { tensor_desc2.SetDataType(DT_DUAL_SUB_INT8); EXPECT_EQ(tensor_desc2.GetDataType(), DT_DUAL_SUB_INT8); - CompressInfo info; - EXPECT_EQ(TensorUtils::GetCmpsInfo(tensor_desc2, info), GRAPH_FAILED); - TensorUtils::SetCmpsInfo(tensor_desc2, info); - EXPECT_EQ(TensorUtils::GetCmpsInfo(tensor_desc2, info), GRAPH_SUCCESS); - - AllOffsetQuantizeInfo 
quantize_info; - EXPECT_FALSE(TensorUtils::HasAlloffsetQuantizeInfo(tensor_desc2)); - EXPECT_EQ(TensorUtils::GetAlloffsetQuantizeInfo(tensor_desc2, quantize_info), GRAPH_FAILED); - TensorUtils::SetAlloffsetQuantizeInfo(tensor_desc2, quantize_info); - EXPECT_EQ(TensorUtils::GetAlloffsetQuantizeInfo(tensor_desc2, quantize_info), GRAPH_SUCCESS); - EXPECT_TRUE(TensorUtils::HasAlloffsetQuantizeInfo(tensor_desc2)); - TensorUtils::SetWeightSize(tensor_desc, 100); EXPECT_EQ(TensorUtils::GetWeightSize(tensor_desc), 0); } diff --git a/tests/ut/ge/CMakeLists.txt b/tests/ut/ge/CMakeLists.txt index 75985e4c..12b329d7 100755 --- a/tests/ut/ge/CMakeLists.txt +++ b/tests/ut/ge/CMakeLists.txt @@ -24,7 +24,7 @@ set(PROTO_LIST "${GE_CODE_DIR}/metadef/proto/insert_op.proto" "${GE_CODE_DIR}/metadef/proto/dump_task.proto" "${GE_CODE_DIR}/metadef/proto/fwk_adapter.proto" - "${GE_CODE_DIR}/metadef/proto/op_mapping_info.proto" + "${GE_CODE_DIR}/metadef/proto/op_mapping.proto" "${GE_CODE_DIR}/metadef/proto/optimizer_priority.proto" "${GE_CODE_DIR}/metadef/proto/ge_api.proto" "${GE_CODE_DIR}/metadef/proto/tensorflow/attr_value.proto" @@ -166,6 +166,7 @@ set(COMMON_SRC_FILES "${GE_CODE_DIR}/ge/common/dump/dump_properties.cc" "${GE_CODE_DIR}/ge/common/helper/model_helper.cc" "${GE_CODE_DIR}/ge/common/dump/dump_manager.cc" + "${GE_CODE_DIR}/ge/common/dump/exception_dumper.cc" "${GE_CODE_DIR}/ge/common/dump/opdebug_register.cc" "${GE_CODE_DIR}/ge/common/dump/dump_op.cc" "${GE_CODE_DIR}/ge/common/helper/om_file_helper.cc" @@ -286,7 +287,6 @@ set(COMMON_SRC_FILES "${GE_CODE_DIR}/ge/graph/load/model_manager/zero_copy_task.cc" "${GE_CODE_DIR}/ge/graph/load/model_manager/cpu_queue_schedule.cc" "${GE_CODE_DIR}/ge/graph/load/model_manager/aipp_utils.cc" - "${GE_CODE_DIR}/ge/omm/csa_interact.cc" "${GE_CODE_DIR}/ge/graph/load/model_manager/tbe_handle_store.cc" "${GE_CODE_DIR}/ge/common/kernel_store.cc" "${GE_CODE_DIR}/ge/common/tbe_kernel_store.cc" @@ -307,7 +307,7 @@ set(COMMON_SRC_FILES "${GE_CODE_DIR}/ge/graph/partition/stage_partition.cc" "${GE_CODE_DIR}/ge/graph/partition/dynamic_shape_partition.cc" "${GE_CODE_DIR}/ge/graph/optimize/summary_optimize.cc" - "${GE_CODE_DIR}/ge/ir_build/atc_ir_common.cc" + "${GE_CODE_DIR}/ge/ir_build/option_utils.cc" "${GE_CODE_DIR}/ge/graph/preprocess/insert_op/ge_aipp_op.cc" "${GE_CODE_DIR}/ge/graph/preprocess/multi_batch_options.cc" "${GE_CODE_DIR}/ge/graph/build/model_builder.cc" @@ -390,7 +390,6 @@ set(GRAPH_PARTITION_COMMON_SRC_FILES set(GRAPH_LOAD_COMMON_SRC_FILES "${GE_CODE_DIR}/ge/graph/load/graph_loader.cc" "${GE_CODE_DIR}/ge/graph/manager/graph_manager_utils.cc" - "${GE_CODE_DIR}/ge/omm/csa_interact.cc" "${GE_CODE_DIR}/ge/graph/manager/graph_mem_allocator.cc" "${GE_CODE_DIR}/ge/graph/manager/graph_var_manager.cc" "${GE_CODE_DIR}/ge/graph/manager/trans_var_data_utils.cc" @@ -593,6 +592,7 @@ set(SINGLE_OP_SRC_FILES "${GE_CODE_DIR}/ge/hybrid/executor/hybrid_model_executor.cc" "${GE_CODE_DIR}/ge/hybrid/executor/hybrid_model_async_executor.cc" "${GE_CODE_DIR}/ge/hybrid/executor/hybrid_execution_context.cc" + "${GE_CODE_DIR}/ge/hybrid/executor/hybrid_model_pipeline_executor.cc" "${GE_CODE_DIR}/ge/hybrid/executor/subgraph_context.cc" "${GE_CODE_DIR}/ge/hybrid/executor/subgraph_executor.cc" "${GE_CODE_DIR}/ge/hybrid/executor/worker/task_compile_engine.cc" @@ -709,6 +709,7 @@ set(PASS_TEST_FILES "graph/passes/buffer_pool_memory_pass_unittest.cc" "graph/passes/mark_node_unknown_shape_pass_unittest.cc" "graph/passes/reshape_recovery_pass_unittest.cc" + 
"graph/passes/cast_remove_pass_unittest.cc" ) set(KERNEL_TEST_FILES @@ -755,6 +756,7 @@ set(MULTI_PARTS_TEST_FILES "common/datatype_transfer_unittest.cc" "common/dump_manager_unittest.cc" "common/dump_op_unittest.cc" + "common/dump_exception_unittest.cc" "common/opdebug_register_unittest.cc" "common/format_transfer_unittest.cc" "common/format_transfer_transpose_unittest.cc" @@ -780,10 +782,12 @@ set(MULTI_PARTS_TEST_FILES "graph/build/mem_assigner_unittest.cc" "graph/build/task_generator_unittest.cc" "graph/build/buffer_pool_mem_assigner_unittest.cc" + "graph/execute/graph_execute_unittest.cc" "graph/preprocess/graph_preprocess_unittest.cc" "graph/manager/hcom_util_unittest.cc" "graph/manager/graph_caching_allocator_unittest.cc" "graph/partition/dynamic_shape_partition_unittest.cc" + "graph/manager/graph_manager_unittest.cc" "session/omg_omg_unittest.cc" ) @@ -810,6 +814,7 @@ set(PROFILING_MNG_TEST_FILES set(HYBRID_TEST_FILES "hybrid/ge_hybrid_unittest.cc" "hybrid/known_node_executor_unittest.cc" + "hybrid/executor/worker/execution_engine_unittest.cc" ) set(OTHERS_TEST_FILES diff --git a/tests/ut/ge/common/dump_exception_unittest.cc b/tests/ut/ge/common/dump_exception_unittest.cc new file mode 100644 index 00000000..339d532e --- /dev/null +++ b/tests/ut/ge/common/dump_exception_unittest.cc @@ -0,0 +1,54 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <gtest/gtest.h>
+
+#define protected public
+#define private public
+#include "common/dump/exception_dumper.h"
+#include "common/debug/log.h"
+#include "common/ge_inner_error_codes.h"
+#undef private
+#undef protected
+
+namespace ge {
+class UTEST_dump_exception : public testing::Test {
+ protected:
+  void SetUp() {}
+  void TearDown() {}
+};
+
+TEST_F(UTEST_dump_exception, save_dump_op_info_success) {
+  OpDescPtr op_desc = std::make_shared<OpDesc>("GatherV2", "GatherV2");
+  uint32_t task_id = 1;
+  uint32_t stream_id = 233;
+  vector<void *> input_addr;
+  vector<void *> output_addr;
+  ExceptionDumper exception_dumper;
+  exception_dumper.SaveDumpOpInfo(op_desc, task_id, stream_id, input_addr, output_addr);
+}
+
+TEST_F(UTEST_dump_exception, dump_exception_info) {
+  rtExceptionInfo exception_info = {1, 2, 3, 4, 5};
+  std::vector<rtExceptionInfo> exception_infos = { exception_info };
+  OpDescInfo op_desc_info = {"Save", "Save", 1, 2, {FORMAT_NCHW}, {{1}}, {DT_FLOAT}, {}, {2},
+                             {FORMAT_NCHW}, {{1}}, {DT_FLOAT}, {}, {2}};
+
+  ExceptionDumper exception_dumper;
+  exception_dumper.op_desc_info_ = { op_desc_info };
+  exception_dumper.DumpExceptionInfo(exception_infos);
+}
+} // namespace ge
\ No newline at end of file
diff --git a/tests/ut/ge/common/format_transfer_unittest.cc b/tests/ut/ge/common/format_transfer_unittest.cc
index 73b7703d..04817d29 100644
--- a/tests/ut/ge/common/format_transfer_unittest.cc
+++ b/tests/ut/ge/common/format_transfer_unittest.cc
@@ -51,35 +51,5 @@ TEST_F(UtestFormatTransfer, build_unsupported_transfer) {
   auto transfer2 = BuildFormatTransfer(args2);
   EXPECT_EQ(transfer2, nullptr);
 }
-
-TEST_F(UtestFormatTransfer, get_size_by_data_type) {
-  EXPECT_EQ(GetSizeByDataType(DT_FLOAT), 4);
-  EXPECT_EQ(GetSizeByDataType(DT_FLOAT16), 2);
-  EXPECT_EQ(GetSizeByDataType(DT_INT8), 1);
-  EXPECT_EQ(GetSizeByDataType(DT_INT16), 2);
-  EXPECT_EQ(GetSizeByDataType(DT_UINT16), 2);
-  EXPECT_EQ(GetSizeByDataType(DT_UINT8), 1);
-  EXPECT_EQ(GetSizeByDataType(DT_INT32), 4);
-  EXPECT_EQ(GetSizeByDataType(DT_INT64), 8);
-  EXPECT_EQ(GetSizeByDataType(DT_UINT32), 4);
-  EXPECT_EQ(GetSizeByDataType(DT_UINT64), 8);
-  EXPECT_EQ(GetSizeByDataType(DT_BOOL), 1);
-  EXPECT_EQ(GetSizeByDataType(DT_DOUBLE), 8);
-  EXPECT_EQ(GetSizeByDataType(DT_STRING), -1);
-  EXPECT_EQ(GetSizeByDataType(DT_DUAL_SUB_INT8), 1);
-  EXPECT_EQ(GetSizeByDataType(DT_DUAL_SUB_UINT8), 1);
-  EXPECT_EQ(GetSizeByDataType(DT_COMPLEX64), 8);
-  EXPECT_EQ(GetSizeByDataType(DT_COMPLEX128), 16);
-  EXPECT_EQ(GetSizeByDataType(DT_QINT8), 1);
-  EXPECT_EQ(GetSizeByDataType(DT_QINT16), 2);
-  EXPECT_EQ(GetSizeByDataType(DT_QINT32), 4);
-  EXPECT_EQ(GetSizeByDataType(DT_QUINT8), 1);
-  EXPECT_EQ(GetSizeByDataType(DT_QUINT16), 2);
-  EXPECT_EQ(GetSizeByDataType(DT_RESOURCE), 8);
-  EXPECT_EQ(GetSizeByDataType(DT_STRING_REF), -1);
-  EXPECT_EQ(GetSizeByDataType(DT_DUAL), 5);
-  EXPECT_EQ(GetSizeByDataType(DT_UNDEFINED), -1);
-  EXPECT_EQ(DT_UNDEFINED, 28);
-}
 } // namespace formats
 } // namespace ge
diff --git a/tests/ut/ge/graph/build/mem_assigner_unittest.cc b/tests/ut/ge/graph/build/mem_assigner_unittest.cc
index ba5cdcd4..c883e87f 100644
--- a/tests/ut/ge/graph/build/mem_assigner_unittest.cc
+++ b/tests/ut/ge/graph/build/mem_assigner_unittest.cc
@@ -191,6 +191,30 @@ class UtestMemoryAssignerTest : public testing::Test {
     return builder.GetGraph();
   }
+  ComputeGraphPtr MakeRefNodeGraph() {
+    ge::ut::GraphBuilder builder("graph");
+    auto var_input = builder.AddNode("var", "Variable", 1, 1);
+    auto const_input = builder.AddNode("const", "Const", 1, 1);
+    auto assign = builder.AddNode("assgin",
"Assign", 2, 1); + // add link + builder.AddDataEdge(var_input, 0, assign, 0); + builder.AddDataEdge(const_input, 0, assign, 1); + // set offset + assign->GetOpDesc()->SetInputOffset({100, 0}); + assign->GetOpDesc()->SetOutputOffset({10000}); + var_input->GetOpDesc()->SetOutputOffset({10000}); + const_input->GetOpDesc()->SetOutputOffset({1000}); + // set mem type + ge::AttrUtils::SetListInt(assign->GetOpDesc(), ATTR_NAME_INPUT_MEM_TYPE_LIST, {RT_MEMORY_HBM, RT_MEMORY_L1}); + // set ref + auto output_tensordesc = assign->GetOpDesc()->MutableOutputDesc(0); + ge::TensorUtils::SetReuseInput(*output_tensordesc, true); + uint32_t reuse_input_index = 0; + ge::TensorUtils::SetReuseInputIndex(*output_tensordesc, reuse_input_index); + + return builder.GetGraph(); + } + protected: void SetUp() {} @@ -298,4 +322,42 @@ TEST_F(UtestMemoryAssignerTest, graph_memory_assign_ref_var_not_found) { size_t zero_memory_size = 0; VarManager::Instance(0)->Init(0, 0, 0, 0); EXPECT_NE(memory_assigner.AssignMemory(false, mem_offset, zero_memory_size), GRAPH_SUCCESS); -} \ No newline at end of file +} + +TEST_F(UtestMemoryAssignerTest, graph_memory_assign_set_input_offset) { + ge::ComputeGraphPtr graph = MakeRefNodeGraph(); + auto assgin = graph->FindNode("assgin"); + EXPECT_EQ(assgin->GetOpDesc()->GetOutputOffset()[0], 10000); + EXPECT_EQ(assgin->GetOpDesc()->GetInputOffset()[0], 100); + EXPECT_EQ(assgin->GetOpDesc()->GetInputOffset()[1], 0); + GraphMemoryAssigner memoryAssigner(graph); + MemoryOffset memory_offset(RT_MEMORY_HBM, 0); + memoryAssigner.memory_offset_.emplace(RT_MEMORY_HBM, memory_offset); + EXPECT_EQ(memoryAssigner.SetInputOffset(), GRAPH_SUCCESS); + EXPECT_EQ(assgin->GetOpDesc()->GetOutputOffset()[0], 10100); + EXPECT_EQ(assgin->GetOpDesc()->GetInputOffset()[0], 10100); + EXPECT_EQ(assgin->GetOpDesc()->GetInputOffset()[1], 0); + EXPECT_EQ(memoryAssigner.CheckOffset(), GRAPH_SUCCESS); +} + +TEST_F(UtestMemoryAssignerTest, graph_memory_assign_update_ref_op_offset_reverse) { + ge::ut::GraphBuilder builder("graph"); + auto data_input = builder.AddNode("data", "Data", 1, 1); + auto const_input = builder.AddNode("const", "Const", 1, 1); + auto add = builder.AddNode("add", "Add", 2, 1); + // add link + builder.AddDataEdge(data_input, 0, add, 0); + builder.AddDataEdge(const_input, 0, add, 1); + // set ref + uint32_t reuse_input_index = 0; + auto output_tensordesc = data_input->GetOpDesc()->MutableOutputDesc(0); + ge::TensorUtils::SetReuseInput(*output_tensordesc, true); + ge::TensorUtils::SetReuseInputIndex(*output_tensordesc, reuse_input_index); + auto output_tensordesc1 = add->GetOpDesc()->MutableOutputDesc(0); + ge::TensorUtils::SetReuseInput(*output_tensordesc1, true); + ge::TensorUtils::SetReuseInputIndex(*output_tensordesc1, reuse_input_index); + ge::ComputeGraphPtr graph = builder.GetGraph(); + + GraphMemoryAssigner memoryAssigner(graph); + EXPECT_EQ(memoryAssigner.UpdateRefOpOffsetReverse(add), SUCCESS); +} diff --git a/tests/ut/ge/graph/execute/graph_execute_unittest.cc b/tests/ut/ge/graph/execute/graph_execute_unittest.cc new file mode 100644 index 00000000..e340df2f --- /dev/null +++ b/tests/ut/ge/graph/execute/graph_execute_unittest.cc @@ -0,0 +1,128 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include
+
+#define protected public
+#define private public
+#include "graph/execute/graph_execute.h"
+#include "graph/load/model_manager/model_manager.h"
+#include "graph/load/model_manager/davinci_model.h"
+#undef private
+#undef public
+
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace std;
+using namespace testing;
+using namespace ge;
+using namespace domi;
+
+namespace ge {
+namespace {
+const uint32_t kInvalidModelId = UINT32_MAX;
+}
+
+class UtestGraphExecuteTest : public testing::Test {
+ protected:
+  void SetUp() {}
+
+  void TearDown() {}
+};
+
+TEST_F(UtestGraphExecuteTest, get_execute_model_id_invalid) {
+  GraphExecutor executor;
+  ComputeGraphPtr graph = MakeShared<ComputeGraph>("test");
+  GeRootModelPtr ge_root_model = MakeShared<GeRootModel>(graph);
+  auto model_id = executor.GetExecuteModelId(ge_root_model);
+  EXPECT_EQ(model_id, kInvalidModelId);
+}
+
+TEST_F(UtestGraphExecuteTest, get_execute_model_id_1) {
+  GraphExecutor executor;
+  ComputeGraphPtr graph = MakeShared<ComputeGraph>("test");
+  GeRootModelPtr ge_root_model = MakeShared<GeRootModel>(graph);
+  auto model_manager = ModelManager::GetInstance();
+  shared_ptr<DavinciModel> davinci_model1 = MakeShared<DavinciModel>(1, nullptr);
+  davinci_model1->SetId(1);
+  model_manager->InsertModel(1, davinci_model1);
+  ge_root_model->SetModelId(1);
+  auto model_id = executor.GetExecuteModelId(ge_root_model);
+  EXPECT_EQ(model_id, 1);
+}
+
+TEST_F(UtestGraphExecuteTest, get_execute_model_id_2) {
+  GraphExecutor executor;
+  ComputeGraphPtr graph = MakeShared<ComputeGraph>("test");
+  GeRootModelPtr ge_root_model = MakeShared<GeRootModel>(graph);
+  auto model_manager = ModelManager::GetInstance();
+  // model1 with 2 load
+  shared_ptr<DavinciModel> davinci_model1 = MakeShared<DavinciModel>(1, nullptr);
+  davinci_model1->SetId(1);
+  davinci_model1->data_inputer_ = new DataInputer();
+  auto data = MakeShared<InputDataWrapper>();
+  davinci_model1->data_inputer_->Push(data);
+  davinci_model1->data_inputer_->Push(data);
+  model_manager->InsertModel(1, davinci_model1);
+  // model 2 with 3 load
+  shared_ptr<DavinciModel> davinci_model2 = MakeShared<DavinciModel>(1, nullptr);
+  davinci_model2->SetId(2);
+  davinci_model2->data_inputer_ = new DataInputer();
+  davinci_model2->data_inputer_->Push(data);
+  davinci_model2->data_inputer_->Push(data);
+  davinci_model2->data_inputer_->Push(data);
+  model_manager->InsertModel(2, davinci_model2);
+  // model 3 with 1 load
+  shared_ptr<DavinciModel> davinci_model3 = MakeShared<DavinciModel>(1, nullptr);
+  davinci_model3->SetId(3);
+  davinci_model3->data_inputer_ = new DataInputer();
+  davinci_model3->data_inputer_->Push(data);
+  model_manager->InsertModel(3, davinci_model3);
+
+  ge_root_model->SetModelId(1);
+  ge_root_model->SetModelId(2);
+  ge_root_model->SetModelId(3);
+
+  auto model_id = executor.GetExecuteModelId(ge_root_model);
+  // model 3 is picked for having least loads
+  EXPECT_EQ(model_id, 3);
+}
+
+TEST_F(UtestGraphExecuteTest, test_set_callback) {
+  GraphExecutor executor;
+  ComputeGraphPtr graph = MakeShared<ComputeGraph>("test");
+  // is_unknown_shape_graph_ = false
+  GeRootModelPtr ge_root_model = MakeShared<GeRootModel>(graph);
+  RunAsyncCallback callback = [](Status, std::vector<ge::Tensor> &) {};
+
+  auto model_manager =
ModelManager::GetInstance(); + auto listener = MakeShared(); + shared_ptr davinci_model1 = MakeShared(1, listener); + davinci_model1->SetId(1); + model_manager->InsertModel(1, davinci_model1); + auto status = executor.SetCallback(1, ge_root_model, callback); + EXPECT_EQ(status, SUCCESS); +} +} // namespace ge \ No newline at end of file diff --git a/tests/ut/ge/graph/load/data_dumper_unittest.cc b/tests/ut/ge/graph/load/data_dumper_unittest.cc index 68040bf1..1eb9eb49 100644 --- a/tests/ut/ge/graph/load/data_dumper_unittest.cc +++ b/tests/ut/ge/graph/load/data_dumper_unittest.cc @@ -38,29 +38,53 @@ std::vector stub_get_output_addrs(const RuntimeParam &model_param, Const res.emplace_back(reinterpret_cast(23333)); return res; } -/* -TEST_F(UtestDataDumper, LoadDumpInfo_no_output_addrs_fail) { + +static ge::OpDescPtr CreateOpDesc(string name = "", string type = "") { + auto op_desc = std::make_shared(name, type); + op_desc->SetStreamId(0); + op_desc->SetId(0); + + op_desc->SetWorkspace({}); + op_desc->SetWorkspaceBytes({}); + op_desc->SetInputOffset({}); + op_desc->SetOutputOffset({100, 200}); + return op_desc; +} + +TEST_F(UtestDataDumper, LoadDumpInfo_success) { RuntimeParam rts_param; - DataDumper data_dumper(rts_param); + DataDumper data_dumper(&rts_param); data_dumper.SetModelName("test"); data_dumper.SetModelId(2333); std::shared_ptr op_desc_1(new OpDesc()); op_desc_1->AddOutputDesc("test", GeTensorDesc()); data_dumper.SaveDumpTask(0, 0, op_desc_1, 0); string dump_mode = "output"; + data_dumper.is_op_debug_ = true; data_dumper.dump_properties_.SetDumpMode(dump_mode); - Status ret = data_dumper.LoadDumpInfo(); - EXPECT_EQ(ret, SUCCESS); + EXPECT_EQ(data_dumper.LoadDumpInfo(), SUCCESS); + EXPECT_EQ(data_dumper.UnloadDumpInfo(), SUCCESS); } -*/ -TEST_F(UtestDataDumper, UnloadDumpInfo_success) { +TEST_F(UtestDataDumper, DumpOutputWithTask_success) { RuntimeParam rts_param; DataDumper data_dumper(&rts_param); data_dumper.SetModelName("test"); data_dumper.SetModelId(2333); - Status ret = data_dumper.UnloadDumpInfo(); + toolkit::aicpu::dump::Task task; + OpDescPtr op_desc = CreateOpDesc("conv", CONVOLUTION); + GeTensorDesc tensor_0(GeShape(), FORMAT_NCHW, DT_FLOAT); + GeTensorDesc tensor_1(GeShape(), FORMAT_NCHW, DT_FLOAT); + int32_t calc_type = 1; + ge::AttrUtils::SetInt(tensor_1, ATTR_NAME_MEMORY_SIZE_CALC_TYPE, calc_type); + op_desc->AddOutputDesc(tensor_0); + op_desc->AddOutputDesc(tensor_1); + DataDumper::InnerDumpInfo inner_dump_info; + inner_dump_info.op = op_desc; + Status ret = data_dumper.DumpOutputWithTask(inner_dump_info, task); EXPECT_EQ(ret, SUCCESS); + int64_t task_size = 1; + data_dumper.GenerateOpBuffer(task_size, task); } } // namespace ge diff --git a/tests/ut/ge/graph/load/davinci_model_unittest.cc b/tests/ut/ge/graph/load/davinci_model_unittest.cc index 0cf0f5cb..56a91ef8 100644 --- a/tests/ut/ge/graph/load/davinci_model_unittest.cc +++ b/tests/ut/ge/graph/load/davinci_model_unittest.cc @@ -1034,4 +1034,16 @@ TEST_F(UtestDavinciModel, NnExecute) { model.task_list_.resize(1); EXPECT_EQ(model.NnExecute(stream, false, input_data, output_data), SUCCESS); } +TEST_F(UtestDavinciModel, update_io_addr_success) { + DavinciModel model(0, nullptr); + uint32_t task_id = 1; + uint32_t stream_id = 2; + model.fixed_mem_base_ = 0x22; + model.mem_base_ = reinterpret_cast(&task_id); + OpDescInfo op_desc_info = {"Save", "Save", 1, 2, {FORMAT_NCHW}, {{1}}, {DT_FLOAT}, {nullptr}, {2}, + {FORMAT_NCHW}, {{1}}, {DT_FLOAT}, {nullptr}, {2}}; + model.exception_dumper_.op_desc_info_ = { op_desc_info 
}; + vector io_addr = {nullptr, nullptr}; + model.UpdateOpIOAddrs(task_id, stream_id, io_addr); +} } // namespace ge diff --git a/tests/ut/ge/graph/manager/graph_manager_unittest.cc b/tests/ut/ge/graph/manager/graph_manager_unittest.cc new file mode 100644 index 00000000..79beb02d --- /dev/null +++ b/tests/ut/ge/graph/manager/graph_manager_unittest.cc @@ -0,0 +1,405 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#define protected public +#define private public +#include "graph/manager/graph_manager.h" +#include "graph/load/model_manager/model_manager.h" +#include "graph/load/model_manager/davinci_model.h" +#define const +#include "common/helper/model_cache_helper.h" +#undef const +#include "init/gelib.h" +#undef private +#undef public + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/math/math_util.h" +#include "common/thread_pool.h" +#include "common/dump/dump_manager.h" +#include "analyzer/analyzer.h" +#include "graph/common/ge_call_wrapper.h" +#include "graph/common/local_context.h" +#include "graph/common/transop_util.h" +#include "graph/ge_context.h" +#include "graph/ge_global_options.h" +#include "graph/manager/util/rt_context_util.h" +#include "graph/partition/dynamic_shape_partition.h" +#include "graph/passes/enter_pass.h" +#include "graph/partition/stage_partition.h" +#include "graph/passes/addn_pass.h" +#include "graph/passes/bitcast_pass.h" +#include "graph/passes/assign_remove_pass.h" +#include "graph/passes/inplace_support_check_pass.h" +#include "graph/passes/atomic_addr_clean_pass.h" +#include "graph/passes/attach_stream_label_pass.h" +#include "graph/passes/cast_remove_pass.h" +#include "graph/passes/common_subexpression_elimination_pass.h" +#include "graph/passes/compile_nodes_pass.h" +#include "graph/passes/cond_remove_pass.h" +#include "graph/passes/constant_folding_pass.h" +#include "graph/passes/constant_fuse_same_pass.h" +#include "graph/passes/control_trigger_pass.h" +#include "graph/passes/ctrl_edge_transfer_pass.h" +#include "graph/passes/dimension_adjust_pass.h" +#include "graph/passes/dimension_compute_pass.h" +#include "graph/passes/flow_ctrl_pass.h" +#include "graph/passes/fuse_data_nodes_with_common_input_pass.h" +#include "graph/passes/identity_pass.h" +#include "graph/passes/input_output_connection_identify_pass.h" +#include "graph/passes/iterator_op_pass.h" +#include "graph/passes/link_gen_mask_nodes_pass.h" +#include "graph/passes/mark_graph_unknown_status_pass.h" +#include "graph/passes/merge_pass.h" +#include "graph/passes/merge_input_memcpy_pass.h" +#include "graph/passes/merge_to_stream_merge_pass.h" +#include "graph/passes/multi_batch_pass.h" +#include "graph/passes/next_iteration_pass.h" +#include "graph/passes/permute_pass.h" +#include "graph/passes/prune_pass.h" +#include "graph/passes/ref_identity_delete_op_pass.h" +#include "graph/passes/remove_same_const_pass.h" +#include 
"graph/passes/reshape_recovery_pass.h" +#include "graph/passes/reshape_remove_pass.h" +#include "graph/passes/same_transdata_breadth_fusion_pass.h" +#include "graph/passes/subgraph_pass.h" +#include "graph/passes/switch_data_edges_bypass.h" +#include "graph/passes/switch_dead_branch_elimination.h" +#include "graph/passes/switch_logic_remove_pass.h" +#include "graph/passes/switch_to_stream_switch_pass.h" +#include "graph/passes/transop_breadth_fusion_pass.h" +#include "graph/passes/transop_nearby_allreduce_fusion_pass.h" +#include "graph/passes/transop_symmetry_elimination_pass.h" +#include "graph/passes/transop_without_reshape_fusion_pass.h" +#include "graph/passes/transpose_transdata_pass.h" +#include "graph/passes/useless_control_out_remove_pass.h" +#include "graph/passes/variable_op_pass.h" +#include "graph/passes/variable_ref_delete_op_pass.h" +#include "graph/passes/variable_ref_useless_control_out_delete_pass.h" +#include "graph/passes/end_of_sequence_add_control_pass.h" +#include "graph/passes/subexpression_migration_pass.h" +#include "graph/passes/subgraph_const_migration_pass.h" +#include "graph/passes/unused_args_clean_pass.h" +#include "graph/passes/global_step_insert_pass.h" +#include "graph/passes/memcpy_addr_async_pass.h" +#include "graph/passes/hccl_continuous_memcpy_pass.h" +#include "graph/build/label_allocator.h" +#include "graph/utils/tensor_adapter.h" +#include "inc/pass_manager.h" +#include "ir_build/option_utils.h" +#include "graph/common/local_context.h" +#include "graph/common/omg_util.h" +#include "common/formats/utils/formats_trans_utils.h" +#include "register/custom_pass_helper.h" +#include "graph/ops_stub.h" + +using namespace std; +using namespace testing; +using namespace ge; +using namespace domi; + +namespace { +const uint32_t kNotAdded = 0; +const uint32_t kStartAdd = 1; +const uint32_t kDoneAdded = 2; +} +class UtestGraphManagerTest : public testing::Test { + protected: + void SetUp() {} + + void TearDown() {} +}; + +void CreateGraph(Graph &graph) { + TensorDesc desc(ge::Shape({1, 3, 224, 224})); + uint32_t size = desc.GetShape().GetShapeSize(); + desc.SetSize(size); + auto data = op::Data("Data").set_attr_index(0); + data.update_input_desc_data(desc); + data.update_output_desc_out(desc); + + auto flatten = op::Flatten("Flatten").set_input_x(data, data.name_out_out()); + + std::vector inputs{data}; + std::vector outputs{flatten}; + std::vector targets{flatten}; + // Graph graph("test_graph"); + graph.SetInputs(inputs).SetOutputs(outputs).SetTargets(targets); +} + +TEST_F(UtestGraphManagerTest, set_and_get_add_graph_flag) { + GraphId graph_id = 1; + GraphManager graph_manager; + graph_manager.SetAddGraphCondition(graph_id, 1); + uint32_t res = graph_manager.GetAddGraphCondition(graph_id); + EXPECT_EQ(res, 1); +} + +TEST_F(UtestGraphManagerTest, test_add_graph_1) { + GraphId graph_id = 1; + GraphManager graph_manager; + // create graph + Graph graph("test_graph"); + CreateGraph(graph); + + std::map options; + OmgContext context; + Status status = graph_manager.AddGraph(graph_id, graph, options, context); + EXPECT_EQ(status, ge::SUCCESS); +} + +TEST_F(UtestGraphManagerTest, test_add_graph_2) { + GraphId graph_id = 1; + GraphManager graph_manager; + GraphNodePtr graph_node = MakeShared(graph_id); + graph_manager.AddGraphNode(graph_id, graph_node); + graph_manager.SetAddGraphCondition(graph_id, kDoneAdded); + Graph graph("test_graph"); + CreateGraph(graph); + std::map options; + OmgContext context; + Status status = graph_manager.AddGraph(graph_id, graph, 
options, context); + EXPECT_EQ(status, ge::SUCCESS); +} + +TEST_F(UtestGraphManagerTest, test_add_graph_3) { + GraphId graph_id = 1; + GraphManager graph_manager; + Graph graph("test_graph"); + CreateGraph(graph); + + std::map options; + OmgContext context; + + std::future fut1 = std::async(std::launch::async, + &GraphManager::AddGraph, &graph_manager, graph_id, graph, options, context); + std::future fut2 = std::async(std::launch::async, + &GraphManager::AddGraph, &graph_manager, graph_id, graph, options, context); + fut1.wait(); + fut2.wait(); + Status status1 = fut1.get(); + Status status2 = fut2.get(); + EXPECT_EQ(status1, ge::SUCCESS); + EXPECT_EQ(status2, ge::SUCCESS); +} + +TEST_F(UtestGraphManagerTest, test_remove_graph_1) { + GraphId graph_id = 1; + GraphManager graph_manager; + GraphNodePtr graph_node = MakeShared(graph_id); + Status status = graph_manager.RemoveGraph(graph_id); + EXPECT_EQ(status, ge::GE_GRAPH_GRAPH_NOT_EXIST); + graph_manager.AddGraphNode(graph_id, graph_node); + graph_node->SetRunFlag(true); + status = graph_manager.RemoveGraph(graph_id); + EXPECT_EQ(status, ge::SUCCESS); +} + +TEST_F(UtestGraphManagerTest, test_remove_graph_2) { + GraphId graph_id = 1; + GraphManager graph_manager; + GraphNodePtr graph_node = MakeShared(graph_id); + Graph graph("test_graph"); + CreateGraph(graph); + auto compute_graph = GraphUtils::GetComputeGraph(graph); + GeRootModelPtr ge_root_model = MakeShared(compute_graph); + auto model_manager = ModelManager::GetInstance(); + auto listener = MakeShared(); + shared_ptr davinci_model1 = MakeShared(1, listener); + davinci_model1->SetId(1); + shared_ptr davinci_model2 = MakeShared(2, listener); + davinci_model1->SetId(2); + model_manager->InsertModel(1, davinci_model1); + model_manager->InsertModel(2, davinci_model2); + ge_root_model->SetModelId(1); + ge_root_model->SetModelId(2); + graph_node->SetGeRootModel(ge_root_model); + graph_node->SetLoadFlag(true); + graph_manager.AddGraphNode(graph_id, graph_node); + Status status = graph_manager.RemoveGraph(graph_id); + EXPECT_EQ(status, ge::SUCCESS); +} + +TEST_F(UtestGraphManagerTest, test_pre_run_thread) { + + GraphManager graph_manager; + graph_manager.thread_run_flag_ = true; + + GraphId graph_id = 1; + std::vector input_tensor; + uint64_t session_id = 0; + error_message::Context error_context; + GEThreadLocalContext context; + RunAsyncCallback callback; + // PreRunArgs args{graph_id, input_tensor, session_id, error_context, context, callback}; + bool ret = graph_manager.prerun_args_q_.Push({graph_id, input_tensor, session_id, error_context, context, callback}); + EXPECT_EQ(ret, true); + + GraphNodePtr graph_node = MakeShared(graph_id); + graph_manager.AddGraphNode(graph_id, graph_node); + graph_manager.PreRunThread(&graph_manager); + // end with failed +} + +TEST_F(UtestGraphManagerTest, test_pre_run_thread_2) { + + GraphManager graph_manager; + graph_manager.thread_run_flag_ = true; + + GraphId graph_id = 1; + GraphNodePtr graph_node_1 = MakeShared(graph_id); + graph_manager.AddGraphNode(graph_id, graph_node_1); + graph_manager.IncreaseGraphCount(graph_id); + graph_manager.IncreaseGraphCount(graph_id); + graph_node_1->SetBuildFlag(true); + std::vector input_tensor; + uint64_t session_id = 0; + error_message::Context error_context; + GEThreadLocalContext context; + RunAsyncCallback callback; + // PreRunArgs args{graph_id, input_tensor, session_id, error_context, context, callback}; + bool ret = graph_manager.prerun_args_q_.Push({graph_id, input_tensor, session_id, error_context, context, 
callback}); + EXPECT_EQ(ret, true); + graph_id = 2; + GraphNodePtr graph_node_2 = MakeShared(graph_id); + graph_manager.AddGraphNode(graph_id, graph_node_2); + ret = graph_manager.prerun_args_q_.Push({graph_id, input_tensor, session_id, error_context, context, callback}); + EXPECT_EQ(ret, true); + graph_manager.PreRunThread(&graph_manager); + // end with failed +} + +TEST_F(UtestGraphManagerTest, test_check_and_release_memory) { + + GraphManager graph_manager; + GeModelPtr ge_model = make_shared(); + int64_t memory_size = 25 * 1024UL * 1024UL * 1024UL; + int64_t weight_size = 25 * 1024UL * 1024UL * 1024UL; + uint64_t session_id = 0; + ge::AttrUtils::SetInt(ge_model, ATTR_MODEL_MEMORY_SIZE, memory_size); + ge::AttrUtils::SetInt(ge_model, ATTR_MODEL_WEIGHT_SIZE, weight_size); + ge::AttrUtils::SetInt(ge_model, MODEL_ATTR_SESSION_ID, session_id); + + + GraphId graph_id = 1; + GraphNodePtr graph_node = MakeShared(graph_id); + graph_manager.AddGraphNode(graph_id, graph_node); + graph_manager.IncreaseGraphCount(graph_id); + graph_manager.IncreaseGraphCount(graph_id); + + auto model_manager = ModelManager::GetInstance(); + auto listener = MakeShared(); + shared_ptr davinci_model1 = MakeShared(1, listener); + davinci_model1->SetId(1); + shared_ptr davinci_model2 = MakeShared(2, listener); + davinci_model1->SetId(2); + model_manager->InsertModel(1, davinci_model1); + model_manager->InsertModel(2, davinci_model2); + ComputeGraphPtr compute_graph = MakeShared("test_graph"); + bool is_dynamic_shape = false; + (void)AttrUtils::GetBool(compute_graph, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, is_dynamic_shape); + GeRootModelPtr ge_root_model = MakeShared(compute_graph); + ge_root_model->SetModelId(1); + ge_root_model->SetModelId(2); + graph_node->SetGeRootModel(ge_root_model); + graph_node->SetLoadFlag(true); + Status status = graph_manager.CheckAndReleaseMemory(ge_model, graph_node); + EXPECT_EQ(status, ge::SUCCESS); +} + +TEST_F(UtestGraphManagerTest, test_check_incre_build_and_pre_run_1) { + // no need to build + GraphId graph_id = 1; + GraphManager graph_manager; + ComputeGraphPtr compute_graph = MakeShared("test_graph"); + GeRootModelPtr ge_root_model = MakeShared(compute_graph); + GraphManager::PreRunArgs arg; + GraphNodePtr graph_node = MakeShared(graph_id); + graph_node->SetBuildFlag(true); + Status status = graph_manager.CheckIncreBuildAndPreRun(&graph_manager, arg, graph_node, ge_root_model); + EXPECT_EQ(status, ge::SUCCESS); +} + +TEST_F(UtestGraphManagerTest, test_check_incre_build_and_pre_run_2) { + // need build while buildflag is true, var format changed + GraphId graph_id = 1; + GraphManager graph_manager; + ComputeGraphPtr compute_graph = MakeShared("test_graph"); + GeRootModelPtr ge_root_model = MakeShared(compute_graph); + GraphManager::PreRunArgs arg; + arg.callback = [](Status, std::vector &) {}; + GraphNodePtr graph_node = MakeShared(graph_id); + graph_node->SetBuildFlag(true); + graph_node->Lock(); + graph_manager.var_acc_ctrl_.graph_ids_need_rebuild_.insert(graph_id); + Status status = graph_manager.CheckIncreBuildAndPreRun(&graph_manager, arg, graph_node, ge_root_model); + EXPECT_EQ(status, ge::PARAM_INVALID); +} + +TEST_F(UtestGraphManagerTest, test_check_incre_build_and_pre_run_3) { + // need build while buildflag is false, var format unchanged + GraphId graph_id = 1; + GraphManager graph_manager; + ComputeGraphPtr compute_graph = MakeShared("test_graph"); + GeRootModelPtr ge_root_model = MakeShared(compute_graph); + GraphManager::PreRunArgs arg; + arg.callback = [](Status, 
std::vector &) {}; + GraphNodePtr graph_node = MakeShared(graph_id); + graph_node->SetBuildFlag(false); + graph_node->Lock(); + Status status = graph_manager.CheckIncreBuildAndPreRun(&graph_manager, arg, graph_node, ge_root_model); + EXPECT_NE(status, ge::SUCCESS); +} + +TEST_F(UtestGraphManagerTest, test_add_graph_with_copy_success) { + GraphId graph_id = 1; + GraphManager graph_manager; + // create graph + ComputeGraphPtr compute_graph = MakeShared("test_graph"); + Graph graph = GraphUtils::CreateGraphFromComputeGraph(compute_graph); + + std::map options; + OmgContext context; + Status status = graph_manager.AddGraphWithCopy(graph_id, graph, options, context); + EXPECT_EQ(status, ge::SUCCESS); +} + +TEST_F(UtestGraphManagerTest, test_add_graph_with_copy_fail) { + GraphId graph_id = 1; + GraphManager graph_manager; + // create graph + ComputeGraphPtr compute_graph = MakeShared("test_graph"); + Graph graph = GraphUtils::CreateGraphFromComputeGraph(compute_graph); + + std::map options; + OmgContext context; + Status status = graph_manager.AddGraph(graph_id, graph, options, context); + EXPECT_EQ(status, ge::SUCCESS); + status = graph_manager.RemoveGraph(graph_id); + EXPECT_EQ(status, ge::SUCCESS); + status = graph_manager.AddGraphWithCopy(graph_id, graph, options, context); + EXPECT_NE(status, ge::SUCCESS); +} diff --git a/tests/ut/ge/graph/passes/cast_remove_pass_unittest.cc b/tests/ut/ge/graph/passes/cast_remove_pass_unittest.cc new file mode 100644 index 00000000..6831d451 --- /dev/null +++ b/tests/ut/ge/graph/passes/cast_remove_pass_unittest.cc @@ -0,0 +1,117 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#define protected public +#define private public +#include "graph/passes/cast_remove_pass.h" +#undef protected +#undef private + +#include "anchor.h" +#include "common/debug/log.h" +#include "common/debug/memory_dumper.h" +#include "common/op/attr_value_util.h" +#include "common/types.h" +#include "framework/common/ge_inner_error_codes.h" +#include "graph/attr_value.h" +#include "graph/debug/ge_attr_define.h" +#include "inc/pass_manager.h" +#include "graph_builder_utils.h" +#include +#include +#include +#include "opskernel_manager/ops_kernel_manager.h" +#include "omg/omg_inner_types.h" + + +using namespace testing; +using namespace ge; +using namespace std; + +class UtestGraphPassesCastRemovePass : public testing::Test { + protected: + void SetUp() {} + + void TearDown() {} +}; + +// case1:no net_out_put_node +// TEST_F(UtestGraphPassesCastRemovePass, DoFuseProcess) { +// std::vector nodes_to_fuse; + +// auto builder = ut::GraphBuilder("g1"); +// auto data = builder.AddNode("data", DATA, 1, 1); +// auto cast1 = builder.AddNode("cast1", CAST, 1, 1); +// cast1->GetOpDesc()->MutableOutputDesc(0)->SetDataType(DT_FLOAT16); +// auto trans = builder.AddNode("trans", TRANSPOSE, 1, 1, FORMAT_NCHW, DT_FLOAT16); +// auto cast2 = builder.AddNode("cast2", CAST, 1, 1); +// cast2->GetOpDesc()->MutableInputDesc(0)->SetDataType(DT_FLOAT16); +// auto net = builder.AddNode("netout", NETOUTPUT, 1, 1); + +// builder.AddDataEdge(data, 0, cast1, 0); +// builder.AddDataEdge(cast1, 0, trans, 0); +// builder.AddDataEdge(trans, 0, cast2, 0); +// builder.AddDataEdge(cast2, 0, net, 0); +// ComputeGraphPtr compute_graph = builder.GetGraph(); + +// map options; + +// CastRemovePass cast_remove_pass; +// DataType type = DT_FLOAT; +// nodes_to_fuse.emplace_back(cast1); +// nodes_to_fuse.emplace_back(trans); +// nodes_to_fuse.emplace_back(cast2); +// OpsKernelManager ops_kernel_manager; +// cast_remove_pass.DoFuse(ops_kernel_manager, type, nodes_to_fuse); +// EXPECT_EQ(compute_graph->GetAllNodesSize(),5); +// std::vector to_be_deleted_cast_index; +// to_be_deleted_cast_index.emplace_back(0); +// to_be_deleted_cast_index.emplace_back(2); +// (void)cast_remove_pass.DoRemoveCast(to_be_deleted_cast_index, nodes_to_fuse); +// EXPECT_EQ(compute_graph->GetAllNodesSize(),3); +// } + +TEST_F(UtestGraphPassesCastRemovePass, DoFuseProcess) { + std::vector nodes_to_fuse; + + auto builder = ut::GraphBuilder("g1"); + auto data = builder.AddNode("data", DATA, 1, 1); + auto cast1 = builder.AddNode("cast1", CAST, 1, 1); + cast1->GetOpDesc()->MutableOutputDesc(0)->SetDataType(DT_FLOAT16); + auto trans = builder.AddNode("trans", TRANSPOSE, 1, 1, FORMAT_NCHW, DT_FLOAT16); + auto cast2 = builder.AddNode("cast2", CAST, 1, 1); + cast2->GetOpDesc()->MutableInputDesc(0)->SetDataType(DT_FLOAT16); + auto net = builder.AddNode("netout", NETOUTPUT, 1, 1); + + builder.AddDataEdge(data, 0, cast1, 0); + builder.AddDataEdge(cast1, 0, trans, 0); + builder.AddDataEdge(trans, 0, cast2, 0); + builder.AddDataEdge(cast2, 0, net, 0); + ComputeGraphPtr compute_graph = builder.GetGraph(); + + map options; + + CastRemovePass cast_remove_pass; + DataType type = DT_FLOAT; + nodes_to_fuse.emplace_back(cast1); + nodes_to_fuse.emplace_back(trans); + nodes_to_fuse.emplace_back(cast2); + cast_remove_pass.RemoveCast(type, nodes_to_fuse); + EXPECT_EQ(compute_graph->GetAllNodesSize(),3); +} diff --git a/tests/ut/ge/graph/passes/dimension_adjust_pass_unittest.cc b/tests/ut/ge/graph/passes/dimension_adjust_pass_unittest.cc index 79e34a60..41ea5828 
100644 --- a/tests/ut/ge/graph/passes/dimension_adjust_pass_unittest.cc +++ b/tests/ut/ge/graph/passes/dimension_adjust_pass_unittest.cc @@ -28,6 +28,7 @@ #include "graph/types.h" #include "graph/utils/graph_utils.h" #include "graph/utils/op_desc_utils.h" +#include "inc/kernel.h" #include "inc/kernel_factory.h" #undef protected #undef private @@ -37,11 +38,27 @@ using namespace testing; namespace ge { +class TestExpandDimKernel : public Kernel { + public: + Status Compute(const NodePtr &node_ptr) override { + return SUCCESS; + } +}; +REGISTER_KERNEL(EXPANDDIMS, TestExpandDimKernel); +class TestExpandDimKernelNotChange : public Kernel { + public: + Status Compute(const NodePtr &node_ptr) override { + return NOT_CHANGED; + } +}; + class UtestGraphPassesDimensionAdjustPass : public testing::Test { protected: void SetUp() {} - void TearDown() {} + void TearDown() { + KernelFactory::Instance().creator_map_.clear(); + } }; TEST_F(UtestGraphPassesDimensionAdjustPass, succ) { @@ -96,8 +113,11 @@ TEST_F(UtestGraphPassesDimensionAdjustPass, succ) { GraphUtils::AddEdge(op_node->GetOutDataAnchor(0), netoutput_node->GetInDataAnchor(0)); std::shared_ptr pass = make_shared(); + NamesToPass names_to_passes; + EXPECT_EQ(4, graph->GetDirectNodesSize()); ge::Status ret = pass->Run(op_node); EXPECT_EQ(SUCCESS, ret); + EXPECT_EQ(2, op_node->GetOwnerComputeGraph()->GetDirectNodesSize()); } TEST_F(UtestGraphPassesDimensionAdjustPass, input_node_is_nullptr) { diff --git a/tests/ut/ge/graph/passes/link_gen_mask_nodes_pass_unittest.cc b/tests/ut/ge/graph/passes/link_gen_mask_nodes_pass_unittest.cc index 511ddece..716cc91d 100644 --- a/tests/ut/ge/graph/passes/link_gen_mask_nodes_pass_unittest.cc +++ b/tests/ut/ge/graph/passes/link_gen_mask_nodes_pass_unittest.cc @@ -43,7 +43,7 @@ ut::GraphBuilder Graph1Builder() { ut::GraphBuilder builder = ut::GraphBuilder("g1"); auto const1 = builder.AddNode("const1", "Const", 0, 1); auto const2 = builder.AddNode("const2", "Const", 0, 1); - auto gen_mask1 = builder.AddNode("gen_mask1", "DropOutGenMask", 2, 1); + auto gen_mask1 = builder.AddNode("gen_mask1_DropOutGenMask", "DropOutGenMask", 2, 1); auto gen_mask2 = builder.AddNode("gen_mask2", "DropOutGenMaskV3", 2, 1); auto gen_mask3 = builder.AddNode("gen_mask3", "DropOutGenMaskV3D", 2, 1); auto do_mask1 = builder.AddNode("do_mask1", "DropOutDoMask", 3, 1); @@ -106,6 +106,6 @@ TEST_F(UtestLinkGenMaskNodesPass, link_gen_mask_nodes_pass_success) { auto out_ctrl_nodes = gen_mask2->GetOutControlNodes(); EXPECT_EQ(out_ctrl_nodes.size(), 1); auto out_ctrl_node = out_ctrl_nodes.at(0); - EXPECT_EQ(out_ctrl_node->GetName(), "gen_mask1"); + EXPECT_EQ(out_ctrl_node->GetName(), "gen_mask1_DropOutGenMask"); } } // namespace ge diff --git a/tests/ut/ge/graph/passes/net_output_pass_unittest.cc b/tests/ut/ge/graph/passes/net_output_pass_unittest.cc index 031985f3..ac6cd63a 100644 --- a/tests/ut/ge/graph/passes/net_output_pass_unittest.cc +++ b/tests/ut/ge/graph/passes/net_output_pass_unittest.cc @@ -631,6 +631,23 @@ TEST_F(UtestGraphPassesNetOutputPass, no_output_no_target_no_retval_success) { EXPECT_EQ(status, ge::SUCCESS); } +TEST_F(UtestGraphPassesNetOutputPass, no_output_no_target_no_retval_no_outnodes_success) { + ge::ComputeGraphPtr compute_graph = build_graph(); + + ge::PassManager pass_managers; + pass_managers.AddPass("", new (std::nothrow) NetOutputPass); + Status status = pass_managers.Run(compute_graph); + EXPECT_EQ(status, ge::SUCCESS); + + NodePtr net_out_node = compute_graph->FindNode(NODE_NAME_NET_OUTPUT); + 
EXPECT_NE(net_out_node, nullptr); + EXPECT_EQ(net_out_node->GetInControlNodes().size(), 2); + + int stream_label = -1; + EXPECT_TRUE(ge::AttrUtils::GetInt(net_out_node->GetOpDesc(), ATTR_NAME_TRUE_BRANCH_STREAM, stream_label)); + EXPECT_EQ(stream_label, 0); +} + TEST_F(UtestGraphPassesNetOutputPass, user_out_node_success) { ge::ComputeGraphPtr compute_graph = build_graph(); diff --git a/tests/ut/ge/graph_ir/ge_ir_build_unittest.cc b/tests/ut/ge/graph_ir/ge_ir_build_unittest.cc index dd6b1881..ec7b9488 100644 --- a/tests/ut/ge/graph_ir/ge_ir_build_unittest.cc +++ b/tests/ut/ge/graph_ir/ge_ir_build_unittest.cc @@ -15,8 +15,9 @@ */ #include -#include "ir_build/atc_ir_common.h" +#include "ir_build/option_utils.h" #include "graph/testcase/ge_graph/graph_builder_utils.h" +#include "graph/debug/ge_attr_define.h" #define protected public #define private public @@ -68,6 +69,20 @@ TEST(UtestIrCommon, update_data_op_shape) { EXPECT_EQ(ret, ge::SUCCESS); } +TEST(UtestIrCommon, update_data_op_shape_range) { + ge::OpDescPtr op_desc = CreateOpDesc("Data", "Data"); + std::vector>> index_shape_range_map; + + std::pair range_pair(1, 2); + vector> range_pair_tmp = { range_pair }; + + index_shape_range_map.push_back(range_pair_tmp); + + AttrUtils::SetInt(op_desc, ATTR_NAME_INDEX, 0); + Status ret = UpdateDataOpShapeRange(op_desc, index_shape_range_map); + EXPECT_EQ(ret, ge::SUCCESS); +} + TEST(UtestIrCommon, update_dynamic_shape_range_success) { ComputeGraphPtr graph = BuildComputeGraph(); std::string input_shape_range = "input1:[1, 2~3, -1];input2:[3~5, 10]"; @@ -108,3 +123,107 @@ TEST(UtestIrCommon, update_dynamic_shape_range_failed) { ret = UpdateDynamicInputShapeRange(graph, input_shape_range); EXPECT_EQ(ret, ge::PARAM_INVALID); } + +TEST(UtestIrCommon, check_dynamic_image_size_fail) { + map> shape_map; + shape_map["input1"] = {8, 3, -1, -1}; + string input_format = "NCHW"; + string dynamic_image_size = "@64,64;128,128;"; + + bool ret = CheckDynamicImagesizeInputShapeValid(shape_map, input_format, dynamic_image_size); + EXPECT_EQ(ret, false); +} + +TEST(UtestIrCommon, check_input_format_failed) { + std::string format = "invalid"; + Status ret = CheckInputFormat(format); + EXPECT_EQ(ret, ge::PARAM_INVALID); +} + +TEST(UtestIrCommon, check_dynamic_batch_size_input_shape_succ) { + map> shape_map; + shape_map.insert(std::pair>("data", {-1, 2, 3})); + std::string dynamic_batch_size = "11"; + + bool ret = CheckDynamicBatchSizeInputShapeValid(shape_map, dynamic_batch_size); + EXPECT_EQ(ret, true); +} + +TEST(UtestIrCommon, check_dynamic_images_size_input_shape_succ) { + map> shape_map; + shape_map.insert(std::pair>("data", {4, -1, -1, 5})); + std::string input_format = "NCHW"; + std::string dynamic_image_size = "4,5"; + + Status ret = CheckDynamicImagesizeInputShapeValid(shape_map, input_format, dynamic_image_size); + EXPECT_EQ(ret, ge::SUCCESS); +} + +TEST(UtestIrCommon, check_dynamic_input_param_succ) { + string dynamic_batch_size = "1"; + string dynamic_image_size; + string dynamic_dims; + string input_shape = "data:-1,3,244,244"; + string input_shape_range; + string input_format = "NCHW"; + bool is_dynamic_input = false; + + Status ret = CheckDynamicInputParamValid(dynamic_batch_size, dynamic_image_size, dynamic_dims, + input_shape, input_shape_range, input_format,is_dynamic_input); + EXPECT_EQ(ret, ge::SUCCESS); +} + +TEST(UtestIrCommon, check_dynamic_input_param_failed) { + string dynamic_batch_size = "1"; + string dynamic_image_size; + string dynamic_dims; + string input_shape = "data:1,3,244,244"; + 
string input_shape_range; + string input_format = "NCHW"; + bool is_dynamic_input = false; + + Status ret = CheckDynamicInputParamValid(dynamic_batch_size, dynamic_image_size, dynamic_dims, + input_shape, input_shape_range, input_format,is_dynamic_input); + EXPECT_EQ(ret, ge::PARAM_INVALID); +} + +TEST(UtestIrCommon, check_compress_weight) { + std::string enable_compress_weight = "true"; + std::string compress_weight_conf="./"; + Status ret = CheckCompressWeightParamValid(enable_compress_weight, compress_weight_conf); + EXPECT_EQ(ret, PARAM_INVALID); + + enable_compress_weight = "yes"; + compress_weight_conf = "./"; + ret = CheckCompressWeightParamValid(enable_compress_weight, compress_weight_conf); + EXPECT_EQ(ret, PARAM_INVALID); +} + +TEST(UtestIrCommon, check_param_failed) { + std::string param_invalid = "invalid"; + + Status ret = CheckOutputTypeParamValid(param_invalid); + EXPECT_EQ(ret, PARAM_INVALID); + + ret = CheckBufferOptimizeParamValid(param_invalid); + EXPECT_EQ(ret, PARAM_INVALID); + + ret = CheckKeepTypeParamValid(param_invalid); + EXPECT_EQ(ret, PARAM_INVALID); + + ret = CheckInsertOpConfParamValid(param_invalid); + EXPECT_EQ(ret, PARAM_INVALID); + + ret = CheckDisableReuseMemoryParamValid(param_invalid); + EXPECT_EQ(ret, PARAM_INVALID); + + ret = CheckEnableSingleStreamParamValid(param_invalid); + EXPECT_EQ(ret, PARAM_INVALID); + + std::string optypelist_for_implmode; + std::string op_select_implmode = "1"; + ret = CheckImplmodeParamValid(optypelist_for_implmode, op_select_implmode); + EXPECT_EQ(ret, PARAM_INVALID); + + ret = CheckLogParamValidAndSetLogLevel(param_invalid); +} diff --git a/tests/ut/ge/hybrid/executor/worker/execution_engine_unittest.cc b/tests/ut/ge/hybrid/executor/worker/execution_engine_unittest.cc new file mode 100644 index 00000000..5fa0d22c --- /dev/null +++ b/tests/ut/ge/hybrid/executor/worker/execution_engine_unittest.cc @@ -0,0 +1,119 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <gtest/gtest.h>
+#include
+#include
+#include "runtime/rt.h"
+
+#define protected public
+#define private public
+#include "hybrid/model/hybrid_model.h"
+#include "hybrid/node_executor/node_executor.h"
+#include "hybrid/executor/hybrid_execution_context.h"
+#include "hybrid/executor/hybrid_model_executor.h"
+#include "hybrid/executor/worker/execution_engine.h"
+#undef private
+#undef protected
+
+using namespace std;
+using namespace testing;
+using namespace ge;
+using namespace hybrid;
+
+
+class UtestExecutionEngine : public testing::Test {
+ protected:
+  void SetUp() {}
+
+  void TearDown() {
+  }
+};
+namespace {
+const int kIntBase = 10;
+}
+static ge::OpDescPtr CreateOpDesc(string name = "", string type = "") {
+  auto op_desc = std::make_shared<OpDesc>(name, type);
+  op_desc->SetStreamId(0);
+  op_desc->SetId(0);
+  op_desc->SetWorkspace({});
+  op_desc->SetWorkspaceBytes({});
+  op_desc->SetInputOffset({});
+  op_desc->SetOutputOffset({});
+
+  ge::AttrUtils::SetStr(op_desc, ge::TVM_ATTR_NAME_MAGIC, "RT_DEV_BINARY_MAGIC_ELF_AIVEC");
+  bool support_dynamic = true;
+  ge::AttrUtils::GetBool(op_desc, "support_dynamicshape", support_dynamic);
+  return op_desc;
+}
+
+TEST_F(UtestExecutionEngine, ExecuteAsync_without_kernel_task) {
+  auto graph = make_shared<ComputeGraph>("graph");
+  OpDescPtr op_desc = CreateOpDesc("Add", "Add");
+  GeShape shape({2, 16});
+  GeTensorDesc tensor_desc(shape);
+  op_desc->AddInputDesc(tensor_desc);
+  op_desc->AddOutputDesc(tensor_desc);
+  auto node = graph->AddNode(op_desc);
+  std::unique_ptr<NodeItem> node_item;
+  NodeItem::Create(node, node_item);
+  ASSERT_TRUE(node_item != nullptr);
+  node_item->input_start = 0;
+  node_item->output_start = 0;
+
+  GraphExecutionContext execution_context;
+  execution_context.profiling_level = 1;
+  SubgraphContext subgraph_context(nullptr, &execution_context);
+
+  NodeState node_state(*node_item, &subgraph_context);
+  auto task_context = TaskContext::Create(&node_state, &execution_context, &subgraph_context);
+  auto shared_task_context = std::shared_ptr<TaskContext>(task_context.release());
+  node_state.SetTaskContext(shared_task_context);
+
+  ExecutionEngine execution_engine;
+  ASSERT_TRUE(node_state.GetTaskContext() != nullptr);
+  EXPECT_EQ(execution_engine.ExecuteAsync(node_state, node_state.GetTaskContext(), execution_context), INTERNAL_ERROR);
+}
+
+TEST_F(UtestExecutionEngine, ExecuteAsync_without_callback_and_kernel_task) {
+  auto graph = make_shared<ComputeGraph>("graph");
+  OpDescPtr op_desc = CreateOpDesc("Add", "Add");
+  GeShape shape({2, 16});
+  GeTensorDesc tensor_desc(shape);
+  op_desc->AddInputDesc(tensor_desc);
+  op_desc->AddOutputDesc(tensor_desc);
+  auto node = graph->AddNode(op_desc);
+  std::unique_ptr<NodeItem> node_item;
+  NodeItem::Create(node, node_item);
+  ASSERT_TRUE(node_item != nullptr);
+  node_item->input_start = 0;
+  node_item->output_start = 0;
+
+  GraphExecutionContext execution_context;
+  GeRootModelPtr ge_root_model = make_shared<GeRootModel>(graph);
+  HybridModel hybrid_model(ge_root_model);
+  execution_context.model = &hybrid_model;
+  SubgraphContext subgraph_context(nullptr, &execution_context);
+
+  NodeState node_state(*node_item, &subgraph_context);
+  auto task_context = TaskContext::Create(&node_state, &execution_context, &subgraph_context);
+  auto shared_task_context = std::shared_ptr<TaskContext>(task_context.release());
+  node_state.SetTaskContext(shared_task_context);
+
+  ExecutionEngine execution_engine;
+  ASSERT_TRUE(node_state.GetTaskContext() != nullptr);
+  EXPECT_EQ(execution_engine.ExecuteAsync(node_state, node_state.GetTaskContext(), execution_context), INTERNAL_ERROR);
+}
diff --git a/tests/ut/ge/hybrid/ge_hybrid_unittest.cc b/tests/ut/ge/hybrid/ge_hybrid_unittest.cc index 274cc56f..b5aac527 100644 --- a/tests/ut/ge/hybrid/ge_hybrid_unittest.cc +++ b/tests/ut/ge/hybrid/ge_hybrid_unittest.cc @@ -39,7 +39,7 @@ #include "hybrid/common/npu_memory_allocator.h" #include "graph/types.h" #include "graph/utils/tensor_utils.h" - +#include "graph/testcase/ge_graph/graph_builder_utils.h" #undef private #undef protected @@ -154,9 +154,11 @@ TEST_F(UtestGeHybrid, index_taskdefs_failed) { ComputeGraphPtr graph = std::make_shared("test"); GeRootModelPtr ge_root_model = make_shared(graph); + ge_root_model->SetModelName("test_name"); HybridModel hybrid_model(ge_root_model); HybridModelBuilder hybrid_model_builder(hybrid_model); + ASSERT_EQ(hybrid_model_builder.Build(), INTERNAL_ERROR); ASSERT_EQ(hybrid_model_builder.IndexTaskDefs(graph, ge_model), INTERNAL_ERROR); } @@ -173,6 +175,36 @@ TEST_F(UtestGeHybrid, parse_force_infershape_nodes) { HybridModelBuilder hybrid_model_builder(hybrid_model); ASSERT_EQ(hybrid_model_builder.ParseForceInfershapeNodes(node, *new_node), SUCCESS); } +static ComputeGraphPtr BuildDataDirectConnectGraph() { + const char *kRefIndex = "_parent_node_index"; + ge::ut::GraphBuilder builder("subgraph"); + auto data = builder.AddNode("Data", "Data", 1, 1); + auto netoutput = builder.AddNode("NetOutput", "NetOutput", 1, 1); + (void)AttrUtils::SetInt(netoutput->GetOpDesc()->MutableInputDesc(0), kRefIndex, 0); + + builder.AddDataEdge(data, 0, netoutput, 0); + return builder.GetGraph(); +} +TEST_F(UtestGeHybrid, data_direct_connect) { + std::unique_ptr node_item; + auto root_graph = make_shared("root_graph"); + OpDescPtr op_desc = CreateOpDesc("PartitionedCall", "PartitionedCall"); + auto node = root_graph->AddNode(op_desc); + node->SetOwnerComputeGraph(root_graph); + auto sub_graph = BuildDataDirectConnectGraph(); + sub_graph->SetParentGraph(root_graph); + sub_graph->SetParentNode(node); + node->GetOpDesc()->AddSubgraphName("subgraph"); + node->GetOpDesc()->SetSubgraphInstanceName(0, "subgraph"); + root_graph->AddSubgraph("subgraph", sub_graph); + std::unique_ptr new_node; + NodeItem::Create(node, new_node); + GeRootModelPtr ge_root_model = make_shared(root_graph); + HybridModel hybrid_model(ge_root_model); + HybridModelBuilder hybrid_model_builder(hybrid_model); + auto ret = hybrid_model_builder.IdentifyVariableOutputs(*new_node.get()); + ASSERT_EQ(ret, SUCCESS); +} TEST_F(UtestGeHybrid, index_taskdefs_success) { // build aicore task @@ -426,6 +458,40 @@ TEST_F(UtestGeHybrid, TestTaskContext) { ASSERT_EQ(new_desc.GetShape().GetDims(), new_shape.GetDims()); } +TEST_F(UtestGeHybrid, hybrid_model_executor_update_args) { + auto aicore_task = std::unique_ptr(new(std::nothrow)hybrid::AiCoreOpTask()); + + auto graph = make_shared("graph"); + OpDescPtr op_desc = CreateOpDesc("Add", "Add"); + GeShape shape({2, 16}); + GeTensorDesc tensor_desc(shape); + op_desc->AddInputDesc(tensor_desc); + op_desc->AddInputDesc(tensor_desc); + op_desc->AddOutputDesc(tensor_desc); + auto node = graph->AddNode(op_desc); + + std::unique_ptr node_item; + NodeItem::Create(node, node_item); + node_item->input_start = 0; + node_item->output_start = 0; + + GraphExecutionContext execution_context; + SubgraphContext subgraph_context(nullptr, &execution_context); + subgraph_context.all_inputs_.resize(2); + subgraph_context.all_outputs_.resize(1); + + NodeState node_state(*node_item, &subgraph_context); + auto task_context = TaskContext::Create(&node_state, &execution_context, 
&subgraph_context); + + int32_t buffer[1]; + aicore_task->tiling_buffer_ = TensorBuffer::Create(buffer, sizeof(buffer)); + EXPECT_NE(aicore_task->tiling_buffer_, nullptr); + aicore_task->max_arg_count_ = 0; + EXPECT_EQ(aicore_task->UpdateArgs(*task_context), ACL_ERROR_GE_MEMORY_OPERATE_FAILED); + aicore_task->args_ = std::unique_ptr(new uint8_t[sizeof(uintptr_t) * 2]); + EXPECT_EQ(aicore_task->UpdateArgs(*task_context), SUCCESS); +} + TEST_F(UtestGeHybrid, hybrid_model_executor_check_shape) { HybridModelExecutor::ExecuteArgs args; GeTensorDescPtr ge_tensor = make_shared(GeTensorDesc()); @@ -446,6 +512,7 @@ TEST_F(UtestGeHybrid, hybrid_model_executor_check_shape) { NodePtr node = graph->AddNode(op_desc); std::unique_ptr new_node; NodeItem::Create(node, new_node); + new_node->is_dynamic = true; GraphItem graph_item; graph_item.input_nodes_.emplace_back(new_node.get()); @@ -465,6 +532,10 @@ TEST_F(UtestGeHybrid, hybrid_model_executor_check_shape) { ret = HybridModelExecutor::CheckInputShapeByShapeRange(&graph_item, args1); ASSERT_EQ(ret, ge::INTERNAL_ERROR); + + HybridModelExecutor::ExecuteArgs args3; + ret = HybridModelExecutor::CheckInputShapeByShapeRange(&graph_item, args3); + ASSERT_EQ(ret, ge::INTERNAL_ERROR); } TEST_F(UtestGeHybrid, TestOptimizeDependenciesForConstInputs) { @@ -555,3 +626,35 @@ TEST_F(UtestGeHybrid, test_key_for_kernel_bin) { EXPECT_EQ(atomic_task->GetKeyForTvmMetaData(), ATOMIC_ATTR_TVM_METADATA); EXPECT_EQ(atomic_task->GetKeyForKernelName(op_desc), "Sum_atomic_kernelname"); } + +TEST_F(UtestGeHybrid, TestParseDependentInputNodesForHccl) { + NodeExecutorManager::GetInstance().engine_mapping_.emplace("ops_kernel_info_hccl", + NodeExecutorManager::ExecutorType::HCCL); + ComputeGraphPtr compute_graph = MakeShared("test"); + + OpDescPtr op_desc = CreateOpDesc("Add", "Add"); + auto node = compute_graph->AddNode(op_desc); + std::unique_ptr node_item; + NodeItem::Create(node, node_item); + node_item->node_id = 0; + + OpDescPtr op_desc_1 = CreateOpDesc("AllReduce", "AllReduce"); + op_desc_1->SetOpKernelLibName("ops_kernel_info_hccl"); + auto node_1 = compute_graph->AddNode(op_desc_1); + std::unique_ptr node_item_1; + NodeItem::Create(node_1, node_item_1); + node_item_1->node_id = 1; + + node->GetOutControlAnchor()->LinkTo(node_1->GetInControlAnchor()); + + GeRootModelPtr root_model = MakeShared(compute_graph); + HybridModel model(root_model); + model.root_graph_ = compute_graph; + model.node_items_.emplace(node, std::move(node_item)); + + HybridModelBuilder builder(model); + std::vector deps; + ASSERT_EQ(builder.ParseDependentInputNodes(*node_item_1, deps), SUCCESS); + ASSERT_TRUE(model.GetNodeItem(node)->has_observer); + ASSERT_EQ(node_item_1->dependents_for_execution.size(), 1); +} diff --git a/tests/ut/ge/profiling/ge_profiling_manager_unittest.cc b/tests/ut/ge/profiling/ge_profiling_manager_unittest.cc index 3dfbff41..2da80b32 100644 --- a/tests/ut/ge/profiling/ge_profiling_manager_unittest.cc +++ b/tests/ut/ge/profiling/ge_profiling_manager_unittest.cc @@ -78,3 +78,9 @@ TEST_F(UtestGeProfilinganager, plungin_init_) { EXPECT_EQ(ret, INTERNAL_ERROR); ProfilingManager::Instance().prof_cb_.msprofReporterCallback = nullptr; } + +TEST_F(UtestGeProfilinganager, report_data_) { + std::string data = "ge is better than tensorflow."; + std::string tag_name = "fmk"; + ProfilingManager::Instance().ReportData(0, data, tag_name); +} \ No newline at end of file