diff --git a/ge/CMakeLists.txt b/ge/CMakeLists.txt index 6e0e9235..1285d6e6 100755 --- a/ge/CMakeLists.txt +++ b/ge/CMakeLists.txt @@ -341,7 +341,6 @@ set(TRAIN_SRC_LIST "init/gelib.cc" "model/ge_model.cc" "model/ge_root_model.cc" - "omm/csa_interact.cc" "opskernel_manager/ops_kernel_manager.cc" "opskernel_manager/ops_kernel_builder_manager.cc" "session/inner_session.cc" @@ -416,7 +415,6 @@ set(TRAIN_SRC_LIST set(INFER_SRC_LIST "graph/manager/trans_var_data_utils.cc" - "omm/csa_interact.cc" "common/fp16_t.cc" "common/formats/utils/formats_trans_utils.cc" "common/formats/format_transfers/datatype_transfer.cc" diff --git a/ge/executor/CMakeLists.txt b/ge/executor/CMakeLists.txt index d215b7ef..548711b7 100644 --- a/ge/executor/CMakeLists.txt +++ b/ge/executor/CMakeLists.txt @@ -86,7 +86,6 @@ set(SRC_LIST "../common/profiling/ge_profiling.cc" "../graph/load/graph_loader.cc" "../graph/execute/graph_execute.cc" - "../omm/csa_interact.cc" "../graph/manager/graph_manager_utils.cc" "../graph/manager/graph_var_manager.cc" "../graph/manager/graph_mem_allocator.cc" diff --git a/ge/executor/module.mk b/ge/executor/module.mk index 4966eeb5..7a7e2b51 100644 --- a/ge/executor/module.mk +++ b/ge/executor/module.mk @@ -11,7 +11,6 @@ local_ge_executor_src_files := \ ../common/profiling/ge_profiling.cc \ ../graph/load/graph_loader.cc \ ../graph/execute/graph_execute.cc \ - ../omm/csa_interact.cc \ ../graph/manager/graph_manager_utils.cc \ ../graph/manager/graph_var_manager.cc \ ../graph/manager/rdma_pool_allocator.cc \ diff --git a/ge/ge_inference.mk b/ge/ge_inference.mk index 32fc206d..ddfeb8f4 100755 --- a/ge/ge_inference.mk +++ b/ge/ge_inference.mk @@ -4,7 +4,6 @@ COMMON_LOCAL_SRC_FILES := \ proto/fusion_model.proto \ proto/optimizer_priority.proto \ graph/manager/trans_var_data_utils.cc \ - omm/csa_interact.cc \ common/fp16_t.cc \ common/formats/utils/formats_trans_utils.cc \ common/formats/format_transfers/datatype_transfer.cc \ diff --git a/ge/ge_runner.mk b/ge/ge_runner.mk index 49515fe4..e14dee82 100644 --- a/ge/ge_runner.mk +++ b/ge/ge_runner.mk @@ -256,7 +256,6 @@ LIBGE_LOCAL_SRC_FILES := \ init/gelib.cc \ model/ge_model.cc \ model/ge_root_model.cc \ - omm/csa_interact.cc \ opskernel_manager/ops_kernel_manager.cc \ opskernel_manager/ops_kernel_builder_manager.cc \ session/inner_session.cc \ diff --git a/ge/graph/execute/graph_execute.cc b/ge/graph/execute/graph_execute.cc index 5142e347..1d22016e 100755 --- a/ge/graph/execute/graph_execute.cc +++ b/ge/graph/execute/graph_execute.cc @@ -21,7 +21,6 @@ #include "graph/load/model_manager/model_manager.h" #include "graph/load/model_manager/davinci_model.h" -#include "omm/csa_interact.h" namespace ge { using Uint32Pair = pair; @@ -490,12 +489,10 @@ Status GraphExecutor::AsyncExecuteModel(const GeRootModelPtr &ge_root_model, con } catch (std::bad_alloc &) { REPORT_INNER_ERROR("E19999", "Bad memory allocation exception occur failed"); GELOGE(MEMALLOC_FAILED, "RunAsync failed, bad memory allocation occur !"); - CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); return MEMALLOC_FAILED; } catch (...) { REPORT_INNER_ERROR("E19999", "Some exceptions occur failed"); GELOGE(FAILED, "RunAsync failed, some exceptions occur !"); - CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); return FAILED; } @@ -509,18 +506,15 @@ Status GraphExecutor::DataInput(const InputData &input_data, OutputData &output_ Status ret = model_manager->DataInput(input_data, output_data); if (ret != SUCCESS) { GELOGE(ret, "DataInput: DataInput failed."); - CsaInteract::GetInstance().WriteErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); return ret; } } catch (std::bad_alloc &) { REPORT_INNER_ERROR("E19999", "Bad memory allocation exception occur failed"); GELOGE(MEMALLOC_FAILED, "DataInput failed, bad memory allocation occur !"); - CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); return MEMALLOC_FAILED; } catch (...) { REPORT_INNER_ERROR("E19999", "Some exceptions occur failed"); GELOGE(FAILED, "DataInput failed, some exceptions occur !"); - CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); return FAILED; } @@ -535,18 +529,15 @@ Status GraphExecutor::GetInputOutputDescInfo(const uint32_t model_id, vectorGetInputOutputDescInfo(model_id, input_desc, output_desc); if (ret != SUCCESS) { GELOGE(ret, "GetInputOutputDescInfo failed."); - CsaInteract::GetInstance().WriteErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); return ret; } } catch (std::bad_alloc &) { REPORT_INNER_ERROR("E19999", "Bad memory allocation exception occur failed"); GELOGE(MEMALLOC_FAILED, "GetInputOutputDescInfo failed, bad memory allocation occur !"); - CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); return MEMALLOC_FAILED; } catch (...) { REPORT_INNER_ERROR("E19999", "Some exceptions occur failed"); GELOGE(FAILED, "GetInputOutputDescInfo failed, some exceptions occur !"); - CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); return FAILED; } @@ -564,18 +555,15 @@ Status GraphExecutor::GetInputOutputDescInfo(const uint32_t model_id, vectorUnload(model_id); if (ret != SUCCESS) { GELOGE(ret, "UnloadModel: Unload failed. model id:%u", model_id); - CsaInteract::GetInstance().WriteErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_UNLOAD); return ret; } GELOGI("UnLoad model success, model id:%u.", model_id); @@ -55,7 +53,6 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptrLoadModelOnline(model_id, ge_root_model_ptr, listener); if (ret != SUCCESS) { GELOGE(ret, "LoadModel: Load failed. ret = %u", ret); - CsaInteract::GetInstance().WriteErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_LOAD); - rt_ret = rtDeviceReset(GetContext().DeviceId()); if (rt_ret != RT_ERROR_NONE) { REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X", @@ -94,7 +89,6 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptrId(); @@ -2742,7 +2740,6 @@ void *DavinciModel::Run(DavinciModel *model) { bool rslt_flg = true; if (model->GetDataInputer() == nullptr) { GELOGW("Data inputer is nullptr."); - CsaInteract::GetInstance().StoreInternalErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); break; } @@ -2763,7 +2760,6 @@ void *DavinciModel::Run(DavinciModel *model) { ret = model->SyncVarData(); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( ret != SUCCESS, (void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput()); - CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); continue, "Copy input data to model failed."); // [No need to check value] GE_IF_BOOL_EXEC(model->is_first_execute_, GE_TIMESTAMP_EVENT_END(Model_SyncVarData, "Model Run SyncVarData")); @@ -2773,7 +2769,6 @@ void *DavinciModel::Run(DavinciModel *model) { ret = model->CopyInputData(current_data, false); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( ret != SUCCESS, (void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput()); - CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); continue, "Copy input data to model failed."); // [No need to check value] if (model->is_online_infer_dynamic_ && !model->is_getnext_sink_dynamic_) { model->cur_dynamic_dims_.clear(); @@ -2794,7 +2789,6 @@ void *DavinciModel::Run(DavinciModel *model) { rt_ret = rtModelExecute(model->rt_model_handle_, model->rt_model_stream_, 0); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, rslt_flg = false; (void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput()); - CsaInteract::GetInstance().WriteErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC); continue); GELOGI("rtModelExecute end"); GE_IF_BOOL_EXEC(model->is_first_execute_, GE_TIMESTAMP_EVENT_END(rtModelExecute, "GraphExcute::rtModelExecute")); @@ -2812,7 +2806,6 @@ void *DavinciModel::Run(DavinciModel *model) { rt_ret != RT_ERROR_NONE, rslt_flg = false; GELOGI("seq_end_flg: %d", seq_end_flag); (void)model->ReturnResult(current_data.index, false, seq_end_flag, data_wrapper->GetOutput()); // [No need to check value] - CsaInteract::GetInstance().StoreInternalErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC); continue); } @@ -2841,7 +2834,6 @@ void *DavinciModel::Run(DavinciModel *model) { GELOGI("run iterator count is %lu, model_id:%u", model->iterator_count_, model->model_id_); } - CsaInteract::GetInstance().WriteInternalErrorCode(); GELOGI("Model run end, model id:%u", model->model_id_); return nullptr; } diff --git a/ge/hybrid/executor/hybrid_model_async_executor.cc b/ge/hybrid/executor/hybrid_model_async_executor.cc index 1fed16a5..3294a286 100644 --- a/ge/hybrid/executor/hybrid_model_async_executor.cc +++ b/ge/hybrid/executor/hybrid_model_async_executor.cc @@ -19,7 +19,6 @@ #include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" #include "graph/ge_context.h" -#include "omm/csa_interact.h" namespace ge { namespace hybrid { @@ -163,7 +162,6 @@ Status HybridModelAsyncExecutor::RunInternal() { ret = PreRun(current_data, args); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG( ret != SUCCESS, (void) HandleResult(ret, current_data.index, args, data_wrapper->GetOutput()); - CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC); continue, "[Invoke][PreRun] failed, model_id:%u.", model_id_); // [No need to check value] if (pipe_executor_ != nullptr) { @@ -181,7 +179,6 @@ Status HybridModelAsyncExecutor::RunInternal() { } ret = HandleResult(ret, current_data.index, args, data_wrapper->GetOutput()); if (ret != SUCCESS) { - CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC); continue; } @@ -191,7 +188,6 @@ Status HybridModelAsyncExecutor::RunInternal() { GELOGI("run iterator count is %lu, model_id:%u", iterator_count_, model_id_); } - CsaInteract::GetInstance().WriteInternalErrorCode(); GELOGI("Model run end, model id:%u", model_id_); return SUCCESS; } diff --git a/ge/init/gelib.cc b/ge/init/gelib.cc index ab7fbb29..caaacd27 100644 --- a/ge/init/gelib.cc +++ b/ge/init/gelib.cc @@ -42,7 +42,6 @@ #include "graph/manager/graph_mem_allocator.h" #include "graph/manager/host_mem_manager.h" #include "graph/manager/graph_var_manager.h" -#include "omm/csa_interact.h" #include "runtime/kernel.h" #include "opskernel_manager/ops_kernel_builder_manager.h" #include "external/runtime/rt_error_codes.h" @@ -376,10 +375,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status GELib::InitSystemWithOpt } GE_CHK_STATUS_RET(HostMemManager::Instance().Initialize()); - // Update CSA file - CsaInteract::GetInstance().Init(options.device_id, GetContext().TraceId()); - Status ret = CsaInteract::GetInstance().WriteJobState(JOBSTATE_RUNNING, JOBSUBSTATE_ENV_INIT); - GE_LOGE_IF(ret != SUCCESS, "[Write][JobState] failed, ret:%u ", ret); // set device id GELOGI("set logical device id:%u", options.device_id); @@ -408,10 +403,6 @@ Status GELib::SystemShutdownWithOptions(const Options &options) { GE_CHK_RT(rtDeviceReset(options.device_id)); - // Update CSA file - Status ret = CsaInteract::GetInstance().WriteJobState(JOBSTATE_SUCCEED); - GE_LOGE_IF(ret != SUCCESS, "[Write][JobState] failed, ret:%u ", ret); - is_system_inited = false; is_shutdown = true; GELOGI("%s finalize GELib success.", mode.c_str()); diff --git a/ge/omm/csa_interact.cc b/ge/omm/csa_interact.cc deleted file mode 100644 index 15bca075..00000000 --- a/ge/omm/csa_interact.cc +++ /dev/null @@ -1,265 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "omm/csa_interact.h" - -#include "framework/common/debug/ge_log.h" -#include "framework/common/debug/log.h" -#include "framework/common/util.h" -#include "graph/ge_context.h" -#include "graph/manager/graph_var_manager.h" -#include "graph/utils/tensor_utils.h" -#include "mmpa/mmpa_api.h" -#include "nlohmann/json.hpp" - -namespace ge { -namespace { -const char FMK_STATUS_FILE_DIR_ENV[] = "FMK_STATUS_FILE_DIR"; -const char JOBSTATE_FILE_NAME[] = "jobstateupdate_framework"; -const char HCOM_DETECT_FILE_NAME[] = "hcom_detection_result"; -const char FILE_SEPARATE[] = "/"; -} // namespace - -/// -/// @brief Obtain CsaInteract instance -/// @return CsaInteract instance -/// -CsaInteract &CsaInteract::GetInstance() { - static CsaInteract instance; - return instance; -} - -/// -/// @brief CsaInteract instance initialization -/// @param [in] dev_index device index -/// @param [in] job_id job id -/// @return void -/// -void CsaInteract::Init(int32_t dev_index, int64_t job_id) { - if (!is_init_) { - dev_index_ = dev_index; - job_id_ = job_id; - - char file_dir_env[MMPA_MAX_PATH] = { 0x00 }; - INT32 res = mmGetEnv(FMK_STATUS_FILE_DIR_ENV, file_dir_env, MMPA_MAX_PATH); - string csa_path_prefix; - if (res == EN_OK) { - csa_path_prefix = file_dir_env; - } - if (!csa_path_prefix.empty()) { - job_state_file_ = csa_path_prefix + std::to_string(dev_index_) + FILE_SEPARATE + JOBSTATE_FILE_NAME; - hcom_detect_file_ = csa_path_prefix + std::to_string(dev_index_) + FILE_SEPARATE + HCOM_DETECT_FILE_NAME; - } - is_init_ = true; - } -} - -/// -/// @brief Update job state file -/// @param [in] job_state job state -/// @param [in] job_sub_state detailed job state -/// @param [in] module_ret_errcode sub module training failure error code -/// @param [in] error_module error module identified by FMK -/// @return Status -/// -Status CsaInteract::WriteJobState(JobState job_state, JobSubState job_sub_state, uint32_t module_ret_errcode, - ErrorModule error_module) { - if (!is_init_) { - GELOGE(INTERNAL_ERROR, "[Init][CsaInteract] obj has not init, can't WriteJobState"); - REPORT_INNER_ERROR("E19999", "WriteJobState failed before init. "); - return INTERNAL_ERROR; - } - if ((curr_state_ == JOBSTATE_FAILED) || (curr_state_ == JOBSTATE_KILLED)) { - return SUCCESS; - } - - if (job_state_file_.empty()) { - return SUCCESS; - } - - std::string content; - try { - nlohmann::json content_json; - content_json["job_id"] = job_id_; - content_json["jobstate"] = job_state; - // Only the running or running failure state has a job sub state - if ((job_state == JOBSTATE_RUNNING) || (job_state == JOBSTATE_FAILED)) { - content_json["job_sub_state"] = job_sub_state; - } - content_json["time"] = CurrentTimeInStr(); - // Write error code only if run failed - if (job_state == JOBSTATE_FAILED) { - content_json["errorcode"] = module_ret_errcode; - content_json["errmodule"] = error_module; - } - - content = content_json.dump(); - } catch (const nlohmann::json::exception &e) { - GELOGE(INTERNAL_ERROR, "[Create][JsonObject] exception:%s job_state:%u job_sub_state:%u.", - e.what(), job_state, job_sub_state); - REPORT_INNER_ERROR("E19999", "Create json object failed. exception:%s job_state:%u job_sub_state:%u.", - e.what(), job_state, job_sub_state); - return INTERNAL_ERROR; - } - - if (WriteFile(job_state_file_, content) != SUCCESS) { - // The error log subfunction has been printed and will not print again - return INTERNAL_ERROR; - } - - curr_state_ = job_state; - return SUCCESS; -} - -/// -/// @brief Update error code in the job state file -/// @param [in] module_ret_errcode sub module training failure error code -/// @param [in] error_module error module identified by FMK -/// @param [in] job_sub_state detailed job state -/// @return void -/// -void CsaInteract::WriteErrorCode(uint32_t module_ret_errcode, ErrorModule error_module, JobSubState job_sub_state) { - // The error log subfunction has been printed and will not print again - Status ret = WriteJobState(JOBSTATE_FAILED, job_sub_state, module_ret_errcode, error_module); - if (ret != SUCCESS) { - GELOGW("write error code fail. ret_code: %u, status: %u", module_ret_errcode, job_sub_state); - } -} - -/// -/// @brief Record errors that occurred durning the training -/// @param [in] module_ret_errcode sub module training failure error code -/// @param [in] error_module error module identified by FMK -/// @param [in] job_sub_state detailed job state -/// @return void -/// -void CsaInteract::StoreInternalErrorCode(uint32_t module_ret_errcode, ErrorModule error_module, - JobSubState job_sub_state) { - is_have_internal_error_ = true; - - csa_error_code_.module_ret_errcode = module_ret_errcode; - csa_error_code_.error_module = error_module; - csa_error_code_.job_sub_state = job_sub_state; -} - -/// -/// @brief Update training error code in the job state file -/// @return void -/// -void CsaInteract::WriteInternalErrorCode() { - if (is_have_internal_error_) { - WriteErrorCode(csa_error_code_.module_ret_errcode, csa_error_code_.error_module, csa_error_code_.job_sub_state); - } -} - -/// -/// @brief Update network connectivity detect file -/// @param [in] content network connectivity content -/// @return Status -/// -Status CsaInteract::WriteHcomDetection(const std::string &content) { - if (!is_init_) { - GELOGE(INTERNAL_ERROR, "[Init][CsaInteract] obj has not init, can't WriteJobState"); - REPORT_INNER_ERROR("E19999", "WriteHcomDetection failed before init."); - return INTERNAL_ERROR; - } - - if (hcom_detect_file_.empty()) { - return SUCCESS; - } - - return WriteFile(hcom_detect_file_, content); -} - -/// -/// @ingroup WriteFile -/// @brief Write the content into the file. If the file does not exist, create the file -/// @param [in] file_name: File name to be written -/// @param [in] content: Contents to be written -/// @return Status -/// -Status CsaInteract::WriteFile(const std::string &file_name, const std::string &content) { - // if file path is not exist, then make path - INT32 flags = M_WRONLY | O_TRUNC | M_CREAT; - int32_t fd = mmOpen2(file_name.c_str(), flags, M_IRUSR | M_IWUSR | M_UMASK_GRPREAD); - if (fd == EN_ERROR) { - if (MakePath(file_name) != SUCCESS) { - GELOGE(INTERNAL_ERROR, "[Create][File Path] errno is %d", errno); - REPORT_CALL_ERROR("E19999", "MakePath failed. errno is %d", errno); - return INTERNAL_ERROR; - } - fd = mmOpen2(file_name.c_str(), flags, M_IRUSR | M_IWUSR | M_UMASK_GRPREAD); - if (fd == EN_ERROR) { - GELOGE(INTERNAL_ERROR, "[Open][File] errno is %d file_name: %s", errno, file_name.c_str()); - REPORT_CALL_ERROR("E19999", "mmOpen2 failed. errno is %d file_name: %s", errno, file_name.c_str()); - return INTERNAL_ERROR; - } - } - - mmSsize_t ret = mmWrite(fd, reinterpret_cast(const_cast(content.c_str())), content.length()); - if (ret == EN_ERROR) { - GELOGE(INTERNAL_ERROR, "[Write][File] errno is %d", errno); - REPORT_CALL_ERROR("E19999", "mmWrite failed. errno is %d", errno); - ret = mmClose(fd); - if (ret == EN_ERROR) { - GELOGE(INTERNAL_ERROR, "[Close][File] error is %d", errno); - REPORT_CALL_ERROR("E19999", "mmClose failed. error is %d", errno); - } - return INTERNAL_ERROR; - } - ret = mmClose(fd); - if (ret == EN_ERROR) { - GELOGE(INTERNAL_ERROR, "[Close][File] error is %d", errno); - REPORT_CALL_ERROR("E19999", "mmClose failed. error is %d", errno); - return INTERNAL_ERROR; - } - - return SUCCESS; -} - -/// -/// @ingroup MakePath -/// @brief Verify whether the file path exists, if not, recursively create the folder -/// @param [in] file_name: File name to be verified -/// @return Status -/// -Status CsaInteract::MakePath(const std::string &file_name) { - std::size_t found = file_name.find_last_of("/"); - if (found == std::string::npos) { - return PARAM_INVALID; - } - - std::string file_path = file_name.substr(0, found + 1); - if (mmAccess(file_path.c_str()) == EN_OK) { - return SUCCESS; - } - - found = file_path.find_first_of("/"); - while (found != std::string::npos) { - std::string pre_path = file_path.substr(0, found + 1); - if (mmAccess(pre_path.c_str()) != EN_OK) { - if (mmMkdir(pre_path.c_str(), M_IRWXU) != EN_OK) { - GELOGE(INTERNAL_ERROR, "[Create][FileDir] fail, errno is %d, pre_path:%s", errno, pre_path.c_str()); - REPORT_CALL_ERROR("E19999", "mmMkdir failed. errno is %d pre_path:%s", errno, pre_path.c_str()); - return INTERNAL_ERROR; - } - } - found = file_path.find_first_of("/", found + 1); - } - - return SUCCESS; -} -} // namespace ge diff --git a/ge/omm/csa_interact.h b/ge/omm/csa_interact.h deleted file mode 100644 index 0a609e09..00000000 --- a/ge/omm/csa_interact.h +++ /dev/null @@ -1,183 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef GE_OMM_CSA_INTERACT_H_ -#define GE_OMM_CSA_INTERACT_H_ - -#include - -#include "framework/common/ge_inner_error_codes.h" - -namespace ge { -enum JobState { - JOBSTATE_WAITING = 1, - JOBSTATE_RUNNING, - JOBSTATE_KILLING, - JOBSTATE_SUCCEED, - JOBSTATE_FAILED, - JOBSTATE_KILLED, - JOBSTATE_UNKOWN -}; - -enum JobSubState { - JOBSUBSTATE_ENV_INIT = 201, - JOBSUBSTATE_ENV_FIN, - JOBSUBSTATE_RESOUCE_ALLOC, - JOBSUBSTATE_MODEL_COMPILE, - JOBSUBSTATE_GRAPH_PREPARE, - JOBSUBSTATE_GRAPH_SPLIT, - JOBSUBSTATE_GRAPH_OPTIMIZE, - JOBSUBSTATE_GRAPH_BUILD, - JOBSUBSTATE_GRAPH_LOAD, - JOBSUBSTATE_GRAPH_EXEC, - JOBSUBSTATE_GRAPH_UNLOAD, - JOBSUBSTATE_OTHER -}; - -enum ErrorModule { - ERROR_MODULE_DRIVER = 0x01, - ERROR_MODULE_RUNTIME = 0x04, - ERROR_MODULE_CCE = 0x06, - ERROR_MODULE_FMK = 0x08, - ERROR_MODULE_HCCL = 0x12 -}; - -struct CsaErrorCode { - CsaErrorCode() - : module_ret_errcode(0), - error_module(ERROR_MODULE_FMK), - job_sub_state(JOBSUBSTATE_OTHER) {} - ~CsaErrorCode() {} - uint32_t module_ret_errcode; - ErrorModule error_module; - JobSubState job_sub_state; -}; -class CsaInteract { - public: - /// - /// @brief Obtain CsaInteract instance - /// @return CsaInteract instance - /// - static CsaInteract& GetInstance(); - - /// - /// @brief CsaInteract instance initialization - /// @param [in] dev_index device index - /// @param [in] job_id job id - /// @return void - /// - void Init(int32_t dev_index, int64_t job_id); - - /// - /// @brief Update job state file - /// @param [in] job_state job state - /// @param [in] job_sub_state detailed job state - /// @param [in] module_ret_errcode sub module training failure error code - /// @param [in] error_module error module identified by FMK - /// @return Status - /// - Status WriteJobState(JobState job_state, - JobSubState job_sub_state = JOBSUBSTATE_OTHER, - uint32_t module_ret_errcode = SUCCESS, - ErrorModule error_module = ERROR_MODULE_FMK); - - /// - /// @brief Update error code in the job state file - /// @param [in] module_ret_errcode sub module training failure error code - /// @param [in] error_module error module identified by FMK - /// @param [in] job_sub_state detailed job state - /// @return void - /// - void WriteErrorCode(uint32_t module_ret_errcode, ErrorModule error_module, - JobSubState job_sub_state); - - /// - /// @brief Record errors that occurred durning the training - /// @param [in] module_ret_errcode sub module training failure error code - /// @param [in] error_module error module identified by FMK - /// @param [in] job_sub_state detailed job state - /// @return void - /// - void StoreInternalErrorCode(uint32_t module_ret_errcode, - ErrorModule error_module, - JobSubState job_sub_state); - - /// - /// @brief Update training error code in the job state file - /// @return void - /// - void WriteInternalErrorCode(); - - /// - /// @brief Update network connectivity detect file - /// @param [in] content network connectivity content - /// @return Status - /// - Status WriteHcomDetection(const std::string& content); - - private: - CsaInteract() - : dev_index_(0), - job_id_(0), - is_init_(false), - curr_state_(JOBSTATE_UNKOWN), - is_have_internal_error_(false) {} - - ~CsaInteract() {} - - CsaInteract(const CsaInteract&) = delete; - CsaInteract(CsaInteract&&) = delete; - CsaInteract& operator=(const CsaInteract&) = delete; - CsaInteract& operator=(CsaInteract&&) = delete; - - /// - /// @ingroup WriteFile - /// @brief Write the content into the file. If the file does not exist, create the file - /// @param [in] file_name: File name to be written - /// @param [in] content: Contents to be written - /// @return Status - /// - Status WriteFile(const std::string& file_name, const std::string& content); - - /// - /// @ingroup MakePath - /// @brief Verify whether the file path exists, if not, recursively create the folder - /// @param [in] file_name: File name to be verified - /// @return Status - /// - Status MakePath(const std::string& file_name); - - // device index - int32_t dev_index_; - // job id - int64_t job_id_; - // is initialization complete - bool is_init_; - // current job state - JobState curr_state_; - // job state file - std::string job_state_file_; - // network connectivity detect file - std::string hcom_detect_file_; - // identification of internal errors that occurred during the training - bool is_have_internal_error_; - // error code information - CsaErrorCode csa_error_code_; -}; -} // namespace ge - -#endif // GE_OMM_CSA_INTERACT_H_ - diff --git a/tests/ut/ge/CMakeLists.txt b/tests/ut/ge/CMakeLists.txt index dabc1485..918b18d5 100755 --- a/tests/ut/ge/CMakeLists.txt +++ b/tests/ut/ge/CMakeLists.txt @@ -287,7 +287,6 @@ set(COMMON_SRC_FILES "${GE_CODE_DIR}/ge/graph/load/model_manager/zero_copy_task.cc" "${GE_CODE_DIR}/ge/graph/load/model_manager/cpu_queue_schedule.cc" "${GE_CODE_DIR}/ge/graph/load/model_manager/aipp_utils.cc" - "${GE_CODE_DIR}/ge/omm/csa_interact.cc" "${GE_CODE_DIR}/ge/graph/load/model_manager/tbe_handle_store.cc" "${GE_CODE_DIR}/ge/common/kernel_store.cc" "${GE_CODE_DIR}/ge/common/tbe_kernel_store.cc" @@ -391,7 +390,6 @@ set(GRAPH_PARTITION_COMMON_SRC_FILES set(GRAPH_LOAD_COMMON_SRC_FILES "${GE_CODE_DIR}/ge/graph/load/graph_loader.cc" "${GE_CODE_DIR}/ge/graph/manager/graph_manager_utils.cc" - "${GE_CODE_DIR}/ge/omm/csa_interact.cc" "${GE_CODE_DIR}/ge/graph/manager/graph_mem_allocator.cc" "${GE_CODE_DIR}/ge/graph/manager/graph_var_manager.cc" "${GE_CODE_DIR}/ge/graph/manager/trans_var_data_utils.cc" diff --git a/tests/ut/ge/graph/execute/graph_execute_unittest.cc b/tests/ut/ge/graph/execute/graph_execute_unittest.cc index b24985be..e340df2f 100644 --- a/tests/ut/ge/graph/execute/graph_execute_unittest.cc +++ b/tests/ut/ge/graph/execute/graph_execute_unittest.cc @@ -22,7 +22,6 @@ #include "graph/execute/graph_execute.h" #include "graph/load/model_manager/model_manager.h" #include "graph/load/model_manager/davinci_model.h" -#include "omm/csa_interact.h" #undef private #undef public