Browse Source

modified: ge/graph/execute/graph_execute.cc

modified:   ge/graph/load/graph_loader.cc
	modified:   ge/graph/load/model_manager/davinci_model.cc
	modified:   ge/hybrid/executor/hybrid_model_async_executor.cc
modified:   ge/init/gelib.cc
	deleted:    ge/omm/csa_interact.cc
	deleted:    ge/omm/csa_interact.h
	modified:   tests/ut/ge/graph/execute/graph_execute_unittest.cc

	modified:   ge/CMakeLists.txt
	modified:   ge/executor/CMakeLists.txt
	modified:   ge/executor/module.mk
	modified:   ge/ge_inference.mk
	modified:   ge/ge_runner.mk
	modified:   ge/graph/execute/graph_execute.cc
	modified:   ge/graph/load/graph_loader.cc
	modified:   ge/graph/load/model_manager/davinci_model.cc
	modified:   ge/hybrid/executor/hybrid_model_async_executor.cc
	modified:   ge/init/gelib.cc
	deleted:    ge/omm/csa_interact.cc
	deleted:    ge/omm/csa_interact.h
	modified:   tests/ut/ge/CMakeLists.txt
	modified:   tests/ut/ge/graph/execute/graph_execute_unittest.cc

	modified:   ge/CMakeLists.txt
	modified:   ge/executor/CMakeLists.txt
	modified:   ge/executor/module.mk
	modified:   ge/ge_inference.mk
	modified:   ge/ge_runner.mk
	modified:   ge/graph/execute/graph_execute.cc
	modified:   ge/graph/load/graph_loader.cc
	modified:   ge/graph/load/model_manager/davinci_model.cc
	modified:   ge/hybrid/executor/hybrid_model_async_executor.cc
	modified:   ge/init/gelib.cc
	deleted:    ge/omm/csa_interact.cc
	deleted:    ge/omm/csa_interact.h
	modified:   tests/ut/ge/CMakeLists.txt
	modified:   tests/ut/ge/graph/execute/graph_execute_unittest.cc
tags/v1.3.0
zhaoxinxin 4 years ago
parent
commit
94a9281184
14 changed files with 0 additions and 497 deletions
  1. +0
    -2
      ge/CMakeLists.txt
  2. +0
    -1
      ge/executor/CMakeLists.txt
  3. +0
    -1
      ge/executor/module.mk
  4. +0
    -1
      ge/ge_inference.mk
  5. +0
    -1
      ge/ge_runner.mk
  6. +0
    -12
      ge/graph/execute/graph_execute.cc
  7. +0
    -7
      ge/graph/load/graph_loader.cc
  8. +0
    -8
      ge/graph/load/model_manager/davinci_model.cc
  9. +0
    -4
      ge/hybrid/executor/hybrid_model_async_executor.cc
  10. +0
    -9
      ge/init/gelib.cc
  11. +0
    -265
      ge/omm/csa_interact.cc
  12. +0
    -183
      ge/omm/csa_interact.h
  13. +0
    -2
      tests/ut/ge/CMakeLists.txt
  14. +0
    -1
      tests/ut/ge/graph/execute/graph_execute_unittest.cc

+ 0
- 2
ge/CMakeLists.txt View File

@@ -341,7 +341,6 @@ set(TRAIN_SRC_LIST
"init/gelib.cc"
"model/ge_model.cc"
"model/ge_root_model.cc"
"omm/csa_interact.cc"
"opskernel_manager/ops_kernel_manager.cc"
"opskernel_manager/ops_kernel_builder_manager.cc"
"session/inner_session.cc"
@@ -416,7 +415,6 @@ set(TRAIN_SRC_LIST

set(INFER_SRC_LIST
"graph/manager/trans_var_data_utils.cc"
"omm/csa_interact.cc"
"common/fp16_t.cc"
"common/formats/utils/formats_trans_utils.cc"
"common/formats/format_transfers/datatype_transfer.cc"


+ 0
- 1
ge/executor/CMakeLists.txt View File

@@ -86,7 +86,6 @@ set(SRC_LIST
"../common/profiling/ge_profiling.cc"
"../graph/load/graph_loader.cc"
"../graph/execute/graph_execute.cc"
"../omm/csa_interact.cc"
"../graph/manager/graph_manager_utils.cc"
"../graph/manager/graph_var_manager.cc"
"../graph/manager/graph_mem_allocator.cc"


+ 0
- 1
ge/executor/module.mk View File

@@ -11,7 +11,6 @@ local_ge_executor_src_files := \
../common/profiling/ge_profiling.cc \
../graph/load/graph_loader.cc \
../graph/execute/graph_execute.cc \
../omm/csa_interact.cc \
../graph/manager/graph_manager_utils.cc \
../graph/manager/graph_var_manager.cc \
../graph/manager/rdma_pool_allocator.cc \


+ 0
- 1
ge/ge_inference.mk View File

@@ -4,7 +4,6 @@ COMMON_LOCAL_SRC_FILES := \
proto/fusion_model.proto \
proto/optimizer_priority.proto \
graph/manager/trans_var_data_utils.cc \
omm/csa_interact.cc \
common/fp16_t.cc \
common/formats/utils/formats_trans_utils.cc \
common/formats/format_transfers/datatype_transfer.cc \


+ 0
- 1
ge/ge_runner.mk View File

@@ -256,7 +256,6 @@ LIBGE_LOCAL_SRC_FILES := \
init/gelib.cc \
model/ge_model.cc \
model/ge_root_model.cc \
omm/csa_interact.cc \
opskernel_manager/ops_kernel_manager.cc \
opskernel_manager/ops_kernel_builder_manager.cc \
session/inner_session.cc \


+ 0
- 12
ge/graph/execute/graph_execute.cc View File

@@ -21,7 +21,6 @@

#include "graph/load/model_manager/model_manager.h"
#include "graph/load/model_manager/davinci_model.h"
#include "omm/csa_interact.h"

namespace ge {
using Uint32Pair = pair<uint32_t, uint32_t>;
@@ -490,12 +489,10 @@ Status GraphExecutor::AsyncExecuteModel(const GeRootModelPtr &ge_root_model, con
} catch (std::bad_alloc &) {
REPORT_INNER_ERROR("E19999", "Bad memory allocation exception occur failed");
GELOGE(MEMALLOC_FAILED, "RunAsync failed, bad memory allocation occur !");
CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
return MEMALLOC_FAILED;
} catch (...) {
REPORT_INNER_ERROR("E19999", "Some exceptions occur failed");
GELOGE(FAILED, "RunAsync failed, some exceptions occur !");
CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
return FAILED;
}

@@ -509,18 +506,15 @@ Status GraphExecutor::DataInput(const InputData &input_data, OutputData &output_
Status ret = model_manager->DataInput(input_data, output_data);
if (ret != SUCCESS) {
GELOGE(ret, "DataInput: DataInput failed.");
CsaInteract::GetInstance().WriteErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
return ret;
}
} catch (std::bad_alloc &) {
REPORT_INNER_ERROR("E19999", "Bad memory allocation exception occur failed");
GELOGE(MEMALLOC_FAILED, "DataInput failed, bad memory allocation occur !");
CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
return MEMALLOC_FAILED;
} catch (...) {
REPORT_INNER_ERROR("E19999", "Some exceptions occur failed");
GELOGE(FAILED, "DataInput failed, some exceptions occur !");
CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
return FAILED;
}

@@ -535,18 +529,15 @@ Status GraphExecutor::GetInputOutputDescInfo(const uint32_t model_id, vector<Inp
Status ret = model_manager->GetInputOutputDescInfo(model_id, input_desc, output_desc);
if (ret != SUCCESS) {
GELOGE(ret, "GetInputOutputDescInfo failed.");
CsaInteract::GetInstance().WriteErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
return ret;
}
} catch (std::bad_alloc &) {
REPORT_INNER_ERROR("E19999", "Bad memory allocation exception occur failed");
GELOGE(MEMALLOC_FAILED, "GetInputOutputDescInfo failed, bad memory allocation occur !");
CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
return MEMALLOC_FAILED;
} catch (...) {
REPORT_INNER_ERROR("E19999", "Some exceptions occur failed");
GELOGE(FAILED, "GetInputOutputDescInfo failed, some exceptions occur !");
CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
return FAILED;
}

@@ -564,18 +555,15 @@ Status GraphExecutor::GetInputOutputDescInfo(const uint32_t model_id, vector<Inp
new_model_desc);
if (ret != SUCCESS) {
GELOGE(ret, "GetInputOutputDescInfo failed.");
CsaInteract::GetInstance().WriteErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
return ret;
}
} catch (std::bad_alloc &) {
REPORT_INNER_ERROR("E19999", "Bad memory allocation exception occur failed");
GELOGE(MEMALLOC_FAILED, "GetInputOutputDescInfo failed, bad memory allocation occur !");
CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
return MEMALLOC_FAILED;
} catch (...) {
REPORT_INNER_ERROR("E19999", "Some exceptions occur failed");
GELOGE(FAILED, "GetInputOutputDescInfo failed, some exceptions occur !");
CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
return FAILED;
}



+ 0
- 7
ge/graph/load/graph_loader.cc View File

@@ -24,7 +24,6 @@
#include "graph/ge_context.h"
#include "graph/load/model_manager/model_manager.h"
#include "graph/manager/graph_var_manager.h"
#include "omm/csa_interact.h"

namespace ge {
Status GraphLoader::UnloadModel(uint32_t model_id) {
@@ -40,7 +39,6 @@ Status GraphLoader::UnloadModel(uint32_t model_id) {
ret = model_manager->Unload(model_id);
if (ret != SUCCESS) {
GELOGE(ret, "UnloadModel: Unload failed. model id:%u", model_id);
CsaInteract::GetInstance().WriteErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_UNLOAD);
return ret;
}
GELOGI("UnLoad model success, model id:%u.", model_id);
@@ -55,7 +53,6 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge
REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, ret:0x%X",
GetContext().DeviceId(), rt_ret);
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
CsaInteract::GetInstance().WriteErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_LOAD);
return RT_FAILED;
}
if (ge_root_model_ptr == nullptr) {
@@ -69,8 +66,6 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge
Status ret = model_manager->LoadModelOnline(model_id, ge_root_model_ptr, listener);
if (ret != SUCCESS) {
GELOGE(ret, "LoadModel: Load failed. ret = %u", ret);
CsaInteract::GetInstance().WriteErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_LOAD);

rt_ret = rtDeviceReset(GetContext().DeviceId());
if (rt_ret != RT_ERROR_NONE) {
REPORT_CALL_ERROR("E19999", "Call rtDeviceReset failed, device_id:%u, ret:0x%X",
@@ -94,7 +89,6 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge
}

GELOGE(ret, "LoadModel: Start failed.");
CsaInteract::GetInstance().WriteErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
return ret;
}
rt_ret = rtDeviceReset(GetContext().DeviceId());
@@ -247,7 +241,6 @@ Status GraphLoader::GetMemoryInfo(int64_t &free) {
REPORT_CALL_ERROR("E19999", "Call rtSetDevice failed, device_id:%u, ret:0x%X",
GetContext().DeviceId(), rt_ret);
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
CsaInteract::GetInstance().WriteErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_LOAD);
return RT_FAILED;
}
size_t total_mem = 0;


+ 0
- 8
ge/graph/load/model_manager/davinci_model.cc View File

@@ -50,7 +50,6 @@
#include "graph/utils/type_utils.h"
#include "init/gelib.h"
#include "mmpa/mmpa_api.h"
#include "omm/csa_interact.h"
#include "runtime/base.h"
#include "runtime/dev.h"
#include "runtime/event.h"
@@ -2718,7 +2717,6 @@ Status DavinciModel::ReturnNoOutput(uint32_t data_id) {

void *DavinciModel::Run(DavinciModel *model) {
GE_CHK_BOOL_EXEC(model != nullptr,
CsaInteract::GetInstance().WriteErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
return nullptr, "model_pointer is null!")
bool seq_end_flag = false;
uint32_t model_id = model->Id();
@@ -2742,7 +2740,6 @@ void *DavinciModel::Run(DavinciModel *model) {
bool rslt_flg = true;
if (model->GetDataInputer() == nullptr) {
GELOGW("Data inputer is nullptr.");
CsaInteract::GetInstance().StoreInternalErrorCode(FAILED, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
break;
}

@@ -2763,7 +2760,6 @@ void *DavinciModel::Run(DavinciModel *model) {
ret = model->SyncVarData();
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(
ret != SUCCESS, (void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput());
CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
continue, "Copy input data to model failed."); // [No need to check value]
GE_IF_BOOL_EXEC(model->is_first_execute_, GE_TIMESTAMP_EVENT_END(Model_SyncVarData, "Model Run SyncVarData"));

@@ -2773,7 +2769,6 @@ void *DavinciModel::Run(DavinciModel *model) {
ret = model->CopyInputData(current_data, false);
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(
ret != SUCCESS, (void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput());
CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
continue, "Copy input data to model failed."); // [No need to check value]
if (model->is_online_infer_dynamic_ && !model->is_getnext_sink_dynamic_) {
model->cur_dynamic_dims_.clear();
@@ -2794,7 +2789,6 @@ void *DavinciModel::Run(DavinciModel *model) {
rt_ret = rtModelExecute(model->rt_model_handle_, model->rt_model_stream_, 0);
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, rslt_flg = false;
(void)model->ReturnResult(current_data.index, false, false, data_wrapper->GetOutput());
CsaInteract::GetInstance().WriteErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC);
continue);
GELOGI("rtModelExecute end");
GE_IF_BOOL_EXEC(model->is_first_execute_, GE_TIMESTAMP_EVENT_END(rtModelExecute, "GraphExcute::rtModelExecute"));
@@ -2812,7 +2806,6 @@ void *DavinciModel::Run(DavinciModel *model) {
rt_ret != RT_ERROR_NONE, rslt_flg = false; GELOGI("seq_end_flg: %d", seq_end_flag);
(void)model->ReturnResult(current_data.index, false, seq_end_flag,
data_wrapper->GetOutput()); // [No need to check value]
CsaInteract::GetInstance().StoreInternalErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC);
continue);
}

@@ -2841,7 +2834,6 @@ void *DavinciModel::Run(DavinciModel *model) {
GELOGI("run iterator count is %lu, model_id:%u", model->iterator_count_, model->model_id_);
}

CsaInteract::GetInstance().WriteInternalErrorCode();
GELOGI("Model run end, model id:%u", model->model_id_);
return nullptr;
}


+ 0
- 4
ge/hybrid/executor/hybrid_model_async_executor.cc View File

@@ -19,7 +19,6 @@
#include "graph/utils/tensor_utils.h"
#include "graph/utils/type_utils.h"
#include "graph/ge_context.h"
#include "omm/csa_interact.h"

namespace ge {
namespace hybrid {
@@ -163,7 +162,6 @@ Status HybridModelAsyncExecutor::RunInternal() {
ret = PreRun(current_data, args);
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(
ret != SUCCESS, (void) HandleResult(ret, current_data.index, args, data_wrapper->GetOutput());
CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_EXEC);
continue, "[Invoke][PreRun] failed, model_id:%u.", model_id_); // [No need to check value]

if (pipe_executor_ != nullptr) {
@@ -181,7 +179,6 @@ Status HybridModelAsyncExecutor::RunInternal() {
}
ret = HandleResult(ret, current_data.index, args, data_wrapper->GetOutput());
if (ret != SUCCESS) {
CsaInteract::GetInstance().StoreInternalErrorCode(ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_EXEC);
continue;
}

@@ -191,7 +188,6 @@ Status HybridModelAsyncExecutor::RunInternal() {
GELOGI("run iterator count is %lu, model_id:%u", iterator_count_, model_id_);
}

CsaInteract::GetInstance().WriteInternalErrorCode();
GELOGI("Model run end, model id:%u", model_id_);
return SUCCESS;
}


+ 0
- 9
ge/init/gelib.cc View File

@@ -42,7 +42,6 @@
#include "graph/manager/graph_mem_allocator.h"
#include "graph/manager/host_mem_manager.h"
#include "graph/manager/graph_var_manager.h"
#include "omm/csa_interact.h"
#include "runtime/kernel.h"
#include "opskernel_manager/ops_kernel_builder_manager.h"
#include "external/runtime/rt_error_codes.h"
@@ -376,10 +375,6 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status GELib::InitSystemWithOpt
}

GE_CHK_STATUS_RET(HostMemManager::Instance().Initialize());
// Update CSA file
CsaInteract::GetInstance().Init(options.device_id, GetContext().TraceId());
Status ret = CsaInteract::GetInstance().WriteJobState(JOBSTATE_RUNNING, JOBSUBSTATE_ENV_INIT);
GE_LOGE_IF(ret != SUCCESS, "[Write][JobState] failed, ret:%u ", ret);

// set device id
GELOGI("set logical device id:%u", options.device_id);
@@ -408,10 +403,6 @@ Status GELib::SystemShutdownWithOptions(const Options &options) {

GE_CHK_RT(rtDeviceReset(options.device_id));

// Update CSA file
Status ret = CsaInteract::GetInstance().WriteJobState(JOBSTATE_SUCCEED);
GE_LOGE_IF(ret != SUCCESS, "[Write][JobState] failed, ret:%u ", ret);

is_system_inited = false;
is_shutdown = true;
GELOGI("%s finalize GELib success.", mode.c_str());


+ 0
- 265
ge/omm/csa_interact.cc View File

@@ -1,265 +0,0 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "omm/csa_interact.h"

#include "framework/common/debug/ge_log.h"
#include "framework/common/debug/log.h"
#include "framework/common/util.h"
#include "graph/ge_context.h"
#include "graph/manager/graph_var_manager.h"
#include "graph/utils/tensor_utils.h"
#include "mmpa/mmpa_api.h"
#include "nlohmann/json.hpp"

namespace ge {
namespace {
// Name of the environment variable that supplies the path prefix of the status files.
constexpr char FMK_STATUS_FILE_DIR_ENV[] = "FMK_STATUS_FILE_DIR";
// File name of the job state file.
constexpr char JOBSTATE_FILE_NAME[] = "jobstateupdate_framework";
// File name of the network connectivity detection file.
constexpr char HCOM_DETECT_FILE_NAME[] = "hcom_detection_result";
// Separator inserted between the per-device directory and the file name.
constexpr char FILE_SEPARATE[] = "/";
}  // namespace

///
/// @brief Access the single, shared CsaInteract object
/// @return Reference to the function-local static CsaInteract instance
///
CsaInteract &CsaInteract::GetInstance() {
  static CsaInteract singleton;
  return singleton;
}

///
/// @brief CsaInteract instance initialization
/// @param [in] dev_index device index
/// @param [in] job_id job id
/// @return void
///
void CsaInteract::Init(int32_t dev_index, int64_t job_id) {
if (!is_init_) {
dev_index_ = dev_index;
job_id_ = job_id;

char file_dir_env[MMPA_MAX_PATH] = { 0x00 };
INT32 res = mmGetEnv(FMK_STATUS_FILE_DIR_ENV, file_dir_env, MMPA_MAX_PATH);
string csa_path_prefix;
if (res == EN_OK) {
csa_path_prefix = file_dir_env;
}
if (!csa_path_prefix.empty()) {
job_state_file_ = csa_path_prefix + std::to_string(dev_index_) + FILE_SEPARATE + JOBSTATE_FILE_NAME;
hcom_detect_file_ = csa_path_prefix + std::to_string(dev_index_) + FILE_SEPARATE + HCOM_DETECT_FILE_NAME;
}
is_init_ = true;
}
}

///
/// @brief Serialize the job state as JSON and write it to the job state file
/// @param [in] job_state job state to record
/// @param [in] job_sub_state detailed job state (recorded for RUNNING and FAILED only)
/// @param [in] module_ret_errcode sub module error code (recorded for FAILED only)
/// @param [in] error_module module the error is attributed to (recorded for FAILED only)
/// @return INTERNAL_ERROR when not initialized, when the JSON cannot be built or when the file
///         write fails; SUCCESS otherwise, including the cases where nothing is written
///
Status CsaInteract::WriteJobState(JobState job_state, JobSubState job_sub_state, uint32_t module_ret_errcode,
                                  ErrorModule error_module) {
  if (!is_init_) {
    GELOGE(INTERNAL_ERROR, "[Init][CsaInteract] obj has not init, can't WriteJobState");
    REPORT_INNER_ERROR("E19999", "WriteJobState failed before init. ");
    return INTERNAL_ERROR;
  }

  // Once FAILED or KILLED has been recorded no further update is written; likewise there is
  // nothing to do when no job state file was configured.
  const bool already_finished = (curr_state_ == JOBSTATE_FAILED) || (curr_state_ == JOBSTATE_KILLED);
  if (already_finished || job_state_file_.empty()) {
    return SUCCESS;
  }

  const bool is_failure = (job_state == JOBSTATE_FAILED);
  const bool with_sub_state = is_failure || (job_state == JOBSTATE_RUNNING);

  std::string serialized;
  try {
    nlohmann::json record;
    record["job_id"] = job_id_;
    record["jobstate"] = job_state;
    if (with_sub_state) {
      record["job_sub_state"] = job_sub_state;
    }
    record["time"] = CurrentTimeInStr();
    if (is_failure) {
      record["errorcode"] = module_ret_errcode;
      record["errmodule"] = error_module;
    }
    serialized = record.dump();
  } catch (const nlohmann::json::exception &e) {
    GELOGE(INTERNAL_ERROR, "[Create][JsonObject] exception:%s job_state:%u job_sub_state:%u.",
           e.what(), job_state, job_sub_state);
    REPORT_INNER_ERROR("E19999", "Create json object failed. exception:%s job_state:%u job_sub_state:%u.",
                       e.what(), job_state, job_sub_state);
    return INTERNAL_ERROR;
  }

  // WriteFile logs its own failures, so no additional log is emitted here.
  const Status write_status = WriteFile(job_state_file_, serialized);
  if (write_status == SUCCESS) {
    curr_state_ = job_state;
    return SUCCESS;
  }
  return INTERNAL_ERROR;
}

///
/// @brief Update error code in the job state file
/// @param [in] module_ret_errcode sub module training failure error code
/// @param [in] error_module error module identified by FMK
/// @param [in] job_sub_state detailed job state
/// @return void
///
void CsaInteract::WriteErrorCode(uint32_t module_ret_errcode, ErrorModule error_module, JobSubState job_sub_state) {
// The error log subfunction has been printed and will not print again
Status ret = WriteJobState(JOBSTATE_FAILED, job_sub_state, module_ret_errcode, error_module);
if (ret != SUCCESS) {
GELOGW("write error code fail. ret_code: %u, status: %u", module_ret_errcode, job_sub_state);
}
}

///
/// @brief Record errors that occurred durning the training
/// @param [in] module_ret_errcode sub module training failure error code
/// @param [in] error_module error module identified by FMK
/// @param [in] job_sub_state detailed job state
/// @return void
///
void CsaInteract::StoreInternalErrorCode(uint32_t module_ret_errcode, ErrorModule error_module,
JobSubState job_sub_state) {
is_have_internal_error_ = true;

csa_error_code_.module_ret_errcode = module_ret_errcode;
csa_error_code_.error_module = error_module;
csa_error_code_.job_sub_state = job_sub_state;
}

///
/// @brief Update training error code in the job state file
/// @return void
///
void CsaInteract::WriteInternalErrorCode() {
if (is_have_internal_error_) {
WriteErrorCode(csa_error_code_.module_ret_errcode, csa_error_code_.error_module, csa_error_code_.job_sub_state);
}
}

///
/// @brief Update network connectivity detect file
/// @param [in] content network connectivity content, written to the file as-is
/// @return INTERNAL_ERROR when the instance is not initialized or the write fails;
///         SUCCESS when the content was written or no detect file is configured
///
Status CsaInteract::WriteHcomDetection(const std::string &content) {
  if (!is_init_) {
    // The log names this function (the message used to be a copy of the WriteJobState one).
    GELOGE(INTERNAL_ERROR, "[Init][CsaInteract] obj has not init, can't WriteHcomDetection");
    REPORT_INNER_ERROR("E19999", "WriteHcomDetection failed before init.");
    return INTERNAL_ERROR;
  }

  // An empty file name means no status directory was configured: nothing to write.
  if (hcom_detect_file_.empty()) {
    return SUCCESS;
  }

  return WriteFile(hcom_detect_file_, content);
}

///
/// @ingroup WriteFile
/// @brief Write the content into the file, truncating it first. If the file cannot be opened,
///        the parent directories are created and the open is retried once
/// @param [in] file_name: File name to be written
/// @param [in] content: Contents to be written
/// @return SUCCESS when the whole content was written and the file was closed,
///         INTERNAL_ERROR otherwise
///
Status CsaInteract::WriteFile(const std::string &file_name, const std::string &content) {
  const INT32 flags = M_WRONLY | O_TRUNC | M_CREAT;
  int32_t fd = mmOpen2(file_name.c_str(), flags, M_IRUSR | M_IWUSR | M_UMASK_GRPREAD);
  if (fd == EN_ERROR) {
    // The open may have failed because the directory is missing: create it and try again.
    if (MakePath(file_name) != SUCCESS) {
      // errno is captured once so that the logging call cannot change the reported value.
      const int saved_errno = errno;
      GELOGE(INTERNAL_ERROR, "[Create][File Path] errno is %d", saved_errno);
      REPORT_CALL_ERROR("E19999", "MakePath failed. errno is %d", saved_errno);
      return INTERNAL_ERROR;
    }
    fd = mmOpen2(file_name.c_str(), flags, M_IRUSR | M_IWUSR | M_UMASK_GRPREAD);
    if (fd == EN_ERROR) {
      const int saved_errno = errno;
      GELOGE(INTERNAL_ERROR, "[Open][File] errno is %d file_name: %s", saved_errno, file_name.c_str());
      REPORT_CALL_ERROR("E19999", "mmOpen2 failed. errno is %d file_name: %s", saved_errno, file_name.c_str());
      return INTERNAL_ERROR;
    }
  }

  // Keep writing until every byte is out. NOTE(review): mmWrite is assumed to behave like
  // write(2) and may accept fewer bytes than requested - confirm against the mmpa API.
  bool write_ok = true;
  const char *cursor = content.c_str();
  size_t remaining = content.length();
  while (remaining > 0) {
    const mmSsize_t written = mmWrite(fd, reinterpret_cast<void *>(const_cast<char *>(cursor)), remaining);
    // A non-positive or implausible count is treated as failure; this also guarantees progress.
    if ((written <= 0) || (static_cast<size_t>(written) > remaining)) {
      const int saved_errno = errno;
      GELOGE(INTERNAL_ERROR, "[Write][File] errno is %d", saved_errno);
      REPORT_CALL_ERROR("E19999", "mmWrite failed. errno is %d", saved_errno);
      write_ok = false;
      break;
    }
    cursor += written;
    remaining -= static_cast<size_t>(written);
  }

  // The descriptor is closed on both the success and the failure path.
  bool close_ok = true;
  if (mmClose(fd) == EN_ERROR) {
    const int saved_errno = errno;
    GELOGE(INTERNAL_ERROR, "[Close][File] error is %d", saved_errno);
    REPORT_CALL_ERROR("E19999", "mmClose failed. error is %d", saved_errno);
    close_ok = false;
  }

  return (write_ok && close_ok) ? SUCCESS : INTERNAL_ERROR;
}

///
/// @ingroup MakePath
/// @brief Verify whether the directory part of the file path exists, if not, create every
///        missing directory level from the first '/' downwards
/// @param [in] file_name: File name to be verified
/// @return PARAM_INVALID when file_name contains no '/', INTERNAL_ERROR when a directory
///         cannot be created, SUCCESS otherwise
///
Status CsaInteract::MakePath(const std::string &file_name) {
  const std::size_t last_sep = file_name.find_last_of('/');
  if (last_sep == std::string::npos) {
    return PARAM_INVALID;
  }

  // Directory part including the trailing '/'.
  const std::string dir_path = file_name.substr(0, last_sep + 1);
  if (mmAccess(dir_path.c_str()) == EN_OK) {
    return SUCCESS;
  }

  // Walk the path one separator at a time and create each level that is missing.
  for (std::size_t sep = dir_path.find('/'); sep != std::string::npos; sep = dir_path.find('/', sep + 1)) {
    const std::string level = dir_path.substr(0, sep + 1);
    if (mmAccess(level.c_str()) == EN_OK) {
      continue;
    }
    if (mmMkdir(level.c_str(), M_IRWXU) == EN_OK) {
      continue;
    }
    const int mkdir_errno = errno;
    // The directory may have been created by someone else between the access check and the
    // mkdir call; if it exists now, that is not an error.
    if (mmAccess(level.c_str()) == EN_OK) {
      continue;
    }
    GELOGE(INTERNAL_ERROR, "[Create][FileDir] fail, errno is %d, pre_path:%s", mkdir_errno, level.c_str());
    REPORT_CALL_ERROR("E19999", "mmMkdir failed. errno is %d pre_path:%s", mkdir_errno, level.c_str());
    return INTERNAL_ERROR;
  }

  return SUCCESS;
}
} // namespace ge

+ 0
- 183
ge/omm/csa_interact.h View File

@@ -1,183 +0,0 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef GE_OMM_CSA_INTERACT_H_
#define GE_OMM_CSA_INTERACT_H_

#include <string>

#include "framework/common/ge_inner_error_codes.h"

namespace ge {
// Overall state of a job, written to the "jobstate" field of the job state file.
enum JobState {
  JOBSTATE_WAITING = 1,
  JOBSTATE_RUNNING = 2,
  JOBSTATE_KILLING = 3,
  JOBSTATE_SUCCEED = 4,
  JOBSTATE_FAILED = 5,
  JOBSTATE_KILLED = 6,
  JOBSTATE_UNKOWN = 7
};

// Detailed state of a job, written to the "job_sub_state" field of the job state file.
enum JobSubState {
  JOBSUBSTATE_ENV_INIT = 201,
  JOBSUBSTATE_ENV_FIN = 202,
  JOBSUBSTATE_RESOUCE_ALLOC = 203,
  JOBSUBSTATE_MODEL_COMPILE = 204,
  JOBSUBSTATE_GRAPH_PREPARE = 205,
  JOBSUBSTATE_GRAPH_SPLIT = 206,
  JOBSUBSTATE_GRAPH_OPTIMIZE = 207,
  JOBSUBSTATE_GRAPH_BUILD = 208,
  JOBSUBSTATE_GRAPH_LOAD = 209,
  JOBSUBSTATE_GRAPH_EXEC = 210,
  JOBSUBSTATE_GRAPH_UNLOAD = 211,
  JOBSUBSTATE_OTHER = 212
};

// Module an error is attributed to, written to the "errmodule" field of the job state file.
// The numeric values are not contiguous and must be kept as they are.
enum ErrorModule {
  ERROR_MODULE_DRIVER  = 0x01,
  ERROR_MODULE_RUNTIME = 0x04,
  ERROR_MODULE_CCE     = 0x06,
  ERROR_MODULE_FMK     = 0x08,
  ERROR_MODULE_HCCL    = 0x12
};

// Error details kept by CsaInteract until they are written to the job state file.
struct CsaErrorCode {
  CsaErrorCode() {}
  ~CsaErrorCode() {}

  // error code returned by the failing sub module
  uint32_t module_ret_errcode = 0;
  // module the error is attributed to
  ErrorModule error_module = ERROR_MODULE_FMK;
  // detailed job state at the time of the error
  JobSubState job_sub_state = JOBSUBSTATE_OTHER;
};
// Singleton that writes job state and network connectivity information to status files.
class CsaInteract {
 public:
  ///
  /// @brief Access the single, shared CsaInteract object
  /// @return CsaInteract instance
  ///
  static CsaInteract& GetInstance();

  ///
  /// @brief One-time initialization; later calls are no-ops
  /// @param [in] dev_index device index
  /// @param [in] job_id job id
  /// @return void
  ///
  void Init(int32_t dev_index, int64_t job_id);

  ///
  /// @brief Write the given job state to the job state file
  /// @param [in] job_state job state
  /// @param [in] job_sub_state detailed job state
  /// @param [in] module_ret_errcode error code of the failing sub module
  /// @param [in] error_module module the error is attributed to
  /// @return Status
  ///
  Status WriteJobState(JobState job_state,
                       JobSubState job_sub_state = JOBSUBSTATE_OTHER,
                       uint32_t module_ret_errcode = SUCCESS,
                       ErrorModule error_module = ERROR_MODULE_FMK);

  ///
  /// @brief Write a FAILED job state with the given error details to the job state file
  /// @param [in] module_ret_errcode error code of the failing sub module
  /// @param [in] error_module module the error is attributed to
  /// @param [in] job_sub_state detailed job state
  /// @return void
  ///
  void WriteErrorCode(uint32_t module_ret_errcode, ErrorModule error_module,
                      JobSubState job_sub_state);

  ///
  /// @brief Remember an error so that WriteInternalErrorCode can write it later
  /// @param [in] module_ret_errcode error code of the failing sub module
  /// @param [in] error_module module the error is attributed to
  /// @param [in] job_sub_state detailed job state
  /// @return void
  ///
  void StoreInternalErrorCode(uint32_t module_ret_errcode,
                              ErrorModule error_module,
                              JobSubState job_sub_state);

  ///
  /// @brief Write the error remembered by StoreInternalErrorCode, if there is one
  /// @return void
  ///
  void WriteInternalErrorCode();

  ///
  /// @brief Write the network connectivity detect file
  /// @param [in] content network connectivity content
  /// @return Status
  ///
  Status WriteHcomDetection(const std::string& content);

 private:
  // Construction is private: the only instance is the one returned by GetInstance().
  CsaInteract() {}

  ~CsaInteract() {}

  // Neither copyable nor movable.
  CsaInteract(const CsaInteract&) = delete;
  CsaInteract(CsaInteract&&) = delete;
  CsaInteract& operator=(const CsaInteract&) = delete;
  CsaInteract& operator=(CsaInteract&&) = delete;

  ///
  /// @ingroup WriteFile
  /// @brief Write the content into the file, creating the file if it does not exist
  /// @param [in] file_name: File name to be written
  /// @param [in] content: Contents to be written
  /// @return Status
  ///
  Status WriteFile(const std::string& file_name, const std::string& content);

  ///
  /// @ingroup MakePath
  /// @brief Make sure the directory part of the file path exists, creating missing levels
  /// @param [in] file_name: File name to be verified
  /// @return Status
  ///
  Status MakePath(const std::string& file_name);

  // device index
  int32_t dev_index_ = 0;
  // job id
  int64_t job_id_ = 0;
  // set once Init() has run
  bool is_init_ = false;
  // job state most recently written to the job state file
  JobState curr_state_ = JOBSTATE_UNKOWN;
  // job state file (empty when no status directory is configured)
  std::string job_state_file_;
  // network connectivity detect file (empty when no status directory is configured)
  std::string hcom_detect_file_;
  // true once StoreInternalErrorCode() has recorded an error
  bool is_have_internal_error_ = false;
  // error details recorded by StoreInternalErrorCode()
  CsaErrorCode csa_error_code_;
};
} // namespace ge

#endif // GE_OMM_CSA_INTERACT_H_


+ 0
- 2
tests/ut/ge/CMakeLists.txt View File

@@ -287,7 +287,6 @@ set(COMMON_SRC_FILES
"${GE_CODE_DIR}/ge/graph/load/model_manager/zero_copy_task.cc"
"${GE_CODE_DIR}/ge/graph/load/model_manager/cpu_queue_schedule.cc"
"${GE_CODE_DIR}/ge/graph/load/model_manager/aipp_utils.cc"
"${GE_CODE_DIR}/ge/omm/csa_interact.cc"
"${GE_CODE_DIR}/ge/graph/load/model_manager/tbe_handle_store.cc"
"${GE_CODE_DIR}/ge/common/kernel_store.cc"
"${GE_CODE_DIR}/ge/common/tbe_kernel_store.cc"
@@ -391,7 +390,6 @@ set(GRAPH_PARTITION_COMMON_SRC_FILES
set(GRAPH_LOAD_COMMON_SRC_FILES
"${GE_CODE_DIR}/ge/graph/load/graph_loader.cc"
"${GE_CODE_DIR}/ge/graph/manager/graph_manager_utils.cc"
"${GE_CODE_DIR}/ge/omm/csa_interact.cc"
"${GE_CODE_DIR}/ge/graph/manager/graph_mem_allocator.cc"
"${GE_CODE_DIR}/ge/graph/manager/graph_var_manager.cc"
"${GE_CODE_DIR}/ge/graph/manager/trans_var_data_utils.cc"


+ 0
- 1
tests/ut/ge/graph/execute/graph_execute_unittest.cc View File

@@ -22,7 +22,6 @@
#include "graph/execute/graph_execute.h"
#include "graph/load/model_manager/model_manager.h"
#include "graph/load/model_manager/davinci_model.h"
#include "omm/csa_interact.h"
#undef private
#undef public



Loading…
Cancel
Save