From ade84fddb9b086fe5919163f4ddefc1d5ed028a1 Mon Sep 17 00:00:00 2001 From: zhupuxu Date: Fri, 16 Jul 2021 15:02:04 +0800 Subject: [PATCH] delete report Signed-off-by: zhupuxu --- ge/CMakeLists.txt | 9 +- ge/client/ge_api.cc | 88 ++++++ ge/common/CMakeLists.txt | 1 + ge/common/profiling/ge_profiling.cc | 230 --------------- ge/common/profiling/ge_runner_profiling.cc | 26 -- ge/common/profiling/profiling_init.cc | 246 ++++++++++++++++ ge/common/profiling/profiling_init.h | 54 ++++ ge/common/profiling/profiling_manager.cc | 325 +++++++++++++++------ ge/common/profiling/profiling_manager.h | 36 ++- ge/common/profiling/profiling_properties.cc | 110 +++++++ ge/common/profiling/profiling_properties.h | 62 ++++ ge/executor/ge_executor.cc | 2 - ge/graph/build/task_generator.cc | 16 +- ge/graph/build/task_generator.h | 2 +- ge/graph/execute/graph_execute.cc | 2 +- ge/graph/manager/graph_manager.cc | 4 - ge/init/gelib.cc | 20 +- ge/session/inner_session.cc | 10 + inc/framework/common/profiling/ge_profiling.h | 24 -- .../{ge_runner_profiling.h => profiling_init.h} | 0 third_party/fwkacllib/inc/runtime/base.h | 27 ++ .../fwkacllib/inc/toolchain/prof_callback.h | 9 - 22 files changed, 880 insertions(+), 423 deletions(-) delete mode 100644 ge/common/profiling/ge_runner_profiling.cc create mode 100644 ge/common/profiling/profiling_init.cc create mode 100644 ge/common/profiling/profiling_init.h create mode 100644 ge/common/profiling/profiling_properties.cc create mode 100644 ge/common/profiling/profiling_properties.h rename inc/framework/common/profiling/{ge_runner_profiling.h => profiling_init.h} (100%) diff --git a/ge/CMakeLists.txt b/ge/CMakeLists.txt index f98297d8..e9a2f4ca 100755 --- a/ge/CMakeLists.txt +++ b/ge/CMakeLists.txt @@ -107,6 +107,8 @@ target_link_libraries(ge_proto_client PRIVATE ) endif () +set( + ################################################################## set(EXECUTOR_SRC_LIST "common/dump/dump_op.cc" @@ -259,10 +261,11 @@ set(EXECUTOR_SRC_LIST ################################################################## set(COMPILER_SRC_LIST "analyzer/analyzer.cc" - "common/dump/dump_op.cc" + #"common/dump/dump_op.cc" "common/ge/op_tiling_manager.cc" "common/ge/plugin_manager.cc" - "common/profiling/profiling_manager.cc" + #"common/helper/model_cache_helper.cc" + #"common/profiling/profiling_manager.cc" "engine_manager/dnnengine_manager.cc" "ge_local_engine/engine/host_cpu_engine.cc" "ge_opt_info/ge_opt_info.cc" @@ -473,7 +476,7 @@ set(RUNNER_SRC_LIST "client/ge_api.cc" "session/inner_session.cc" "session/session_manager.cc" - "common/profiling/ge_runner_profiling.cc" + "common/profiling/profiling_init.cc" "graph/manager/memory_api.cc" "graph/manager/util/hcom_util.cc" "graph/load/model_manager/task_info/hccl_task_info.cc" diff --git a/ge/client/ge_api.cc b/ge/client/ge_api.cc index e4a016b3..daf8deaf 100644 --- a/ge/client/ge_api.cc +++ b/ge/client/ge_api.cc @@ -35,6 +35,10 @@ #include "common/util/error_manager/error_manager.h" #include "toolchain/plog.h" #include "ir_build/option_utils.h" +#include "framework/common/ge_types.h" +#include "external/ge/ge_api_types.h" +#include "graph/ge_context.h" +#include "common/profiling/profiling_init.h" using domi::OpRegistry; using std::map; @@ -43,6 +47,84 @@ using std::vector; namespace { const int32_t kMaxStrLen = 128; +const int kDecimal = 10; +const int kDefaultDeviceIdForTrain = 0; +const int kDefaultDeviceIdForInfer = -1; +const char *const kGlobalOptionFpCeilingModeDefault = "2"; + +void InitOptions(const map &option_map, ge::Options &options) { + GELOGI("InitOptions start"); + options.session_id = 0; + auto is_train_mode = false; + auto iter = option_map.find(ge::OPTION_GRAPH_RUN_MODE); + if (iter != option_map.end()) { + if (ge::GraphRunMode(std::strtol(iter->second.c_str(), nullptr, kDecimal)) >= ge::TRAIN) { + is_train_mode = true; + } + } + iter = option_map.find(ge::OPTION_EXEC_SESSION_ID); + if (iter != option_map.end()) { + options.session_id = std::strtoll(iter->second.c_str(), nullptr, kDecimal); + } + options.device_id = is_train_mode ? kDefaultDeviceIdForTrain : kDefaultDeviceIdForInfer; + iter = option_map.find(ge::OPTION_EXEC_DEVICE_ID); + if (iter != option_map.end()) { + options.device_id = static_cast(std::strtol(iter->second.c_str(), nullptr, kDecimal)); + } + iter = option_map.find(ge::OPTION_EXEC_JOB_ID); + if (iter != option_map.end()) { + options.job_id = iter->second.c_str(); + } + options.isUseHcom = false; + iter = option_map.find(ge::OPTION_EXEC_IS_USEHCOM); + if (iter != option_map.end()) { + std::istringstream(iter->second) >> options.isUseHcom; + } + options.isUseHvd = false; + iter = option_map.find(ge::OPTION_EXEC_IS_USEHVD); + if (iter != option_map.end()) { + std::istringstream(iter->second) >> options.isUseHvd; + } + options.deployMode = false; + iter = option_map.find(ge::OPTION_EXEC_DEPLOY_MODE); + if (iter != option_map.end()) { + std::istringstream(iter->second) >> options.deployMode; + } + iter = option_map.find(ge::OPTION_EXEC_POD_NAME); + if (iter != option_map.end()) { + options.podName = iter->second.c_str(); + } + iter = option_map.find(ge::OPTION_EXEC_PROFILING_MODE); + if (iter != option_map.end()) { + options.profiling_mode = iter->second.c_str(); + } + iter = option_map.find(ge::OPTION_EXEC_PROFILING_OPTIONS); + if (iter != option_map.end()) { + options.profiling_options = iter->second.c_str(); + } + iter = option_map.find(ge::OPTION_EXEC_RANK_ID); + if (iter != option_map.end()) { + options.rankId = std::strtoll(iter->second.c_str(), nullptr, kDecimal); + } + iter = option_map.find(ge::OPTION_EXEC_RANK_TABLE_FILE); + if (iter != option_map.end()) { + options.rankTableFile = iter->second.c_str(); + } + options.enable_atomic = true; + iter = option_map.find(ge::OPTION_EXEC_ATOMIC_FLAG); + GE_IF_BOOL_EXEC(iter != option_map.end(), + options.enable_atomic = std::strtol(iter->second.c_str(), nullptr, kDecimal)); + GELOGI("ge InnerInitialize, the enable_atomic_flag in options_ is %d", options.enable_atomic); +} + +void InitProfiling(ge::Options &options) { + GELOGI("InitProfiling start"); + ge::GetContext().Init(); + // Profiling init + if (ge::ProfilingInit::Instance().Init(options) != ge::SUCCESS) { + GELOGW("Profiling init failed."); + } +} } // namespace static bool g_ge_initialized = false; @@ -91,6 +173,7 @@ Status CheckOptionsValid(const std::map &options) { // Initialize GE, prepare for execution, call GELib::Initialize Status GEInitializeImpl(const std::map &options) { ErrorManager::GetInstance().GenWorkStreamIdDefault(); + GELOGI("GEInitializeImpl start"); GELOGT(TRACE_INIT, "GEInitialize start"); std::string path_base = ge::GELib::GetPath(); auto ret = ErrorManager::GetInstance().Init(path_base); @@ -128,6 +211,9 @@ Status GEInitializeImpl(const std::map &options) { if (CheckOptionsValid(options) != SUCCESS) { return FAILED; } + ge::Options str_options; + InitOptions(options, str_options); + InitProfiling(str_options); GE_TIMESTAMP_END(CheckOptionsValid, "GEInitialize::CheckOptionsValid"); ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOpsProtoInit); @@ -173,6 +259,7 @@ Status GEInitializeImpl(const std::map &options) { // Initialize GE, prepare for execution, call GELib::Initialize Status GEInitialize(const std::map &options) { + GELOGI("GEInitialize with string"); ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOther); if (DlogReportInitialize() != SUCCESS) { GELOGW("Dlog report device log initialize failed."); @@ -181,6 +268,7 @@ Status GEInitialize(const std::map &options) { } Status GEInitialize(const std::map &options) { + GELOGI("GEInitialize with AscendString"); ErrorManager::GetInstance().SetStage(error_message::kInitialize, error_message::kOther); std::map str_options; for (auto &option : options) { diff --git a/ge/common/CMakeLists.txt b/ge/common/CMakeLists.txt index 99d6ead3..a1844051 100755 --- a/ge/common/CMakeLists.txt +++ b/ge/common/CMakeLists.txt @@ -50,6 +50,7 @@ set(SRC_LIST "${GE_CODE_DIR}/ge/common/transop_util.cc" "${GE_CODE_DIR}/ge/common/types.cc" "${GE_CODE_DIR}/ge/common/util.cc" + "${GE_CODE_DIR}/ge/common/profiling/profiling_properties.cc" ) if (NOT ENABLE_D AND NOT ENABLE_ACL) diff --git a/ge/common/profiling/ge_profiling.cc b/ge/common/profiling/ge_profiling.cc index a5857b35..573e299a 100644 --- a/ge/common/profiling/ge_profiling.cc +++ b/ge/common/profiling/ge_profiling.cc @@ -19,245 +19,15 @@ #include "common/profiling/profiling_manager.h" #include "framework/common/debug/ge_log.h" #include "framework/common/debug/log.h" -#include "graph/load/graph_loader.h" -#include "graph/ge_context.h" -#include "init/gelib.h" #include "framework/common/ge_inner_error_codes.h" #include "common/model/ge_model.h" -#include "framework/omg/omg_inner_types.h" namespace { -const uint32_t kDeviceListIndex = 3; -const std::string kDeviceNums = "devNums"; -const std::string kDeviceIdList = "devIdList"; -const std::string kProfilingInit = "prof_init"; -const std::string kProfilingFinalize = "prof_finalize"; -const std::string kProfilingStart = "prof_start"; -const std::string kProfilingStop = "prof_stop"; -const std::string kProfModelSubscribe = "prof_model_subscribe"; -const std::string kProfModelUnsubscribe = "prof_model_cancel_subscribe"; -const std::string kRtSetDeviceRegName = "profiling"; -const std::string kPofilingModelId = "modelId"; - -const std::map kProfCommandTypeMap = { - {kProfCommandhandleInit, kProfilingInit}, - {kProfCommandhandleStart, kProfilingStart}, - {kProfCommandhandleStop, kProfilingStop}, - {kProfCommandhandleFinalize, kProfilingFinalize}, - {kProfCommandhandleModelSubscribe, kProfModelSubscribe}, - {kProfCommandhandleModelUnsubscribe, kProfModelUnsubscribe}}; - const uint64_t kModelId = ge::INVALID_MODEL_ID; const uint16_t kStepStart = 0; const uint16_t kStepEnd = 1; - -ge::Status NeedUnsubscribe(ProfCommandHandleType type, bool is_subscribe, - uint32_t graph_id, vector &prof_params) { - if (type == kProfCommandhandleModelUnsubscribe && is_subscribe) { - prof_params.clear(); - prof_params.emplace_back(kPofilingModelId); - uint32_t model_id = 0; - auto ret = ge::ProfilingManager::Instance().GetModelIdFromGraph(graph_id, model_id); - if (ret != ge::SUCCESS) { - GELOGE(ret, "graph_id:%u not not found", graph_id); - return ret; - } - prof_params.emplace_back(std::to_string(model_id)); - } - return ge::SUCCESS; -} } // namespace -bool TransProfConfigToParam(const ProfCommandHandleData &profCommand, vector &prof_config_params) { - prof_config_params.clear(); - prof_config_params.emplace_back(kDeviceNums); - prof_config_params.emplace_back(std::to_string(profCommand.devNums)); - prof_config_params.emplace_back(kDeviceIdList); - std::string devID = ""; - if (profCommand.devNums == 0) { - GELOGW("The device num is invalid."); - return false; - } - for (uint32_t i = 0; i < profCommand.devNums; i++) { - devID.append(std::to_string(profCommand.devIdList[i])); - if (i != profCommand.devNums - 1) { - devID.append(","); - } - } - - prof_config_params.push_back(devID); - return true; -} - -bool isProfConfigValid(const uint32_t *deviceid_list, uint32_t device_nums) { - if (deviceid_list == nullptr) { - GELOGE(ge::PARAM_INVALID, "[Check][DeviceIDList]Invalid, it is nullptr"); - REPORT_INNER_ERROR("E19999", "Device id list is nullptr"); - return false; - } - if (device_nums == 0 || device_nums > MAX_DEV_NUM) { - GELOGE(ge::PARAM_INVALID, "[Check][DeviceNums]Invalid, device nums: %u", device_nums); - REPORT_INNER_ERROR("E19999", "DeviceNums %u check invalid", device_nums); - return false; - } - - // real device num - int32_t dev_count = 0; - rtError_t rt_err = rtGetDeviceCount(&dev_count); - if (rt_err != RT_ERROR_NONE) { - GELOGE(ge::INTERNAL_ERROR, "[Get][DeviceCount]Failed, error_code %d", rt_err); - REPORT_CALL_ERROR("E19999", "Get device count failed, error_code %d", rt_err); - return false; - } - - if (device_nums > static_cast(dev_count)) { - GELOGE(ge::PARAM_INVALID, "[Check][Param]Device num %u is not in range [1,%d]", - device_nums, dev_count); - REPORT_INNER_ERROR("E19999", "Device num %u check invalid, it is not in range [1,%d]", - device_nums, dev_count); - return false; - } - - std::set record; - for (size_t i = 0; i < device_nums; ++i) { - uint32_t dev_id = deviceid_list[i]; - if (dev_id >= static_cast(dev_count)) { - GELOGE(ge::PARAM_INVALID, "[Check][DeviceId]Device id %u is not in range [0,%d)", - dev_id, dev_count); - REPORT_CALL_ERROR("E19999", "Device id %u is not in range [0,%d)", dev_id, dev_count); - return false; - } - if (record.count(dev_id) > 0) { - GELOGE(ge::PARAM_INVALID, "[Check][DeviceId]Device id %u is duplicatedly set", dev_id); - REPORT_CALL_ERROR("E19999", "Device id %u is not unique, duplicatedly set", dev_id); - return false; - } - record.insert(dev_id); - } - return true; -} - -ge::Status RegProfCtrlCallback(MsprofCtrlCallback func) { - if (func == nullptr) { - GELOGE(ge::PARAM_INVALID, "[Check][Param]Msprof ctrl callback is nullptr"); - REPORT_INNER_ERROR("E19999", "Msprof ctrl callback is nullptr"); - return ge::PARAM_INVALID; - } - if (ge::ProfilingManager::Instance().GetMsprofCallback().msprofCtrlCallback != nullptr) { - GELOGW("Msprof ctrl callback is exist, just ignore it."); - } else { - ge::ProfilingManager::Instance().SetMsprofCtrlCallback(func); - } - return ge::SUCCESS; -} - -ge::Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func) { - if (func == nullptr) { - GELOGE(ge::PARAM_INVALID, "[Check][Param]MsprofSetDeviceCallback callback is nullptr"); - REPORT_INNER_ERROR("E19999", "MsprofSetDeviceCallback callback is nullptr"); - return ge::PARAM_INVALID; - } - // Pass MsprofSetDeviceCallback to runtime - ge::Status rt_ret = rtRegDeviceStateCallback(kRtSetDeviceRegName.c_str(), static_cast(func)); - if (rt_ret != ge::SUCCESS) { - GELOGE(rt_ret, "[Pass][MsprofSetDeviceCallback]To runtime failed, ret 0x%X", rt_ret); - REPORT_CALL_ERROR("E19999", "Pass MsprofSetDeviceCallback to runtime failed, ret 0x%X", rt_ret); - return rt_ret; - } - return ge::SUCCESS; -} - -ge::Status RegProfReporterCallback(MsprofReporterCallback func) { - if (func == nullptr) { - GELOGE(ge::PARAM_INVALID, "[Check][Param]MsprofReporterCallback callback is nullptr"); - REPORT_INNER_ERROR("E19999", "MsprofReporterCallback callback is nullptr"); - return ge::PARAM_INVALID; - } - if (ge::ProfilingManager::Instance().GetMsprofCallback().msprofReporterCallback != nullptr) { - GELOGW("Msprof reporter callback is exist, just ignore it."); - } else { - GELOGI("GE register Msprof reporter callback."); - ge::ProfilingManager::Instance().SetMsprofReporterCallback(func); - // Pass MsprofReporterCallback to runtime - ge::Status rt_ret = rtSetMsprofReporterCallback(func); - if (rt_ret != ge::SUCCESS) { - GELOGE(rt_ret, "[Pass][Param]Pass MsprofReporterCallback to runtime failed, error_code %u", - rt_ret); - REPORT_CALL_ERROR("E19999", "Pass MsprofReporterCallback to runtime failed, error_code %u", - rt_ret); - return rt_ret; - } - // Pass MsprofReporterCallback to hccl - } - return ge::SUCCESS; -} - -ge::Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len) { - if (type != kProfCommandhandleFinalize) { - GE_CHECK_NOTNULL(data); - } - ProfCommandHandleData *prof_config_param = reinterpret_cast(data); - auto iter = kProfCommandTypeMap.find(type); - if (iter == kProfCommandTypeMap.end()) { - GELOGW("The prof comand type is invalid."); - return ge::PARAM_INVALID; - } - std::vector prof_params; - if (type == kProfCommandhandleStart || type == kProfCommandhandleStop) { - if (!isProfConfigValid(prof_config_param->devIdList, prof_config_param->devNums)) { - return ge::FAILED; - } - - if (!TransProfConfigToParam(*prof_config_param, prof_params)) { - GELOGE(ge::PARAM_INVALID, "[Check][Param]Transfer profilerConfig to string vector failed"); - REPORT_CALL_ERROR("E19999", "Transfer profilerConfig to string vector failed"); - return ge::PARAM_INVALID; - } - } - auto &profiling_manager = ge::ProfilingManager::Instance(); - auto is_train = domi::GetContext().train_flag; - if (type == kProfCommandhandleModelSubscribe && is_train) { - profiling_manager.SetSubscribeInfo(prof_config_param->profSwitch, prof_config_param->modelId, true); - return ge::SUCCESS; - } - auto is_subscribe = profiling_manager.GetSubscribeInfo().is_subscribe; - // GraphId is actually stored in prof_config_param - auto graph_id = prof_config_param->modelId; - ge::Status ret = NeedUnsubscribe(type, is_subscribe, graph_id, prof_params); - if (ret != ge::SUCCESS) { - GELOGE(ret, "graph_id:%u not not found", graph_id); - REPORT_INPUT_ERROR("E10001", std::vector({"value", "parameter", "reason"}), - std::vector({std::to_string(graph_id), - "GraphToModelMap", - "graph_id does not exist!"})); - return ge::FAILED; - } - ge::GraphLoader graph_loader; - ge::Command command; - command.cmd_params.clear(); - command.cmd_type = iter->second; - command.cmd_params = prof_params; - if (type != kProfCommandhandleFinalize) { - command.module_index = prof_config_param->profSwitch; - } - GELOGI("GE commandhandle execute, Command Type: %s, data type config: 0x%lx", iter->second.c_str(), - command.module_index); - if (type == kProfCommandhandleStart || type == kProfCommandhandleStop) { - GELOGI("Profiling device nums:%s , deviceID:[%s]", prof_params[0].c_str(), prof_params[kDeviceListIndex].c_str()); - } - ret = graph_loader.CommandHandle(command); - if (ret != ge::SUCCESS) { - GELOGE(ret, "[Handle][Command]Handle profiling command failed, command type %s, error_code %u", - iter->second.c_str(), ret); - REPORT_CALL_ERROR("E19999", "Handle profiling command failed, command type %s, error_code %u", - iter->second.c_str(), ret); - return ge::FAILED; - } - - GELOGI("Successfully execute profiling command type: %d, command 0x%lx.", type, command.module_index); - return ge::SUCCESS; -} - ge::Status ProfSetStepInfo(uint64_t index_id, uint16_t tag_id, rtStream_t stream) { static bool is_first_run = true; int32_t device_id = 0; diff --git a/ge/common/profiling/ge_runner_profiling.cc b/ge/common/profiling/ge_runner_profiling.cc deleted file mode 100644 index f74ce384..00000000 --- a/ge/common/profiling/ge_runner_profiling.cc +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "framework/common/profiling/ge_runner_profiling.h" -#include "init/gelib.h" - -bool IsInitialize() { - std::shared_ptr instance_ptr = ge::GELib::GetInstance(); - if (instance_ptr == nullptr || instance_ptr->InitFlag() == false) { - return false; - } - return true; -} diff --git a/ge/common/profiling/profiling_init.cc b/ge/common/profiling/profiling_init.cc new file mode 100644 index 00000000..e3da199d --- /dev/null +++ b/ge/common/profiling/profiling_init.cc @@ -0,0 +1,246 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "profiling_init.h" + +#include "common/properties_manager.h" +#include "framework/common/debug/ge_log.h" +#include "framework/common/debug/log.h" +#include "common/profiling/profiling_properties.h" +#include "runtime/base.h" +#include "profiling_manager.h" + +namespace { +const char *const kTrainingTrace = "training_trace"; +const char *const kFpPoint = "fp_point"; +const char *const kBpPoint = "bp_point"; +} + +namespace ge { + +ProfilingInit &ProfilingInit::Instance() { + static ProfilingInit profiling_init; + return profiling_init; +} + +ge::Status ProfilingInit::Init(const Options &options) { + GELOGI("ProfilingInit::Init job_id:%s", options.job_id.c_str()); + + struct MsprofGeOptions prof_conf = {{0}}; + bool is_execute_profiling = false; + Status ret = InitFromOptions(options, prof_conf, is_execute_profiling); + if (ret != SUCCESS) { + GELOGE(ret, "[Init][Profiling]Failed, error_code %u", ret); + REPORT_CALL_ERROR("E19999", "Init profiling failed, error_code %u", ret); + return ret; + } + + if (is_execute_profiling) { + int32_t cb_ret = MsprofInit(static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS), + static_cast(&prof_conf), sizeof(MsprofGeOptions)); + if (cb_ret != 0) { + GELOGE(FAILED, "[Call][msprofCtrlCallback]Failed, type %u, return %d", + static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS), cb_ret); + REPORT_CALL_ERROR("E19999", "Call msprofCtrlCallback failed, type %u, return %d", + static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS), cb_ret); + return FAILED; + } + GELOGI("Profiling init success"); + } + else { + GELOGI("The profiling is off, skip the initialization"); + } + return SUCCESS; +} + +ge::Status ProfilingInit::ProfRegisterCtrlCallback() { + auto &prof_manager = ge::ProfilingManager::Instance(); + MsprofCtrlHandle callback = prof_manager.CommandHandle; + rtError_t rt_ret = rtProfRegisterCtrlCallback(GE,callback); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(FAILED, "register func failed"); + return FAILED; + } + return SUCCESS; +} + +ge::Status ProfilingInit::InitFromOptions(const Options &options, MsprofGeOptions &prof_conf, + bool &is_execute_profiling) { + // enable profiling by env + char env_profiling_mode[MMPA_MAX_PATH] = {0x00}; + + if (options.profiling_mode == "1" && !options.profiling_options.empty()) { + // enable profiling by ge option + if (strncpy_s(prof_conf.options, MSPROF_OPTIONS_DEF_LEN_MAX, options.profiling_options.c_str(), + MSPROF_OPTIONS_DEF_LEN_MAX - 1) != EOK) { + GELOGE(INTERNAL_ERROR, "[copy][ProfilingOptions]Failed, options %s", options.profiling_options.c_str()); + REPORT_CALL_ERROR("E19999", "Copy profiling_options %s failed", options.profiling_options.c_str()); + return INTERNAL_ERROR; + } + is_execute_profiling = true; + GELOGI("The profiling in options is %s, %s. origin option: %s", options.profiling_mode.c_str(), prof_conf.options, + options.profiling_options.c_str()); + } else { + (void)mmGetEnv("PROFILING_MODE", env_profiling_mode, MMPA_MAX_PATH); + (void)mmGetEnv("PROFILING_OPTIONS", prof_conf.options, MSPROF_OPTIONS_DEF_LEN_MAX); + // The env is invalid + if ((strcmp("true", env_profiling_mode) != 0) || (strcmp(prof_conf.options, "\0") == 0)) { + return SUCCESS; + } + // enable profiling by env + is_execute_profiling = true; + GELOGI("The profiling in env is %s, %s", env_profiling_mode, prof_conf.options); + } + + ProfilingProperties::Instance().SetExecuteProfiling(is_execute_profiling); + if (!is_execute_profiling) { + return SUCCESS; + } + + // Parse json str for bp fp + Status ret = ParseOptions(prof_conf.options); + if (ret != ge::SUCCESS) { + GELOGE(ge::PARAM_INVALID, "[Parse][Options]Parse training trace param %s failed, error_code %u", prof_conf.options, + ret); + REPORT_CALL_ERROR("E19999", "Parse training trace param %s failed, error_code %u", prof_conf.options, ret); + return ge::PARAM_INVALID; + } + + if (strncpy_s(prof_conf.jobId, MSPROF_OPTIONS_DEF_LEN_MAX, options.job_id.c_str(), MSPROF_OPTIONS_DEF_LEN_MAX - 1) != + EOK) { + GELOGE(INTERNAL_ERROR, "[Copy][JobId]Failed, original job_id %s", options.job_id.c_str()); + REPORT_CALL_ERROR("E19999", "Copy job_id %s failed", options.job_id.c_str()); + return INTERNAL_ERROR; + } + GELOGI("Job id: %s, original job id: %s.", prof_conf.jobId, options.job_id.c_str()); + return ge::SUCCESS; +} + +ge::Status ProfilingInit::ParseOptions(const std::string &options) { + if (options.empty()) { + GELOGE(ge::PARAM_INVALID, "[Check][Param]Profiling options is empty"); + REPORT_INNER_ERROR("E19999", "Profiling options is empty"); + return ge::PARAM_INVALID; + } + try { + Json prof_options = Json::parse(options); + if (options.find(kTrainingTrace) == std::string::npos) { + return ge::SUCCESS; + } + std::string training_trace; + if (prof_options.contains(kTrainingTrace)) { + training_trace = prof_options[kTrainingTrace]; + } + if (training_trace.empty()) { + GELOGI("Training trace will not take effect."); + return ge::SUCCESS; + } + GELOGI("GE profiling training trace:%s", training_trace.c_str()); + if (training_trace != "on") { + GELOGE(ge::PARAM_INVALID, "[Check][Param]Training trace param:%s is invalid.", training_trace.c_str()); + REPORT_INNER_ERROR("E19999", "Training trace param:%s is invalid.", training_trace.c_str()); + return ge::PARAM_INVALID; + } + string fp_point; + string bp_point; + if (prof_options.contains(kFpPoint)) { + fp_point = prof_options[kFpPoint]; + } + if (prof_options.contains(kBpPoint)) { + bp_point = prof_options[kBpPoint]; + } + if (!fp_point.empty() && !bp_point.empty()) { + GELOGI("Training trace bp fp is set, bp_point:%s, fp_point:%s.", bp_point.c_str(), fp_point.c_str()); + } + ProfilingProperties::Instance().SetTrainingTrace(true); + ProfilingProperties::Instance().SetFpBpPoint(fp_point,bp_point); + } catch (...) { + GELOGE(FAILED, "[Check][Param]Json prof_conf options is invalid"); + REPORT_INNER_ERROR("E19999", "Json prof_conf options is invalid"); + return ge::PARAM_INVALID; + } + return ge::SUCCESS; +} + +void ProfilingInit::StopProfiling() { + uint64_t module = GetProfilingModule(); + // The following if case will not be executed in normal case, inc case of ProfStopProfiling is abnormal + auto device_id = ProfilingProperties::Instance().GetDeviceID(); + int32_t device_num = static_cast(device_id.size()); + if (device_num != 0) { + auto device_id_ptr = std::unique_ptr(new (std::nothrow) uint32_t[device_num]); + if (device_id_ptr == nullptr) { + GELOGE(FAILED, "[Stop][Profiling]Device id ptr is null."); + REPORT_INNER_ERROR("E19999", "Stop profiling, device id ptr is null"); + return; + } + for (int32_t i = 0; i < device_num; i++) { + device_id_ptr[i] = static_cast(device_id[i]); + } + rtError_t rt_ret = rtProfilerStop(module, device_num, device_id_ptr.get()); + if (rt_ret != RT_ERROR_NONE) { + GELOGW("Call rtProfilerStop failed, ret:%d", rt_ret); + } + } + + // stop profiling + int32_t cb_ret = MsprofFinalize(); + if (cb_ret != 0) { + GELOGW("call msprofCtrlCallback failed, type:%u, return:%d", + static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_FINALIZE), cb_ret); + return; + } + GELOGI("Stop Profiling success."); +} + +void ProfilingInit::ShutDownProfiling() { + StopProfiling(); + ProfilingManager::Instance().PluginUnInit(); +} + +uint64_t ProfilingInit::GetProfilingModule() { + uint64_t module = PROF_MODEL_EXECUTE_MASK | + PROF_RUNTIME_API_MASK | + PROF_RUNTIME_TRACE_MASK | + PROF_SCHEDULE_TIMELINE_MASK | + PROF_SCHEDULE_TRACE_MASK | + PROF_TASK_TIME_MASK | + PROF_SUBTASK_TIME_MASK | + PROF_AICPU_TRACE_MASK | + PROF_AICORE_METRICS_MASK | + PROF_AIVECTORCORE_METRICS_MASK | + PROF_MODEL_LOAD_MASK; + return module; +} + +Status ProfilingInit::SetDeviceIdByModelId(uint32_t model_id, uint32_t &device_id) { + auto rt_ret = rtSetDeviceIdByModelIdx(model_id, device_id); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(ge::FAILED, "[Set][Device]Set Device id failed"); + return ge::FAILED; + } + return ge::SUCCESS; +} + +Status ProfilingInit::UnsetDeviceIdByModelId(uint32_t model_id, uint32_t &device_id) { + auto rt_ret = rtSetDeviceIdByModelIdx(model_id, device_id); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(ge::FAILED, "[Unset][Device]Set Device id failed"); + return ge::FAILED; + } + return ge::SUCCESS; +} +} // namespace ge \ No newline at end of file diff --git a/ge/common/profiling/profiling_init.h b/ge/common/profiling/profiling_init.h new file mode 100644 index 00000000..ed537587 --- /dev/null +++ b/ge/common/profiling/profiling_init.h @@ -0,0 +1,54 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_COMMON_PROFILING_PROFILING_INIT_H_ +#define GE_COMMON_PROFILING_PROFILING_INIT_H_ + +#include +#include +#include + +#include "common/profiling/profiling_properties.h" +#include "framework/common/ge_inner_error_codes.h" +#include "framework/common/ge_types.h" +#include "toolchain/prof_callback.h" + +using std::map; +using std::string; +using std::vector; +using Json = nlohmann::json; + +namespace ge { +class ProfilingInit { + public: + static ProfilingInit &Instance(); + Status Init(const Options &options); + void StopProfiling(); + Status ProfRegisterCtrlCallback(); + void ShutDownProfiling(); + Status SetDeviceIdByModelId(uint32_t model_id, uint32_t &device_id); + Status UnsetDeviceIdByModelId(uint32_t model_id, uint32_t &device_id); + + private: + ProfilingInit() = default; + ~ProfilingInit() = default; + Status InitFromOptions(const Options &options, MsprofGeOptions &prof_conf, bool &is_execute_profiling); + Status ParseOptions(const std::string &options); + uint64_t GetProfilingModule(); +}; +} // namespace ge + +#endif // GE_COMMON_PROFILING_PROFILING_INIT_H_ diff --git a/ge/common/profiling/profiling_manager.cc b/ge/common/profiling/profiling_manager.cc index e8f41cc4..3a0bfc76 100644 --- a/ge/common/profiling/profiling_manager.cc +++ b/ge/common/profiling/profiling_manager.cc @@ -25,6 +25,7 @@ #include "runtime/base.h" #include "graph/load/model_manager/davinci_model.h" #include "mmpa/mmpa_api.h" +#include "graph/load/graph_loader.h" namespace { const char *const kTrainingTrace = "training_trace"; @@ -62,17 +63,39 @@ const std::string kShape = "shape"; const std::string kIdx = "idx"; #endif +const uint32_t kDeviceListIndex = 3; +const uint32_t kCommandNum = 6; +const int kMaxDevNum = 64; +const std::string kDeviceNums = "devNums"; +const std::string kDeviceIdList = "devIdList"; +const std::string kProfilingInit = "prof_init"; +const std::string kProfilingFinalize = "prof_finalize"; +const std::string kProfilingStart = "prof_start"; +const std::string kProfilingStop = "prof_stop"; +const std::string kProfilingModelSubscribe = "prof_model_subscribe"; +const std::string kProfilingModelUnsubscribe = "prof_model_cancel_subscribe"; +const std::string kPofilingModelId = "modelId"; + +const std::map kProfCommandTypeMap = { + {kProfCommandhandleInit, kProfilingInit}, + {kProfCommandhandleStart, kProfilingStart}, + {kProfCommandhandleStop, kProfilingStop}, + {kProfCommandhandleFinalize, kProfilingFinalize}, + {kProfCommandhandleModelSubscribe, kProfilingModelSubscribe}, + {kProfCommandhandleModelUnsubscribe, kProfilingModelUnsubscribe}}; } // namespace namespace ge { + +ProfSubscribeInfo ProfilingManager::subscribe_info_ = {false, 0, 0}; +MsprofReporterCallback ProfilingManager::reporter_callback_ = nullptr; + ProfilingManager::ProfilingManager() : is_load_profiling_(false), is_execute_profiling_(false), is_training_trace_(false), subscribe_count_(0), - prof_cb_({nullptr, nullptr}), - index_id_(UINT64_MAX), - subscribe_info_({false, 0, 0}) { + index_id_(UINT64_MAX) { } ProfilingManager::~ProfilingManager() {} @@ -82,45 +105,6 @@ ProfilingManager &ProfilingManager::Instance() { return profiling_manager; } -ge::Status ProfilingManager::Init(const Options &options) { -#ifdef DAVINCI_SUPPORT_PROFILING - vector().swap(device_id_); - subscribe_count_ = 0; - GELOGI("ProfilingManager::Init job_id:%s", options.job_id.c_str()); - - struct MsprofGeOptions prof_conf = {{ 0 }}; - Status ret = InitFromOptions(options, prof_conf); - if (ret != SUCCESS) { - GELOGE(ret, "[Init][Profiling]Failed, error_code %u", ret); - REPORT_CALL_ERROR("E19999", "Init profiling failed, error_code %u", ret); - return ret; - } - - if (is_execute_profiling_) { - if (prof_cb_.msprofCtrlCallback == nullptr) { - GELOGE(ge::PARAM_INVALID, "[Check][Param]MsprofCtrlCallback callback is nullptr"); - REPORT_INNER_ERROR("E19999", "MsprofCtrlCallback callback is nullptr"); - return ge::PARAM_INVALID; - } - int32_t cb_ret = prof_cb_.msprofCtrlCallback( - static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS), - static_cast(&prof_conf), sizeof(MsprofGeOptions)); - if (cb_ret != 0) { - GELOGE(FAILED, "[Call][msprofCtrlCallback]Failed, type %u, return %d", - static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS), cb_ret); - REPORT_CALL_ERROR("E19999", "Call msprofCtrlCallback failed, type %u, return %d", - static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS), - cb_ret); - return FAILED; - } - GELOGI("Profiling init success"); - } else { - GELOGI("The profiling is off, skip the initialization"); - } -#endif - return SUCCESS; -} - ge::Status ProfilingManager::InitFromOptions(const Options &options, MsprofGeOptions &prof_conf) { #ifdef DAVINCI_SUPPORT_PROFILING // enable profiling by env @@ -221,44 +205,6 @@ ge::Status ProfilingManager::ParseOptions(const std::string &options) { return ge::SUCCESS; } -void ProfilingManager::StopProfiling() { -#ifdef DAVINCI_SUPPORT_PROFILING - uint64_t module = GetProfilingModule(); - // The following if case will not be executed in normal case, inc case of ProfStopProfiling is abnormal - int32_t device_num = static_cast(device_id_.size()); - if (device_num != 0) { - auto device_id_ptr = std::unique_ptr(new (std::nothrow) uint32_t[device_num]); - if (device_id_ptr == nullptr) { - GELOGE(FAILED, "[Stop][Profiling]Device id ptr is null."); - REPORT_INNER_ERROR("E19999", "Stop profiling, device id ptr is null"); - return; - } - for (int32_t i = 0; i < device_num; i++) { - device_id_ptr[i] = static_cast(device_id_[i]); - } - rtError_t rt_ret = rtProfilerStop(module, device_num, device_id_ptr.get()); - if (rt_ret != RT_ERROR_NONE) { - GELOGW("Call rtProfilerStop failed, ret:%d", rt_ret); - } - } - - // stop profiling - if (prof_cb_.msprofCtrlCallback == nullptr) { - GELOGE(ge::PARAM_INVALID, "[Check][Param]MsprofCtrlCallback callback is nullptr"); - REPORT_INNER_ERROR("E19999", "MsprofCtrlCallback callback is nullptr"); - return; - } - int32_t cb_ret = prof_cb_.msprofCtrlCallback(static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_FINALIZE), - nullptr, 0); - if (cb_ret != 0) { - GELOGW("call msprofCtrlCallback failed, type:%u, return:%d", - static_cast(MsprofCtrlCallbackType::MSPROF_CTRL_FINALIZE), cb_ret); - return; - } - GELOGI("Stop Profiling success."); -#endif -} - void ProfilingManager::ProfilingOpInputOutInfo(const TaskDescInfo &task, Json &task_json) { #ifdef DAVINCI_SUPPORT_PROFILING for (size_t i = 0; i < task.input_format.size(); i++) { @@ -896,13 +842,13 @@ bool ProfilingManager::ProfilingModelExecuteOn() const { return execute_model_prof_on; } -Status ProfilingManager::PluginInit() { - if (prof_cb_.msprofReporterCallback == nullptr) { +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::PluginInit() { + if (reporter_callback_ == nullptr) { GELOGE(ge::PARAM_INVALID, "[Check][Param]MsprofReporterCallback callback is nullptr"); REPORT_INNER_ERROR("E19999", "MsprofReporterCallback callback is nullptr"); return ge::PARAM_INVALID; } - int32_t cb_ret = prof_cb_.msprofReporterCallback( + int32_t cb_ret = reporter_callback_( static_cast(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK), static_cast(MsprofReporterCallbackType::MSPROF_REPORTER_INIT), nullptr, 0); @@ -912,7 +858,7 @@ Status ProfilingManager::PluginInit() { return INTERNAL_ERROR; } - cb_ret = prof_cb_.msprofReporterCallback( + cb_ret = reporter_callback_( static_cast(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK), static_cast(MsprofReporterCallbackType::MSPROF_REPORTER_DATA_MAX_LEN), &reporter_max_len_, sizeof(uint32_t)); @@ -927,12 +873,12 @@ Status ProfilingManager::PluginInit() { void ProfilingManager::PluginUnInit() const { #ifdef DAVINCI_SUPPORT_PROFILING - if (prof_cb_.msprofReporterCallback == nullptr) { + if (reporter_callback_ == nullptr) { GELOGE(ge::PARAM_INVALID, "[Check][Param]MsprofReporterCallback callback is nullptr"); REPORT_INNER_ERROR("E19999", "MsprofReporterCallback callback is nullptr"); return; } - int32_t cb_ret = prof_cb_.msprofReporterCallback( + int32_t cb_ret = reporter_callback_( static_cast(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK), static_cast(MsprofReporterCallbackType::MSPROF_REPORTER_UNINIT), nullptr, 0); @@ -942,13 +888,14 @@ void ProfilingManager::PluginUnInit() const { #endif } -Status ProfilingManager::CallMsprofReport(ReporterData &reporter_data) const { - if (prof_cb_.msprofReporterCallback == nullptr) { +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::CallMsprofReport( + ReporterData &reporter_data) const { + if (reporter_callback_ == nullptr) { GELOGE(ge::PARAM_INVALID, "[Check][Param]MsprofReporterCallback callback is nullptr"); REPORT_INNER_ERROR("E19999", "MsprofReporterCallback callback is nullptr"); return ge::PARAM_INVALID; } - return prof_cb_.msprofReporterCallback( + return reporter_callback_( static_cast(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK), static_cast(MsprofReporterCallbackType::MSPROF_REPORTER_REPORT), static_cast(&reporter_data), sizeof(ReporterData)); @@ -1089,4 +1036,204 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::GetMod GELOGE(PARAM_INVALID, "[Check][GraphId]graph_id:%u does not exist!", graph_id); return FAILED; } + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::SetDeviceByModelId( + uint32_t model_id, uint32_t &device_id) { + auto ret = rtSetDeviceIdByModelIdx(model_id, device_id); + if (ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "set device id by model_id:%u failed!", model_id); + GELOGE(FAILED, "[Check][ModelId]set device id by model_id:%u failed!", model_id); + return FAILED; + } + return SUCCESS; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::UnsetDeviceByModelId( + uint32_t model_id, uint32_t &device_id) { + auto ret = rtUnsetDeviceIdByModelIdx(model_id, device_id); + if (ret != RT_ERROR_NONE) { + REPORT_CALL_ERROR("E19999", "unset device id by model_id:%u failed!", model_id); + GELOGE(FAILED, "[Check][ModelId]unset device id by model_id:%u failed!", model_id); + return FAILED; + } + return SUCCESS; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ProfilingManager::isProfTypeValid(uint32_t type) { + if (type < 0 || type >= kCommandNum) { + GELOGE(ge::PARAM_INVALID, "[Check][Type]Type %u is invalid", type); + return false; + } + GELOGD("Type is %u", type); + return true; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ProfilingManager::isProfConfigValid( + const uint32_t *deviceid_list, uint32_t device_nums) { + if (deviceid_list == nullptr) { + GELOGE(ge::PARAM_INVALID, "[Check][DeviceIDList]Invalid, it is nullptr"); + REPORT_INNER_ERROR("E19999", "Device id list is nullptr"); + return false; + } + if (device_nums == 0 || device_nums > kMaxDevNum) { + GELOGE(ge::PARAM_INVALID, "[Check][DeviceNums]Invalid, device nums: %u", device_nums); + REPORT_INNER_ERROR("E19999", "DeviceNums %u check invalid", device_nums); + return false; + } + + // real device num + int32_t dev_count = 0; + rtError_t rt_err = rtGetDeviceCount(&dev_count); + if (rt_err != RT_ERROR_NONE) { + GELOGE(ge::INTERNAL_ERROR, "[Get][DeviceCount]Failed, error_code %d", rt_err); + REPORT_CALL_ERROR("E19999", "Get device count failed, error_code %d", rt_err); + return false; + } + + if (device_nums > static_cast(dev_count)) { + GELOGE(ge::PARAM_INVALID, "[Check][Param]Device num %u is not in range [1,%d]", + device_nums, dev_count); + REPORT_INNER_ERROR("E19999", "Device num %u check invalid, it is not in range [1,%d]", + device_nums, dev_count); + return false; + } + + std::set record; + for (size_t i = 0; i < device_nums; ++i) { + uint32_t dev_id = deviceid_list[i]; + if (dev_id >= static_cast(dev_count)) { + GELOGE(ge::PARAM_INVALID, "[Check][DeviceId]Device id %u is not in range [0,%d)", + dev_id, dev_count); + REPORT_CALL_ERROR("E19999", "Device id %u is not in range [0,%d)", dev_id, dev_count); + return false; + } + if (record.count(dev_id) > 0) { + GELOGE(ge::PARAM_INVALID, "[Check][DeviceId]Device id %u is duplicatedly set", dev_id); + REPORT_CALL_ERROR("E19999", "Device id %u is not unique, duplicatedly set", dev_id); + return false; + } + record.insert(dev_id); + } + return true; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ProfilingManager::TransProfConfigToParam( + const rtProfCommandHandle &profCommand, vector &prof_config_params) { + prof_config_params.clear(); + prof_config_params.emplace_back(kDeviceNums); + prof_config_params.emplace_back(std::to_string(profCommand.devNums)); + prof_config_params.emplace_back(kDeviceIdList); + std::string devID = ""; + if (profCommand.devNums == 0) { + GELOGW("The device num is invalid."); + return false; + } + for (uint32_t i = 0; i < profCommand.devNums; i++) { + devID.append(std::to_string(profCommand.devList[i])); + if (i != profCommand.devNums - 1) { + devID.append(","); + } + } + + prof_config_params.push_back(devID); + return true; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status ProfilingManager::NeedUnsubscribe( + ProfCommandHandleType type, bool is_subscribe, uint32_t graph_id, vector &prof_params) { + if (type == kProfCommandhandleModelUnsubscribe && is_subscribe) { + prof_params.clear(); + prof_params.emplace_back(kPofilingModelId); + uint32_t model_id = 0; + auto ret = GetModelIdFromGraph(graph_id, model_id); + if (ret != ge::SUCCESS) { + GELOGE(ret, "graph_id:%u not not found", graph_id); + return ret; + } + prof_params.emplace_back(std::to_string(model_id)); + } + return ge::SUCCESS; +} + +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY rtError_t ProfilingManager::CommandHandle( + rtProfCtrlType_t rt_type, void *data, uint32_t len) { + if (data == nullptr) { + GELOGE(ge::PARAM_INVALID, "The prof comand is invalid."); + return ge::FAILED; + } + if (rt_type == RT_PROF_CTRL_REPORTER) { + reporter_callback_ = reinterpret_cast(data); + GELOGD("return with MsprofReporterCallback"); + return ge::SUCCESS; + } + else if (rt_type == RT_PROF_CTRL_SWITCH) { + rtProfCommandHandle_t *prof_config_param = reinterpret_cast(data); + if (!isProfTypeValid(prof_config_param->type)) { + GELOGE(ge::PARAM_INVALID, "The prof comand is invalid."); + return ge::FAILED; + } + auto type = static_cast(prof_config_param->type); + if (type != kProfCommandhandleFinalize) { + GE_CHECK_NOTNULL(data); + } + auto iter = kProfCommandTypeMap.find(type); + if (iter == kProfCommandTypeMap.end()) { + GELOGW("The prof comand type is invalid."); + return ge::PARAM_INVALID; + } + std::vector prof_params; + if (type == kProfCommandhandleStart || type == kProfCommandhandleStop) { + if (!isProfConfigValid(prof_config_param->devList, prof_config_param->devNums)) { + return ge::FAILED; + } + if (!TransProfConfigToParam(*prof_config_param, prof_params)) { + GELOGE(ge::PARAM_INVALID, "[Check][Param]Transfer profilerConfig to string vector failed"); + REPORT_CALL_ERROR("E19999", "Transfer profilerConfig to string vector failed"); + return ge::PARAM_INVALID; + } + } + auto is_train = domi::GetContext().train_flag; + if (type == kProfCommandhandleModelSubscribe && is_train) { + SetSubscribeInfo(prof_config_param->profSwitch, prof_config_param->modelId, true); + return ge::SUCCESS; + } + auto is_subscribe = subscribe_info_.is_subscribe; + // GraphId is actually stored in prof_config_param + auto graph_id = prof_config_param->modelId; + ge::Status ret = NeedUnsubscribe(type, is_subscribe, graph_id, prof_params); + if (ret != ge::SUCCESS) { + GELOGE(ret, "graph_id:%u not not found", graph_id); + REPORT_INPUT_ERROR("E10001", std::vector({"value", "parameter", "reason"}), + std::vector({std::to_string(graph_id), + "GraphToModelMap", + "graph_id does not exist!"})); + return ge::FAILED; + } + GraphLoader graph_loader; + Command command; + command.cmd_params.clear(); + command.cmd_type = iter->second; + command.cmd_params = prof_params; + if (type != kProfCommandhandleFinalize) { + command.module_index = prof_config_param->profSwitch; + } + GELOGI("GE commandhandle execute, Command Type: %s, data type config: 0x%lx", iter->second.c_str(), + command.module_index); + if (type == kProfCommandhandleStart || type == kProfCommandhandleStop) { + GELOGI("Profiling device nums:%s , deviceID:[%s]", prof_params[0].c_str(), prof_params[kDeviceListIndex].c_str()); + } + ret = graph_loader.CommandHandle(command); + if (ret != ge::SUCCESS) { + GELOGE(ret, "[Handle][Command]Handle profiling command failed, command type %s, error_code %u", + iter->second.c_str(), ret); + REPORT_CALL_ERROR("E19999", "Handle profiling command failed, command type %s, error_code %u", + iter->second.c_str(), ret); + return ge::FAILED; + } + + GELOGI("Successfully execute profiling command type: %d, command 0x%lx.", type, command.module_index); + return ge::SUCCESS; + } + return ge::FAILED; +} } // namespace ge diff --git a/ge/common/profiling/profiling_manager.h b/ge/common/profiling/profiling_manager.h index 86371d51..f435d509 100755 --- a/ge/common/profiling/profiling_manager.h +++ b/ge/common/profiling/profiling_manager.h @@ -52,7 +52,14 @@ namespace { const uint64_t PROF_HCCL_TRACE_MASK = 0x1000; const uint64_t PROF_DATA_PROCESS_MASK = 0x2000; const uint64_t PROF_MODEL_LOAD_MASK = 0x8000000000000000; - + enum ProfCommandHandleType { + kProfCommandhandleInit = 0, + kProfCommandhandleStart, + kProfCommandhandleStop, + kProfCommandhandleFinalize, + kProfCommandhandleModelSubscribe, + kProfCommandhandleModelUnsubscribe + }; } // namespace namespace ge { class OpDesc; @@ -68,24 +75,17 @@ struct ProfSubscribeInfo { uint32_t graph_id; }; -struct MsprofCallback { - MsprofCtrlCallback msprofCtrlCallback; - MsprofReporterCallback msprofReporterCallback; -}; - class ProfilingManager { public: ProfilingManager(); virtual ~ProfilingManager(); static ProfilingManager &Instance(); - Status Init(const Options &options); Status ProfInit(uint64_t module); Status ProfFinalize(); Status ProfStartProfiling(uint64_t module, const std::map &config_para); Status ProfStopProfiling(uint64_t module, const std::map &config_para); Status ProfModelSubscribe(uint64_t module, void *model); Status ProfModelUnsubscribe(void *model); - void StopProfiling(); bool ProfilingTrainingTraceOn() const { return is_training_trace_; } // report model load profiling data flag, data contain task desc info, step info, model load fusion op info bool ProfilingModelLoadOn() const { return is_load_profiling_; } @@ -100,9 +100,8 @@ class ProfilingManager { Status PluginInit(); void PluginUnInit() const; Status CallMsprofReport(ReporterData &reporter_data) const; - struct MsprofCallback &GetMsprofCallback() { return prof_cb_; } - void SetMsprofCtrlCallback(MsprofCtrlCallback func) { prof_cb_.msprofCtrlCallback = func; } - void SetMsprofReporterCallback(MsprofReporterCallback func) { prof_cb_.msprofReporterCallback = func; } + const MsprofReporterCallback GetMsprofReporterCallback() const { return reporter_callback_; } + void SetMsprofReporterCallback(MsprofReporterCallback func) { reporter_callback_ = func; } void GetFpBpPoint(std::string &fp_point, std::string &bp_point); void GetOpInputOutputInfo(const OpDescPtr &op, TaskDescInfo &task_desc_info) const; void ReportData(const int32_t &device_id, const std::string &data, const std::string &tag_name); @@ -111,11 +110,14 @@ class ProfilingManager { uint64_t GetStepInfoIndex() const { return index_id_; } void SetGraphIdToDeviceMap(uint32_t graph_id, uint32_t device_id) { device_id_map_[graph_id] = device_id; } Status GetDeviceIdFromGraph(uint32_t graph_id, uint32_t &device_id); - void SetSubscribeInfo(uint64_t prof_switch, uint32_t model_id, bool is_subscribe); - const ProfSubscribeInfo &GetSubscribeInfo() const { return subscribe_info_; } + static void SetSubscribeInfo(uint64_t prof_switch, uint32_t model_id, bool is_subscribe); + ProfSubscribeInfo GetSubscribeInfo() { return subscribe_info_; } void CleanSubscribeInfo(); void SetGraphIdToModelMap(uint32_t graph_id, uint32_t model_id) { model_id_map_[graph_id] = model_id; } Status GetModelIdFromGraph(uint32_t graph_id, uint32_t &model_id); + Status SetDeviceByModelId(uint32_t model_id, uint32_t &device_id); + Status UnsetDeviceByModelId(uint32_t model_id, uint32_t &device_id); + static rtError_t CommandHandle(rtProfCtrlType_t rt_type, void *data, uint32_t len); private: Status InitFromOptions(const Options &options, MsprofGeOptions &prof_conf); @@ -129,6 +131,10 @@ class ProfilingManager { void UpdateSubscribeDeviceModuleMap(std::string prof_type, uint32_t device_id, uint64_t module); void GetOpInputInfo(const OpDescPtr &op, TaskDescInfo &task_desc_info) const; void GetOpOutputInfo(const OpDescPtr &op, TaskDescInfo &task_desc_info) const; + static bool isProfTypeValid(uint32_t type); + static bool isProfConfigValid(const uint32_t *deviceid_list, uint32_t device_nums); + static bool TransProfConfigToParam(const rtProfCommandHandle &profCommand, vector &prof_config_params); + static Status NeedUnsubscribe(ProfCommandHandleType type, bool is_subscribe, uint32_t graph_id, vector &prof_params); bool is_load_profiling_; bool is_execute_profiling_; @@ -139,14 +145,14 @@ class ProfilingManager { uint32_t subscribe_count_; std::mutex mutex_; std::mutex mutex_report_; - MsprofCallback prof_cb_; std::string fp_point_; std::string bp_point_; uint32_t reporter_max_len_ = 0; uint64_t index_id_; std::map device_id_map_; // key: graph_id, value: device_id std::map model_id_map_; // key: graph_id, value: model_id - ProfSubscribeInfo subscribe_info_; + static ProfSubscribeInfo subscribe_info_; + static MsprofReporterCallback reporter_callback_; }; } // namespace ge #endif // GE_COMMON_PROFILING_PROFILING_MANAGER_H_ diff --git a/ge/common/profiling/profiling_properties.cc b/ge/common/profiling/profiling_properties.cc new file mode 100644 index 00000000..b7955d03 --- /dev/null +++ b/ge/common/profiling/profiling_properties.cc @@ -0,0 +1,110 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "profiling_properties.h" +#include "framework/common/debug/ge_log.h" +#include "framework/common/debug/log.h" +#include "graph/ge_context.h" + +namespace { +const uint64_t kMsProfOptionsMaxlen = 2048; +const char *const kFpPoint = "fp_point"; +const char *const kBpPoint = "bp_point"; +} // namespace ge + +namespace ge{ +ProfilingProperties& ProfilingProperties::Instance() { + static ProfilingProperties profiling_properties; + return profiling_properties; +} + +void ProfilingProperties::SetLoadProfiling(bool is_load_profiling) { + std::lock_guardlock(mutex_); + is_load_profiling_ = is_load_profiling; +} + +bool ProfilingProperties::IsLoadProfiling() { + std::lock_guardlock(mutex_); + return is_load_profiling_; +} + +void ProfilingProperties::SetExecuteProfiling(bool is_exec_profiling) { + std::lock_guardlock(mutex_); + is_execute_profiling_ = is_exec_profiling; +} + +bool ProfilingProperties::IsExecuteProfiling() { + std::lock_guardlock(mutex_); + return is_execute_profiling_; +} + +void ProfilingProperties::SetTrainingTrace(bool is_train_trance) { + std::lock_guardlock(mutex_); + is_training_trace_ = is_train_trance; +} + +void ProfilingProperties::GetFpBpPoint(std::string &fp_point, std::string &bp_point) { + // Env or options mode, fp_point_/bp_point_ have initiliazed on profiling init + std::lock_guardlock(mutex_); + if (!fp_point_.empty() && !bp_point_.empty()) { + fp_point = fp_point_; + bp_point = bp_point_; + GELOGI("Bp Fp have been initialized in env or options. bp_point: %s, fp_point: %s", bp_point.c_str(), + fp_point.c_str()); + return; + } + // ProfApi mode and training trace is set + // Parse options first + char env_profiling_options[kMsProfOptionsMaxlen] = {0x00}; + bool is_profiling_valid = false; + std::string profiling_options; + if (ge::GetContext().GetOption(OPTION_EXEC_PROFILING_OPTIONS, profiling_options) == SUCCESS && + !profiling_options.empty()) { + is_profiling_valid = true; + } else { + INT32 ret = mmGetEnv("PROFILING_OPTIONS", env_profiling_options, kMsProfOptionsMaxlen); + if (ret != EN_OK) { + GELOGI("PROFILING_OPTIONS env is not exist."); + return; + } + GELOGI("Parse env PROFILING_OPTIONS:%s.", env_profiling_options); + profiling_options = env_profiling_options; + is_profiling_valid = true; + } + if (is_profiling_valid) { + try { + Json prof_options = Json::parse(profiling_options); + if (prof_options.contains(kFpPoint)) { + fp_point_ = prof_options[kFpPoint]; + } + if (prof_options.contains(kBpPoint)) { + bp_point_ = prof_options[kBpPoint]; + } + fp_point = fp_point_; + bp_point = bp_point_; + if (!fp_point_.empty() && !bp_point_.empty()) { + GELOGI("Training trace bp fp is set, bp_point:%s, fp_point:%s.", bp_point_.c_str(), fp_point_.c_str()); + } + } catch (...) { + GELOGW("Json prof options is invalid."); + return; + } + } + + return; +} + +} // namespace ge \ No newline at end of file diff --git a/ge/common/profiling/profiling_properties.h b/ge/common/profiling/profiling_properties.h new file mode 100644 index 00000000..102555ac --- /dev/null +++ b/ge/common/profiling/profiling_properties.h @@ -0,0 +1,62 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GE_COMMON_PROFILING_PROPERTIES_H_ +#define GE_COMMON_PROFILING_PROPERTIES_H_ + +#include +#include +#include +#include + +#include "framework/common/ge_types.h" + +using Json = nlohmann::json; + +namespace ge { +class ProfilingProperties { + public: + static ProfilingProperties &Instance(); + void SetLoadProfiling(bool is_load_profiling); + bool IsLoadProfiling(); + void SetExecuteProfiling(bool is_execute_profiling); + bool IsExecuteProfiling(); + void SetTrainingTrace(bool is_train_trance); + bool ProfilingTrainingTraceOn() const { return is_training_trace_; } + bool IsTrainTrace(); + void SetFpBpPoint(const std::string &fp_point, const std::string &bp_point); + void SetDeviceId(const std::vector &device_id); + std::vector GetDeviceID(); + bool ProfilingOn() const { return is_load_profiling_ && is_execute_profiling_; } + void GetFpBpPoint(std::string &fp_point, std::string &bp_point); + + private: + ProfilingProperties() =default; + ~ProfilingProperties() = default; + std::mutex mutex_; + std::mutex point_mutex_; + bool is_load_profiling_ = false; + bool is_execute_profiling_ = false; + bool is_training_trace_ = false; + std::string fp_point_; + std::string bp_point_; + std::vector device_id_; + + +}; +} // namespace ge + +#endif // GE_COMMON_PROFILING_PROPERTIES_H_ diff --git a/ge/executor/ge_executor.cc b/ge/executor/ge_executor.cc index 76cde2b9..964b95c3 100755 --- a/ge/executor/ge_executor.cc +++ b/ge/executor/ge_executor.cc @@ -277,7 +277,6 @@ Status GeExecutor::Initialize() { profiling_options.device_id = 0; // job id need to be set, the value is meaningless; profiling_options.job_id = "1"; - ProfilingManager::Instance().Init(profiling_options); isInit_ = true; GELOGI("Init GeExecutor over."); @@ -295,7 +294,6 @@ Status GeExecutor::Finalize() { // Stop profiling if (ProfilingManager::Instance().ProfilingOn()) { - ProfilingManager::Instance().StopProfiling(); ProfilingManager::Instance().PluginUnInit(); } diff --git a/ge/graph/build/task_generator.cc b/ge/graph/build/task_generator.cc index abb409c4..9d49eb49 100755 --- a/ge/graph/build/task_generator.cc +++ b/ge/graph/build/task_generator.cc @@ -17,7 +17,7 @@ #include "graph/build/task_generator.h" #include #include -#include "common/profiling/profiling_manager.h" +#include "common/profiling/profiling_properties.h" #include "framework/common/types.h" #include "framework/common/util.h" #include "framework/common/debug/ge_log.h" @@ -945,7 +945,7 @@ Status TaskGenerator::GetFpBpIndex(const ComputeGraphPtr &graph, ProfilingPoint vector &all_reduce_nodes, std::string &fp_point_str, std::string &bp_point_str) const { - ProfilingManager::Instance().GetFpBpPoint(fp_point_str, bp_point_str); + ProfilingProperties::Instance().GetFpBpPoint(fp_point_str, bp_point_str); Status ret = SUCCESS; if (fp_point_str.empty()) { @@ -976,8 +976,8 @@ Status TaskGenerator::FindProfilingTaskIndex(const ComputeGraphPtr &graph, Profi vector &all_reduce_nodes) const { GE_CHECK_NOTNULL(graph); const char *profiling_mode = std::getenv(kProfilingMode); - bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() || - ProfilingManager::Instance().ProfilingTrainingTraceOn(); + bool is_profiling = (profiling_mode != nullptr) || ProfilingProperties::Instance().ProfilingOn() || + ProfilingProperties::Instance().ProfilingTrainingTraceOn(); if (!is_profiling) { GELOGD("Profiling is not open."); return SUCCESS; @@ -1071,8 +1071,8 @@ Status TaskGenerator::InsertProfilingTaskBefore(const OpDescPtr &op_desc, const vector &all_reduce_nodes, uint32_t node_index, vector &task_def_list) { const char *profiling_mode = std::getenv(kProfilingMode); - bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() || - ProfilingManager::Instance().ProfilingTrainingTraceOn(); + bool is_profiling = (profiling_mode != nullptr) || ProfilingProperties::Instance().ProfilingOn() || + ProfilingProperties::Instance().ProfilingTrainingTraceOn(); bool is_insert_fp_profiling_task = false; (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_FP_PROFILILNG_TASK, is_insert_fp_profiling_task); bool is_insert_bp_profiling_task = false; @@ -1167,8 +1167,8 @@ Status TaskGenerator::InsertProfilingTaskAfter(const OpDescPtr &op_desc, const P vector &task_def_list) { GE_CHECK_NOTNULL(op_desc); const char *profiling_mode = std::getenv(kProfilingMode); - bool is_profiling = (profiling_mode != nullptr) || ProfilingManager::Instance().ProfilingOn() || - ProfilingManager::Instance().ProfilingTrainingTraceOn(); + bool is_profiling = (profiling_mode != nullptr) || ProfilingProperties::Instance().ProfilingOn() || + ProfilingProperties::Instance().ProfilingTrainingTraceOn(); bool is_insert_bp_profiling_task = false; (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_INSERT_BP_PROFILILNG_TASK, is_insert_bp_profiling_task); bool is_insert_end_profiling_task = false; diff --git a/ge/graph/build/task_generator.h b/ge/graph/build/task_generator.h index 5d204c3c..f2fa48c9 100755 --- a/ge/graph/build/task_generator.h +++ b/ge/graph/build/task_generator.h @@ -26,7 +26,7 @@ #include "framework/common/types.h" #include "graph/compute_graph.h" #include "graph/model.h" -#include "proto/task.pb.h" +#include "ge_runtime/proto/task.pb.h" #include "runtime/rt.h" namespace ge { diff --git a/ge/graph/execute/graph_execute.cc b/ge/graph/execute/graph_execute.cc index 03abf91f..90db9a8d 100755 --- a/ge/graph/execute/graph_execute.cc +++ b/ge/graph/execute/graph_execute.cc @@ -796,7 +796,7 @@ Status GraphExecutor::GetModelByID(uint32_t model_id, std::shared_ptr davinci_model = nullptr; uint32_t model_id = 0; diff --git a/ge/graph/manager/graph_manager.cc b/ge/graph/manager/graph_manager.cc index fa140bfe..eecf00e7 100755 --- a/ge/graph/manager/graph_manager.cc +++ b/ge/graph/manager/graph_manager.cc @@ -109,7 +109,6 @@ #include "register/custom_pass_helper.h" #include "external/graph/types.h" #include "common/util/error_manager/error_manager.h" -#include "common/profiling/profiling_manager.h" namespace { const char *const kSummary = "Summary"; @@ -462,9 +461,6 @@ Status GraphManager::AddGraph(const GraphId &graph_id, const Graph &graph, const std::map &options, const OmgContext &omg_context) { IncreaseGraphCount(graph_id); - auto device_id = GetContext().DeviceId(); - GELOGD("Device id is %u", device_id); - ProfilingManager::Instance().SetGraphIdToDeviceMap(graph_id, device_id); // validation for adding graphs of same graph_id in multi-thread secenario // 1.previous thread owns same graph_id has finished the AddGraph procession if (GetAddGraphCondition(graph_id) == kDoneAdded) { diff --git a/ge/init/gelib.cc b/ge/init/gelib.cc index 2491715b..22dc9877 100644 --- a/ge/init/gelib.cc +++ b/ge/init/gelib.cc @@ -25,7 +25,6 @@ #include "common/ge/ge_util.h" #include "common/ge/plugin_manager.h" -#include "common/profiling/profiling_manager.h" #include "common/properties_manager.h" #include "framework/common/debug/ge_log.h" #include "framework/common/debug/log.h" @@ -43,6 +42,7 @@ #include "runtime/kernel.h" #include "opskernel_manager/ops_kernel_builder_manager.h" #include "external/runtime/rt_error_codes.h" +#include "common/profiling/profiling_init.h" using Json = nlohmann::json; @@ -194,7 +194,7 @@ Status GELib::SystemInitialize(const map &options) { InitOptions(options); // In train and infer, profiling is always needed. - InitProfiling(this->options_); + //InitProfiling(this->options_); // 1.`is_train_mode_` means case: train // 2.`(!is_train_mode_) && (options_.device_id != kDefaultDeviceIdForInfer)` means case: online infer // these two case with logical device id @@ -206,15 +206,15 @@ Status GELib::SystemInitialize(const map &options) { return status; } -void GELib::InitProfiling(Options &options) { +/*void GELib::InitProfiling(Options &options) { GELOGI("Init Profiling. session Id: %ld, device id:%d ", options.session_id, options.device_id); std::lock_guard lock(status_mutex_); GetContext().Init(); // Profiling init - if (ProfilingManager::Instance().Init(options) != SUCCESS) { - GELOGW("Profiling init failed."); - } -} + //if (ProfilingInit::Instance().Init(options) != SUCCESS) { + //GELOGW("Profiling init failed."); + // } +}*/ void GELib::SetDefaultPrecisionMode(map &new_options) { auto iter = new_options.find(PRECISION_MODE); @@ -496,10 +496,8 @@ Status GELib::Finalize() { void GELib::ShutDownProfiling() { std::lock_guard lock(status_mutex_); - - if (ProfilingManager::Instance().ProfilingOn()) { - ProfilingManager::Instance().StopProfiling(); - ProfilingManager::Instance().PluginUnInit(); + if (ProfilingProperties::Instance().ProfilingOn()) { + ge::ProfilingInit::Instance().StopProfiling(); } } diff --git a/ge/session/inner_session.cc b/ge/session/inner_session.cc index b9c44ef1..e7ee41d4 100755 --- a/ge/session/inner_session.cc +++ b/ge/session/inner_session.cc @@ -36,6 +36,7 @@ #include "runtime/mem.h" #include "ir_build/option_utils.h" #include "common/profiling/profiling_manager.h" +#include "common/profiling/profiling_init.h" namespace ge { namespace { @@ -288,6 +289,9 @@ Status InnerSession::RunGraph(uint32_t graph_id, const std::vector &inpu GELOGI("[InnerSession:%lu] run graph on session, graph_id=%u.", session_id_, graph_id); if (mutex_.try_lock()) { std::lock_guard lock(mutex_, std::adopt_lock); + auto device_id = GetContext().DeviceId(); + GELOGD("device_id is %u", device_id); + ProfilingInit::Instance().SetDeviceIdByModelId(graph_id, device_id); if (!init_flag_) { GELOGE(GE_SESS_INIT_FAILED, "[Run][Graph]failed because GraphManager not Init, InnerSession:%lu, graph_id:%u.", session_id_, graph_id); @@ -339,6 +343,9 @@ Status InnerSession::RunGraphWithStreamAsync(uint32_t graph_id, rtStream_t strea "session id = %lu, graph id = %u, stream = %p.", session_id_, graph_id, stream); return GE_SESS_INIT_FAILED; } + auto device_id = GetContext().DeviceId(); + GELOGD("device_id is %u", device_id); + ProfilingInit::Instance().SetDeviceIdByModelId(graph_id, device_id); UpdateThreadContext(graph_id); vector ge_inputs; for (auto &item : inputs) { @@ -382,6 +389,9 @@ Status InnerSession::RemoveGraph(uint32_t graph_id) { session_id_, graph_id); return GE_SESS_INIT_FAILED; } + auto device_id = GetContext().DeviceId(); + GELOGD("remove device_id is %u", device_id); + ProfilingInit::Instance().UnsetDeviceIdByModelId(graph_id, device_id); UpdateThreadContext(graph_id); Status ret = graph_manager_.RemoveGraph(graph_id); if (ret != SUCCESS) { diff --git a/inc/framework/common/profiling/ge_profiling.h b/inc/framework/common/profiling/ge_profiling.h index c87c082c..b585d023 100644 --- a/inc/framework/common/profiling/ge_profiling.h +++ b/inc/framework/common/profiling/ge_profiling.h @@ -18,32 +18,8 @@ #define INC_FRAMEWORK_COMMON_GE_PROFILING_H_ #include "ge/ge_api_error_codes.h" -#include "toolchain/prof_callback.h" #include "runtime/base.h" -const int MAX_DEV_NUM = 64; - -enum ProfCommandHandleType { - kProfCommandhandleInit = 0, - kProfCommandhandleStart, - kProfCommandhandleStop, - kProfCommandhandleFinalize, - kProfCommandhandleModelSubscribe, - kProfCommandhandleModelUnsubscribe -}; - -struct ProfCommandHandleData { - uint64_t profSwitch; - uint32_t devNums; // length of device id list - uint32_t devIdList[MAX_DEV_NUM]; - uint32_t modelId; -}; - -GE_FUNC_VISIBILITY ge::Status RegProfCtrlCallback(MsprofCtrlCallback func); -GE_FUNC_VISIBILITY ge::Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func); -GE_FUNC_VISIBILITY ge::Status RegProfReporterCallback(MsprofReporterCallback func); -GE_FUNC_VISIBILITY ge::Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len); - /// /// @brief Output the profiling data of single operator in Pytorch, and does not support multithreading /// @return Status result diff --git a/inc/framework/common/profiling/ge_runner_profiling.h b/inc/framework/common/profiling/profiling_init.h similarity index 100% rename from inc/framework/common/profiling/ge_runner_profiling.h rename to inc/framework/common/profiling/profiling_init.h diff --git a/third_party/fwkacllib/inc/runtime/base.h b/third_party/fwkacllib/inc/runtime/base.h index 7fc1cdea..be5e8dbd 100644 --- a/third_party/fwkacllib/inc/runtime/base.h +++ b/third_party/fwkacllib/inc/runtime/base.h @@ -33,6 +33,7 @@ extern "C" { #endif #endif +#define RT_PROF_MAX_DEV_NUM 64 typedef int32_t rtError_t; static const int32_t RT_ERROR_NONE = 0; // success @@ -80,6 +81,13 @@ typedef enum tagRtLimitType { RT_LIMIT_TYPE_LOW_POWER_TIMEOUT = 0, // timeout for power down , ms } rtLimitType_t; +typedef enum { + RT_PROF_CTRL_INVALID = 0, + RT_PROF_CTRL_SWITCH, + RT_PROF_CTRL_REPORTER, + RT_PROF_CTRL_BUTT, +} rtProfCtrlType_t; + typedef struct rtExceptionInfo { uint32_t taskid; uint32_t streamid; @@ -88,6 +96,15 @@ typedef struct rtExceptionInfo { uint32_t retcode; } rtExceptionInfo; +typedef struct rtProfCommandHandle { + uint64_t profSwitch; + uint64_t profSwitchHi; + uint32_t devNums; + uint32_t devList[RT_PROF_MAX_DEV_NUM]; + uint32_t modelId; + uint32_t type; +} rtProfCommandHandle_t; + typedef void (*rtErrorCallback)(rtExceptionType); typedef void (*rtTaskFailCallback)(rtExceptionInfo *exceptionInfo); @@ -118,6 +135,8 @@ typedef void *rtLabel_t; */ typedef void *rtModel_t; +typedef rtError_t (*MsprofCtrlHandle)(rtProfCtrlType_t type, void *data, uint32_t len); + /** * @ingroup profiling_base * @brief runtime handle. @@ -357,6 +376,14 @@ RTS_API rtError_t rtLabelCreateExV2(rtLabel_t *label, rtModel_t model, rtStream_ */ RTS_API rtError_t rtGetTaskIdAndStreamID(uint32_t *taskId, uint32_t *streamId); +RTS_API rtError_t rtProfRegisterCtrlCallback(uint32_t logId, MsprofCtrlHandle callback); + +RTS_API rtError_t rtSetDeviceIdByModelIdx(uint32_t modelIdx, uint32_t &deviceId); + +RTS_API rtError_t rtUnsetDeviceIdByModelIdx(uint32_t modelIdx, uint32_t &deviceId); + +RTS_API rtError_t rtGetDeviceIdByModelIdx(uint32_t modelIdx, uint32_t &deviceId); + #if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE) } #endif diff --git a/third_party/fwkacllib/inc/toolchain/prof_callback.h b/third_party/fwkacllib/inc/toolchain/prof_callback.h index 5073cfb1..969dc1cb 100644 --- a/third_party/fwkacllib/inc/toolchain/prof_callback.h +++ b/third_party/fwkacllib/inc/toolchain/prof_callback.h @@ -114,15 +114,6 @@ enum MsprofCtrlCallbackType { MSPROF_CTRL_PROF_SWITCH_OFF // for prof switch off }; -#define MSPROF_MAX_DEV_NUM (64) - -struct MsprofCommandHandle { - uint64_t profSwitch; - uint32_t devNums; // length of device id list - uint32_t devIdList[MSPROF_MAX_DEV_NUM]; - uint32_t modelId; -}; - /** * @name MsprofCtrlCallback * @brief callback to start/stop profiling