From 82e6f4774f05078820445232eca776d5f4866649 Mon Sep 17 00:00:00 2001 From: yanghaoran Date: Thu, 23 Dec 2021 19:21:13 +0800 Subject: [PATCH] upgrade Ascend package 23 Dec 21 --- inc/framework/common/profiling_definitions.h | 12 +- inc/framework/omg/model_tool.h | 35 ++ metadef | 2 +- third_party/fwkacllib/inc/cce/fwk_adpt_struct.h | 6 +- .../inc/mmpa/sub_inc/mmpa_typedef_linux.h | 3 - .../fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_win.h | 169 ++++---- third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h | 68 ++++ third_party/fwkacllib/inc/ops/reduce_ops.h | 86 ++++ .../fwkacllib/inc/toolchain/prof_callback.h | 19 +- third_party/fwkacllib/inc/toolchain/prof_common.h | 450 +++++++++++++++++++++ 10 files changed, 751 insertions(+), 99 deletions(-) create mode 100644 inc/framework/omg/model_tool.h create mode 100644 third_party/fwkacllib/inc/toolchain/prof_common.h diff --git a/inc/framework/common/profiling_definitions.h b/inc/framework/common/profiling_definitions.h index e6da4ca8..81a50f9d 100644 --- a/inc/framework/common/profiling_definitions.h +++ b/inc/framework/common/profiling_definitions.h @@ -22,6 +22,8 @@ #include #include #include "graph/profiler.h" +#include "external/ge/ge_api_types.h" +#include "toolchain/prof_callback.h" namespace ge { namespace profiling { enum { @@ -46,6 +48,7 @@ enum { kCopyH2D, kProfilingIndexEnd }; +constexpr uint64_t kInvalidHashId = 0ULL; class ProfilingContext { public: @@ -100,9 +103,16 @@ class ProfilingContext { } int64_t RegisterString(const std::string &str); + int64_t RegisterStringHash(const uint64_t hash_id, const std::string &str); + void UpdateElementHashId(MsprofReporterCallback reporter_callback); + static Status QueryHashId(const MsprofReporterCallback reporter_callback, const std::string &src_str, + uint64_t &hash_id); + size_t GetRegisterStringNum() const { + return strings_to_index_.size(); + } private: - void RegisterString(int64_t index, const std::string &str); + void UpdateHashByStr(const std::string &str, const 
uint64_t hash); void Init(); private: diff --git a/inc/framework/omg/model_tool.h b/inc/framework/omg/model_tool.h new file mode 100644 index 00000000..24554e65 --- /dev/null +++ b/inc/framework/omg/model_tool.h @@ -0,0 +1,35 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INC_FRAMEWORK_OMG_MODEL_TOOL_H_ +#define INC_FRAMEWORK_OMG_MODEL_TOOL_H_ + +#include +#include + +#include "framework/common/debug/ge_log.h" +#include "proto/ge_ir.pb.h" + +namespace ge { +class GE_FUNC_VISIBILITY ModelTool { + public: + static Status GetModelInfoFromOm(const char *model_file, ge::proto::ModelDef &model_def, uint32_t &modeldef_size); + + static Status GetModelInfoFromPbtxt(const char *model_file, ge::proto::ModelDef &model_def); +}; +} // namespace ge + +#endif // INC_FRAMEWORK_OMG_MODEL_TOOL_H_ diff --git a/metadef b/metadef index 2659f49d..dc5ac26a 160000 --- a/metadef +++ b/metadef @@ -1 +1 @@ -Subproject commit 2659f49dcb14c0773e10e17ee9896b7be4d8e7be +Subproject commit dc5ac26aac4c49b4e72cd91d4e6d6a57bbe03af4 diff --git a/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h b/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h index ec92a036..5255383f 100644 --- a/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h +++ b/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h @@ -145,9 +145,9 @@ struct ResultSummary { #pragma pack(push, 1) struct AsyncWait { - uint8_t waitType; // wait 
type, FWk_ADPT_WAIT_TPYE_EVENT: event wait - uint32_t waitId; // wait id, GE refresh - uint32_t timeOut; // reserved + uint8_t waitType; // wait type, FWK_ADPT_WAIT_TYPE_EVENT: event wait + uint32_t waitId; // wait id, GE refresh + uint32_t timeOut; // reserved uint64_t reserved; }; #pragma pack(pop) diff --git a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_linux.h b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_linux.h index 9c6f6499..9df5b9ce 100644 --- a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_linux.h +++ b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_linux.h @@ -79,9 +79,6 @@ typedef long LONG; #define MMPA_THREAD_SCHED_OTHER SCHED_OTHER #define MMPA_THREAD_MIN_STACK_SIZE PTHREAD_STACK_MIN -#define MMPA_PATH_SEPARATOR_STR "/" -#define MMPA_PATH_SEPARATOR_CHAR '/' - #define MM_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define MMPA_MAX_NI 19 diff --git a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_win.h b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_win.h index 9f8a72cd..1627d7a9 100644 --- a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_win.h +++ b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_win.h @@ -1,86 +1,83 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef MMPA_TYPEDEF_WIN_H -#define MMPA_TYPEDEF_WIN_H - -#ifdef __cplusplus -#if __cplusplus -extern "C" { -#endif // __cpluscplus -#endif // __cpluscplus - -#ifndef FALSE -#define FALSE 0 -#endif - -#ifndef TRUE -#define TRUE 1 -#endif - -#define EN_OK 0 -#define EN_ERR 1 -#define EN_ERROR (-1) -#define EN_INVALID_PARAM (-2) -#define EN_TIMEOUT (-3) - -#define HANDLE_INVALID_VALUE (-1) -#define INVALID_SOCKET_HANDLE INVALID_SOCKET -#define MMPA_MEM_MAX_LEN (0x7fffffff) -#define MMPA_PROCESS_ERROR (0x7fffffff) - -#define MMPA_ONE_THOUSAND 1000 -#define MMPA_COMPUTER_BEGIN_YEAR 1900 -#define SUMMER_TIME_OR_NOT (-1) -#define MMPA_ZERO 0 -#define MMPA_VALUE_ONE 1 -#define MMPA_SOCKET_MAIN_EDITION 2 -#define MMPA_SOCKET_SECOND_EDITION 0 -#define MMPA_PIPE_BUF_SIZE 1024 -#define MMPA_MAX_SCANDIR_COUNT 1024 -#define MAX_IOVEC_SIZE 32 -#define MMPA_PIPE_COUNT 2 -#define MMPA_THREADNAME_SIZE 16 -#define MMPA_MIN_OS_NAME_SIZE (MAX_COMPUTERNAME_LENGTH + 1) -#define MMPA_MIN_OS_VERSION_SIZE 64 - -#define MMPA_MAX_NI 19 -#define MMPA_MIDDLE_NI 5 -#define MMPA_LOW_NI (-5) -#define MMPA_MIN_NI (-20) -#define MMPA_MAX_FILE 128 - -#define MMPA_PATH_SEPARATOR_STR "\\" -#define MMPA_PATH_SEPARATOR_CHAR '\\' - -#define MMPA_MAX_THREAD_PIO 99 -#define MMPA_MIDDLE_THREAD_PIO 66 -#define MMPA_LOW_THREAD_PIO 33 -#define MMPA_MIN_THREAD_PIO 1 - -#define MMPA_THREAD_SCHED_RR 0 -#define MMPA_THREAD_SCHED_FIFO 0 -#define MMPA_THREAD_SCHED_OTHER 0 -#define MMPA_THREAD_MIN_STACK_SIZE 0 - -#define MM_MUTEX_INITIALIZER NULL - -#ifdef __cplusplus -#if __cplusplus -} -#endif // __cpluscplus -#endif // __cpluscplus -#endif // _MMPA_TYPEDEF_WIN_H_ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MMPA_TYPEDEF_WIN_H +#define MMPA_TYPEDEF_WIN_H + +#ifdef __cplusplus +#if __cplusplus +extern "C" { +#endif // __cpluscplus +#endif // __cpluscplus + +#ifndef FALSE +#define FALSE 0 +#endif + +#ifndef TRUE +#define TRUE 1 +#endif + +#define EN_OK 0 +#define EN_ERR 1 +#define EN_ERROR (-1) +#define EN_INVALID_PARAM (-2) +#define EN_TIMEOUT (-3) + +#define HANDLE_INVALID_VALUE (-1) +#define INVALID_SOCKET_HANDLE INVALID_SOCKET +#define MMPA_MEM_MAX_LEN (0x7fffffff) +#define MMPA_PROCESS_ERROR (0x7fffffff) + +#define MMPA_ONE_THOUSAND 1000 +#define MMPA_COMPUTER_BEGIN_YEAR 1900 +#define SUMMER_TIME_OR_NOT (-1) +#define MMPA_ZERO 0 +#define MMPA_VALUE_ONE 1 +#define MMPA_SOCKET_MAIN_EDITION 2 +#define MMPA_SOCKET_SECOND_EDITION 0 +#define MMPA_PIPE_BUF_SIZE 1024 +#define MMPA_MAX_SCANDIR_COUNT 1024 +#define MAX_IOVEC_SIZE 32 +#define MMPA_PIPE_COUNT 2 +#define MMPA_THREADNAME_SIZE 16 +#define MMPA_MIN_OS_NAME_SIZE (MAX_COMPUTERNAME_LENGTH + 1) +#define MMPA_MIN_OS_VERSION_SIZE 64 + +#define MMPA_MAX_NI 19 +#define MMPA_MIDDLE_NI 5 +#define MMPA_LOW_NI (-5) +#define MMPA_MIN_NI (-20) +#define MMPA_MAX_FILE 128 + +#define MMPA_MAX_THREAD_PIO 99 +#define MMPA_MIDDLE_THREAD_PIO 66 +#define MMPA_LOW_THREAD_PIO 33 +#define MMPA_MIN_THREAD_PIO 1 + +#define MMPA_THREAD_SCHED_RR 0 +#define MMPA_THREAD_SCHED_FIFO 0 +#define MMPA_THREAD_SCHED_OTHER 0 +#define MMPA_THREAD_MIN_STACK_SIZE 0 + +#define MM_MUTEX_INITIALIZER NULL + +#ifdef __cplusplus +#if __cplusplus +} +#endif // __cpluscplus +#endif // __cpluscplus +#endif // _MMPA_TYPEDEF_WIN_H_ 
diff --git a/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h b/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h index 398c6568..7a28a738 100644 --- a/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h @@ -143,6 +143,74 @@ REG_OP(BatchNorm) .OP_END_FACTORY_REG(BatchNorm) /** +* @brief After the mean and reciprocal of standard deviation(invert_std) are separately calculated on each device, +* the mean and reciprocal of standard deviation(invert_std) data on each device are normalized, +* a total mean and reciprocal of standard deviation(invert_std) are returned, and running_var is updated. + +* @par Inputs: +* include: +* @li mean_all: A Tensor. The mean of each device. Must be one of the following types: float16, float32. +* @li invert_std_all: A Tensor. Reciprocal of the variances of each device. Must be one of the following types: float16, float32. +* @li count_all: A Tensor. Number of data for each device. Must be one of the following types: float16, float32. +* @li mean_broadcast: A Tensor. The overall average and broadcast. Must be one of the following types: float16, float32. +* @li count_sum: A Tensor. General statistics. Must be one of the following types: float16, float32. +* @li running_var: A Tensor. Runtime variance. Must be one of the following types: float16, float32. \n + +* @par Attributes: +* Two Attributes, including: +* @li momentum: An optional float. Defaults to 0.1. \n +* @li epsilon: An optional float. Defaults to 0.001. \n + +* @par Outputs: +* include: +* @li invert_std: A Tensor. It's inverse of total variance. +* @li running_var_update: A Tensor. It's moving variance of each device after the update. \n + +* @par Third-party framework compatibility +* ReduceMeanWithCount and SyncBatchNormGatherStatsWithCounts and SyncBNTrainingUpdate +* compatible with the PyTorch operator BatchNormGatherStatsWithCounts. 
+*/ +REG_OP(SyncBatchNormGatherStatsWithCounts) + .INPUT(mean_all, TensorType({DT_FLOAT, DT_FLOAT16})) + .INPUT(invert_std_all, TensorType({DT_FLOAT, DT_FLOAT16})) + .INPUT(count_all, TensorType({DT_FLOAT, DT_FLOAT16})) + .INPUT(mean_broadcast, TensorType({DT_FLOAT, DT_FLOAT16})) + .INPUT(count_sum, TensorType({DT_FLOAT, DT_FLOAT16})) + .INPUT(running_var, TensorType({DT_FLOAT, DT_FLOAT16})) + .OUTPUT(invert_std, TensorType({DT_FLOAT, DT_FLOAT16})) + .OUTPUT(running_var_update, TensorType({DT_FLOAT, DT_FLOAT16})) + .ATTR(momentum, Float, 0.1) + .ATTR(epsilon, Float, 0.001) + .OP_END_FACTORY_REG(SyncBatchNormGatherStatsWithCounts) + +/** +* @brief Updates running_mean. + +* @par Inputs: +* include: +* @li mean: A Tensor. The mean of each device. Must be one of the following types: float16, float32. +* @li running_mean: A Tensor. Runtime Mean. Must be one of the following types: float16, float32. \n + +* @par Attributes: +* One Attribute, including: +* @li momentum: An optional float. Defaults to 0.1. \n + +* @par Outputs: +* include: +* @li running_mean_update: A Tensor. It's moving mean of each device after the update. \n + +* @par Third-party framework compatibility +* ReduceMeanWithCount and SyncBatchNormGatherStatsWithCounts and SyncBNTrainingUpdate +* compatible with the PyTorch operator BatchNormGatherStatsWithCounts. +*/ +REG_OP(SyncBNTrainingUpdate) + .INPUT(mean, TensorType({DT_FLOAT, DT_FLOAT16})) + .INPUT(running_mean, TensorType({DT_FLOAT, DT_FLOAT16})) + .OUTPUT(running_mean_update, TensorType({DT_FLOAT, DT_FLOAT16})) + .ATTR(momentum, Float, 0.1) + .OP_END_FACTORY_REG(SyncBNTrainingUpdate) + +/** *@brief part of SyncBatchNormBackward . 
\n *@par Inputs: diff --git a/third_party/fwkacllib/inc/ops/reduce_ops.h b/third_party/fwkacllib/inc/ops/reduce_ops.h index a273953c..78a423b5 100644 --- a/third_party/fwkacllib/inc/ops/reduce_ops.h +++ b/third_party/fwkacllib/inc/ops/reduce_ops.h @@ -516,6 +516,34 @@ REG_OP(ReduceSumD) .OP_END_FACTORY_REG(ReduceSumD) /** +*@brief Calculates the total mean based on the mean of each device . \n + +*@par Inputs: +* Three inputs, including: +*@li x: A Tensor. Must be one of the following types: float16, float32 . +*@li count: A Tensor. Must be one of the following types: float16, float32 . +*@li count_sum: A Tensor. Must be one of the following types: float16, float32 . \n + +*@par Attributes: +*@li axes: A required 1D list or tuple of int32 or int64. Specifies the dimensions to reduce. +*@li keep_dims: An optional bool. If "true", retains reduced dimensions with length 1. Defaults to "false" . \n + +*@par Outputs: +*y: The reduced tensor. Has the same type and format as input "x" . \n + +*@par Third-party framework compatibility +* Compatible with the TensorFlow operator Sum. +*/ +REG_OP(ReduceMeanWithCount) + .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16})) + .INPUT(count, TensorType({DT_FLOAT, DT_FLOAT16})) + .INPUT(count_sum, TensorType({DT_FLOAT, DT_FLOAT16})) + .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16})) + .REQUIRED_ATTR(axes, ListInt) + .ATTR(keep_dims, Bool, false) + .OP_END_FACTORY_REG(ReduceMeanWithCount) + +/** *@brief Calculates the "logical sum" of elements of a tensor in a dimension . \n *@par Inputs: @@ -1363,6 +1391,64 @@ REG_OP(ReduceStdV2Update) .ATTR(unbiased, Bool, true) .ATTR(keepdim, Bool, false) .OP_END_FACTORY_REG(ReduceStdV2Update) + +/** +*@brief Computes the log of the sum of the exponentials of elements across dimensions of a tensor. +* Reduces "x" along the dimensions given in "axes". +* Unless "keep_dims" is true, the rank of the tensor is reduced by 1 for each +* entry in "axes". 
If "keep_dims" is true, the reduced dimensions +* are retained with length 1. +* +*@par Inputs: +* Two inputs, including: +*@li x: A Tensor. Must be one of the following types: +* float32, float16, int32, int64, uint32, uint64, double +*@li axes: A 1D list or tuple of int32 or int64. Specifies the dimensions to reduce . \n +* +*@par Attributes: +*keep_dims: An optional bool. If "true", retains reduced dimensions with length 1. Defaults to "false" . \n +* +*@par Outputs: +*y: The reduced tensor. Has the same type and format as input "x" . \n +* +*@par Third-party framework compatibility +* Compatible with the ONNX operator ReduceLogSumExp. +*/ +REG_OP(ReduceLogSumExp) + .INPUT(x, TensorType::NumberType()) + .INPUT(axes, TensorType::IndexNumberType()) + .OUTPUT(y, TensorType::NumberType()) + .ATTR(keep_dims, Bool, false) + .OP_END_FACTORY_REG(ReduceLogSumExp) + +/** +*@brief Computes the log of the sum of elements across dimensions of a tensor. +* Reduces "x" along the dimensions given in "axes". +* Unless "keep_dims" is true, the rank of the tensor is reduced by 1 for each +* entry in "axes". If "keep_dims" is true, the reduced dimensions +* are retained with length 1. +* +*@par Inputs: +* Two inputs, including: +*@li x: A Tensor. Must be one of the following types: +* float32, float16, int32, int64, uint32, uint64, double +*@li axes: A 1D list or tuple of int32 or int64. Specifies the dimensions to reduce . \n +* +*@par Attributes: +*keep_dims: An optional bool. If "true", retains reduced dimensions with length 1. Defaults to "false" . \n +* +*@par Outputs: +*y: The reduced tensor. Has the same type and format as input "x" . \n +* +*@par Third-party framework compatibility +* Compatible with the ONNX operator ReduceLogSum. 
+*/ +REG_OP(ReduceLogSum) + .INPUT(x, TensorType::NumberType()) + .INPUT(axes, TensorType::IndexNumberType()) + .OUTPUT(y, TensorType::NumberType()) + .ATTR(keep_dims, Bool, false) + .OP_END_FACTORY_REG(ReduceLogSum) } //namespace ge #endif // OPS_BUILT_IN_OP_PROTO_INC_REDUCE_OPS_H_ diff --git a/third_party/fwkacllib/inc/toolchain/prof_callback.h b/third_party/fwkacllib/inc/toolchain/prof_callback.h index 6398d075..cccf76dc 100644 --- a/third_party/fwkacllib/inc/toolchain/prof_callback.h +++ b/third_party/fwkacllib/inc/toolchain/prof_callback.h @@ -1,8 +1,17 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2019-2021. All rights reserved. - * Description: handle perf data - * Author: xp - * Create: 2019-10-13 +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #ifndef MSPROFILER_PROF_CALLBACK_H_ diff --git a/third_party/fwkacllib/inc/toolchain/prof_common.h b/third_party/fwkacllib/inc/toolchain/prof_common.h new file mode 100644 index 00000000..93a4cff9 --- /dev/null +++ b/third_party/fwkacllib/inc/toolchain/prof_common.h @@ -0,0 +1,450 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2019-2021. All rights reserved. + * Description: handle perf data + * Author: Huawei Technologies Co., Ltd. 
+ * Create: 2019-10-13 + */ +#ifndef MSPROFILER_PROF_COMMON_H_ +#define MSPROFILER_PROF_COMMON_H_ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#include + +#define MSPROF_DATA_HEAD_MAGIC_NUM 0x5a5a + +enum MsprofDataTag { + MSPROF_ACL_DATA_TAG = 0, //acl data tag, range: 0~19 + MSPROF_GE_DATA_TAG_MODEL_LOAD = 20, //ge data tag, range: 20~39 + MSPROF_GE_DATA_TAG_FUSION = 21, + MSPROF_GE_DATA_TAG_INFER = 22, + MSPROF_GE_DATA_TAG_TASK = 23, + MSPROF_GE_DATA_TAG_TENSOR = 24, + MSPROF_GE_DATA_TAG_STEP = 25, + MSPROF_GE_DATA_TAG_ID_MAP = 26, + MSPROF_GE_DATA_TAG_HOST_SCH = 27, + MSPROF_RUNTIME_DATA_TAG_API = 40, //runtime data tag, range: 40~59 + MSPROF_RUNTIME_DATA_TAG_TRACK = 41, + MSPROF_AICPU_DATA_TAG = 60, //aicpu data tag, range: 60~79 + MSPROF_HCCL_DATA_TAG = 80, //hccl data tag, range: 80~99 + MSPROF_DP_DATA_TAG = 100, //dp data tag, range: 100~119 + MSPROF_MSPROFTX_DATA_TAG = 120, //msproftx data tag, range: 120~139 + MSPROF_DATA_TAG_MAX = 65536, //data tag value type is uint16_t +}; + +/** + * @brief struct of mixed data + */ +#define MSPROF_MIX_DATA_RESERVE_BYTES 7 +#define MSPROF_MIX_DATA_STRING_LEN 120 +enum MsprofMixDataType { + MSPROF_MIX_DATA_HASH_ID = 0, + MSPROF_MIX_DATA_STRING, +}; +struct MsprofMixData { + uint8_t type; // MsprofMixDataType + uint8_t rsv[MSPROF_MIX_DATA_RESERVE_BYTES]; + union { + uint64_t hashId; + char dataStr[MSPROF_MIX_DATA_STRING_LEN]; + } data; +}; +using MixData = struct MsprofMixData; + +/** + * @brief profiling command info + */ +#define MSPROF_MAX_DEV_NUM 64 +struct MsprofCommandHandle { + uint64_t profSwitch; + uint64_t profSwitchHi; + uint32_t devNums; + uint32_t devIdList[MSPROF_MAX_DEV_NUM]; + uint32_t modelId; + uint32_t type; +}; + +/** + * @brief struct of data reported by acl + */ +#define MSPROF_ACL_DATA_RESERVE_BYTES 32 +#define MSPROF_ACL_API_NAME_LEN 64 +enum MsprofAclApiType { + MSPROF_ACL_API_TYPE_OP = 1, + MSPROF_ACL_API_TYPE_MODEL, + MSPROF_ACL_API_TYPE_RUNTIME, + MSPROF_ACL_API_TYPE_OTHERS, +}; 
+struct MsprofAclProfData { + uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; + uint16_t dataTag = MSPROF_ACL_DATA_TAG; + uint32_t apiType; // enum MsprofAclApiType + uint64_t beginTime; + uint64_t endTime; + uint32_t processId; + uint32_t threadId; + char apiName[MSPROF_ACL_API_NAME_LEN]; + uint8_t reserve[MSPROF_ACL_DATA_RESERVE_BYTES]; +}; + +/** + * @brief struct of data reported by GE + */ +#define MSPROF_GE_MODELLOAD_DATA_RESERVE_BYTES 104 +struct MsprofGeProfModelLoadData { + uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; + uint16_t dataTag = MSPROF_GE_DATA_TAG_MODEL_LOAD; + uint32_t modelId; + MixData modelName; + uint64_t startTime; + uint64_t endTime; + uint8_t reserve[MSPROF_GE_MODELLOAD_DATA_RESERVE_BYTES]; +}; + +#define MSPROF_GE_FUSION_DATA_RESERVE_BYTES 8 +#define MSPROF_GE_FUSION_OP_NUM 8 +struct MsprofGeProfFusionData { + uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; + uint16_t dataTag = MSPROF_GE_DATA_TAG_FUSION; + uint32_t modelId; + MixData fusionName; + uint64_t inputMemSize; + uint64_t outputMemSize; + uint64_t weightMemSize; + uint64_t workspaceMemSize; + uint64_t totalMemSize; + uint64_t fusionOpNum; + uint64_t fusionOp[MSPROF_GE_FUSION_OP_NUM]; + uint8_t reserve[MSPROF_GE_FUSION_DATA_RESERVE_BYTES]; +}; + +#define MSPROF_GE_INFER_DATA_RESERVE_BYTES 64 +struct MsprofGeProfInferData { + uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; + uint16_t dataTag = MSPROF_GE_DATA_TAG_INFER; + uint32_t modelId; + MixData modelName; + uint32_t requestId; + uint32_t threadId; + uint64_t inputDataStartTime; + uint64_t inputDataEndTime; + uint64_t inferStartTime; + uint64_t inferEndTime; + uint64_t outputDataStartTime; + uint64_t outputDataEndTime; + uint8_t reserve[MSPROF_GE_INFER_DATA_RESERVE_BYTES]; +}; + +#define MSPROF_GE_TASK_DATA_RESERVE_BYTES 16 +#define MSPROF_GE_OP_TYPE_LEN 56 +enum MsprofGeTaskType { + MSPROF_GE_TASK_TYPE_AI_CORE = 0, + MSPROF_GE_TASK_TYPE_AI_CPU, + MSPROF_GE_TASK_TYPE_AIV, +}; +enum MsprofGeShapeType { + 
MSPROF_GE_SHAPE_TYPE_STATIC = 0, + MSPROF_GE_SHAPE_TYPE_DYNAMIC, +}; +struct MsprofGeOpType { + uint8_t type; // MsprofMixDataType + uint8_t rsv[MSPROF_MIX_DATA_RESERVE_BYTES]; + union { + uint64_t hashId; + char dataStr[MSPROF_GE_OP_TYPE_LEN]; + } data; +}; +struct MsprofGeProfTaskData { + uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; + uint16_t dataTag = MSPROF_GE_DATA_TAG_TASK; + uint32_t taskType; // MsprofGeTaskType + MixData opName; + MsprofGeOpType opType; + uint64_t curIterNum; + uint64_t timeStamp; + uint32_t shapeType; // MsprofGeShapeType + uint32_t blockDims; + uint32_t modelId; + uint32_t streamId; + uint32_t taskId; + uint32_t threadId; + uint8_t reserve[MSPROF_GE_TASK_DATA_RESERVE_BYTES]; +}; + +#define MSPROF_GE_TENSOR_DATA_RESERVE_BYTES 8 +#define MSPROF_GE_TENSOR_DATA_SHAPE_LEN 8 +#define MSPROF_GE_TENSOR_DATA_NUM 5 +enum MsprofGeTensorType { + MSPROF_GE_TENSOR_TYPE_INPUT = 0, + MSPROF_GE_TENSOR_TYPE_OUTPUT, +}; +struct MsprofGeTensorData { + uint32_t tensorType; // MsprofGeTensorType + uint32_t format; + uint32_t dataType; + uint32_t shape[MSPROF_GE_TENSOR_DATA_SHAPE_LEN]; +}; + +struct MsprofGeProfTensorData { + uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; + uint16_t dataTag = MSPROF_GE_DATA_TAG_TENSOR; + uint32_t modelId; + uint64_t curIterNum; + uint32_t streamId; + uint32_t taskId; + uint32_t tensorNum; + MsprofGeTensorData tensorData[MSPROF_GE_TENSOR_DATA_NUM]; + uint8_t reserve[MSPROF_GE_TENSOR_DATA_RESERVE_BYTES]; +}; + +#define MSPROF_GE_STEP_DATA_RESERVE_BYTES 27 +enum MsprofGeStepTag { + MSPROF_GE_STEP_TAG_BEGIN = 0, + MSPROF_GE_STEP_TAG_END, +}; +struct MsprofGeProfStepData { + uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; + uint16_t dataTag = MSPROF_GE_DATA_TAG_STEP; + uint32_t modelId; + uint32_t streamId; + uint32_t taskId; + uint64_t timeStamp; + uint64_t curIterNum; + uint32_t threadId; + uint8_t tag; // MsprofGeStepTag + uint8_t reserve[MSPROF_GE_STEP_DATA_RESERVE_BYTES]; +}; + +#define 
MSPROF_GE_ID_MAP_DATA_RESERVE_BYTES 6 +struct MsprofGeProfIdMapData { + uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; + uint16_t dataTag = MSPROF_GE_DATA_TAG_ID_MAP; + uint32_t graphId; + uint32_t modelId; + uint32_t sessionId; + uint64_t timeStamp; + uint16_t mode; + uint8_t reserve[MSPROF_GE_ID_MAP_DATA_RESERVE_BYTES]; +}; + +#define MSPROF_GE_HOST_SCH_DATA_RESERVE_BYTES 24 +struct MsprofGeProfHostSchData { + uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; + uint16_t dataTag = MSPROF_GE_DATA_TAG_HOST_SCH; + uint32_t threadId; // record in start event + uint64_t element; + uint64_t event; + uint64_t startTime; // record in start event + uint64_t endTime; // record in end event + uint8_t reserve[MSPROF_GE_HOST_SCH_DATA_RESERVE_BYTES]; +}; + +/** + * @brief struct of data reported by RunTime + */ +#define MSPROF_RUNTIME_API_DATA_RESERVE_BYTES 106 +#define MSPROF_RUNTIME_TASK_ID_NUM 10 +#define MSPROF_RUNTIME_API_NAME_LEN 64 +struct MsprofRuntimeProfApiData { + uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; + uint16_t dataTag = MSPROF_RUNTIME_DATA_TAG_API; + uint32_t threadId; + uint64_t entryTime; + uint64_t exitTime; + uint64_t dataSize; + uint8_t apiName[MSPROF_RUNTIME_API_NAME_LEN]; + uint32_t retCode; + uint32_t streamId; + uint32_t taskNum; + uint32_t taskId[MSPROF_RUNTIME_TASK_ID_NUM]; + uint16_t memcpyDirection; + uint8_t reserve[MSPROF_RUNTIME_API_DATA_RESERVE_BYTES]; +}; + +#define MSPROF_RUNTIME_TRACK_DATA_RESERVE_BYTES 10 +#define MSPROF_RUNTIME_TRACK_TASK_TYPE_LEN 32 +struct MsprofRuntimeProfTrackData { + uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; + uint16_t dataTag = MSPROF_RUNTIME_DATA_TAG_TRACK; + uint32_t threadId; + uint64_t timeStamp; + char taskType[MSPROF_RUNTIME_TRACK_TASK_TYPE_LEN]; + uint32_t taskId; + uint16_t streamId; + uint8_t reserve[MSPROF_RUNTIME_TRACK_DATA_RESERVE_BYTES]; +}; + +/** + * @brief struct of data reported by AICPU + */ +#define MSPROF_AICPU_DATA_RESERVE_BYTES 9 +struct MsprofAicpuProfData { + uint16_t 
magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; + uint16_t dataTag = MSPROF_AICPU_DATA_TAG; + uint16_t streamId; + uint16_t taskId; + uint64_t runStartTime; + uint64_t runStartTick; + uint64_t computeStartTime; + uint64_t memcpyStartTime; + uint64_t memcpyEndTime; + uint64_t runEndTime; + uint64_t runEndTick; + uint32_t threadId; + uint32_t deviceId; + uint64_t submitTick; + uint64_t scheduleTick; + uint64_t tickBeforeRun; + uint64_t tickAfterRun; + uint32_t kernelType; + uint32_t dispatchTime; + uint32_t totalTime; + uint16_t fftsThreadId; + uint8_t version; + uint8_t reserve[MSPROF_AICPU_DATA_RESERVE_BYTES]; +}; + +/** + * @brief struct of data reported by DP + */ +#define MSPROF_DP_DATA_RESERVE_BYTES 16 +#define MSPROF_DP_DATA_ACTION_LEN 16 +#define MSPROF_DP_DATA_SOURCE_LEN 64 +struct MsprofDpProfData { + uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; + uint16_t dataTag = MSPROF_DP_DATA_TAG; + uint32_t rsv; // Ensure 8-byte alignment + uint64_t timeStamp; + char action[MSPROF_DP_DATA_ACTION_LEN]; + char source[MSPROF_DP_DATA_SOURCE_LEN]; + uint64_t index; + uint64_t size; + uint8_t reserve[MSPROF_DP_DATA_RESERVE_BYTES]; +}; + +/** + * @brief struct of data reported by HCCL + */ +#pragma pack(4) +struct MsprofHcclProfNotify { + uint32_t taskID; + uint64_t notifyID; + uint32_t stage; + uint32_t remoteRank; + uint32_t transportType; + uint32_t role; // role {0: dst, 1:src} + double durationEstimated; +}; + +struct MsprofHcclProfReduce { + uint32_t taskID; + uint64_t src; + uint64_t dst; + uint64_t size; + uint32_t op; // {0: sum, 1: mul, 2: max, 3: min} + uint32_t dataType; // data type {0: INT8, 1: INT16, 2: INT32, 3: FP16, 4:FP32, 5:INT64, 6:UINT64} + uint32_t linkType; // link type {0: 'OnChip', 1: 'HCCS', 2: 'PCIe', 3: 'RoCE'} + uint32_t remoteRank; + uint32_t transportType; // transport type {0: SDMA, 1: RDMA, 2:LOCAL} + uint32_t role; // role {0: dst, 1:src} + double durationEstimated; +}; + +struct MsprofHcclProfRDMA { + uint32_t taskID; + uint64_t src; + 
uint64_t dst; + uint64_t size; + uint64_t notifyID; + uint32_t linkType; // link type {0: 'OnChip', 1: 'HCCS', 2: 'PCIe', 3: 'RoCE'} + uint32_t remoteRank; + uint32_t transportType; // transport type {0: RDMA, 1:SDMA, 2:LOCAL} + uint32_t role; // role {0: dst, 1:src} + uint32_t type; // RDMA type {0: RDMASendNotify, 1:RDMASendPayload} + double durationEstimated; +}; + +struct MsprofHcclProfMemcpy { + uint32_t taskID; + uint64_t src; + uint64_t dst; + uint64_t size; + uint64_t notifyID; + uint32_t linkType; // link type {0: 'OnChip', 1: 'HCCS', 2: 'PCIe', 3: 'RoCE'} + uint32_t remoteRank; + uint32_t transportType; // transport type {0: RDMA, 1:SDMA, 2:LOCAL} + uint32_t role; // role {0: dst, 1:src} + double durationEstimated; +}; + +struct MsprofHcclProfStageStep { + uint32_t rank; + uint32_t rankSize; +}; + +struct MsprofHcclProfFlag { + uint64_t cclTag; + uint64_t groupName; + uint32_t localRank; + uint32_t workFlowMode; +}; + +/** + * @name MsprofHcclProfData + * @brief struct of data reported by hccl + */ +struct MsprofHcclProfData { + uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; + uint16_t dataTag = MSPROF_HCCL_DATA_TAG; + uint32_t planeID; + uint32_t deviceID; + uint32_t streamID; + double ts; + char name[16]; + union { + MsprofHcclProfNotify notify; + MsprofHcclProfReduce reduce; + MsprofHcclProfStageStep stageStep; + MsprofHcclProfMemcpy forMemcpy; + MsprofHcclProfRDMA RDMA; + MsprofHcclProfFlag flag; + } args; +}; +#pragma pack() + +/** + * @name MsprofStampInfo + * @brief struct of data reported by msproftx + */ +struct MsprofStampInfo { + uint16_t magicNumber; + uint16_t dataTag; + uint32_t processId; + uint32_t threadId; + uint32_t category; //marker category + uint32_t eventType; + int32_t payloadType; + union PayloadValue //payload info for marker + { + uint64_t ullValue; + int64_t llValue; + double dValue; + uint32_t uiValue[2]; + int32_t iValue[2]; + float fValue[2]; + } payload; + uint64_t startTime; + uint64_t endTime; + int32_t messageType; 
+ char message[128]; + uint8_t reserve0[4]; + uint8_t reserve1[72]; +}; + +#ifdef __cplusplus +} +#endif + +#endif // MSPROFILER_PROF_COMMON_H_