Pre Merge pull request !2102 from yanghaoran/r1.6

3 years ago · aaa0c3012a
--- a/inc/framework/common/profiling_definitions.h
+++ b/inc/framework/common/profiling_definitions.h
@@ -22,6 +22,8 @@
 #include <mutex>
 #include <unordered_map>
 #include "graph/profiler.h"
 #include "external/ge/ge_api_types.h"
 #include "toolchain/prof_callback.h"
 namespace ge {
 namespace profiling {
 enum {
@@ -46,6 +48,7 @@ enum {
  kCopyH2D,
  kProfilingIndexEnd
 };
 constexpr uint64_t kInvalidHashId = 0ULL;

 class ProfilingContext {
 public:
@@ -100,9 +103,16 @@ class ProfilingContext {
  }

  int64_t RegisterString(const std::string &str);
  int64_t RegisterStringHash(const uint64_t hash_id, const std::string &str);
  void UpdateElementHashId(MsprofReporterCallback reporter_callback);
  static Status QueryHashId(const MsprofReporterCallback reporter_callback, const std::string &src_str,
                            uint64_t &hash_id);
  size_t GetRegisterStringNum() const {
    return strings_to_index_.size();
  }

 private:
  void RegisterString(int64_t index, const std::string &str);
  void UpdateHashByStr(const std::string &str, const uint64_t hash);
  void Init();

 private:
--- a/inc/framework/omg/model_tool.h
+++ b/inc/framework/omg/model_tool.h
@@ -0,0 +1,35 @@
 /**
 * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef INC_FRAMEWORK_OMG_MODEL_TOOL_H_
 #define INC_FRAMEWORK_OMG_MODEL_TOOL_H_

 #include <memory>
 #include <string>

 #include "framework/common/debug/ge_log.h"
 #include "proto/ge_ir.pb.h"

 namespace ge {
 // Collection of static helpers that fill a ge::proto::ModelDef from a model file.
 // NOTE(review): only the declarations are visible in this header; the behavior
 // notes below are inferred from names and signatures - confirm against the
 // implementation.
 class GE_FUNC_VISIBILITY ModelTool {
 public:
  // Fills `model_def` from the file at `model_file` (presumably an offline ".om"
  // model) and reports a size through `modeldef_size`. Returns a Status.
  static Status GetModelInfoFromOm(const char *model_file, ge::proto::ModelDef &model_def, uint32_t &modeldef_size);

  // Fills `model_def` from the file at `model_file` (presumably a protobuf text
  // file). Returns a Status.
  static Status GetModelInfoFromPbtxt(const char *model_file, ge::proto::ModelDef &model_def);
 };
 }  // namespace ge

 #endif  // INC_FRAMEWORK_OMG_MODEL_TOOL_H_
--- a/+ 1
+++ b/+ 1
@@ -1 +1 @@
 Subproject commit 2659f49dcb14c0773e10e17ee9896b7be4d8e7be
 Subproject commit dc5ac26aac4c49b4e72cd91d4e6d6a57bbe03af4
--- a/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h
+++ b/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h
@@ -145,9 +145,9 @@ struct ResultSummary {

 #pragma pack(push, 1)
 // Descriptor of an asynchronous wait. The surrounding pack(push, 1) removes all
 // padding, so the members occupy 1 + 4 + 4 + 8 = 17 consecutive bytes.
 // Each member is declared exactly once; declaring waitType/waitId/timeOut twice
 // is a redeclaration error.
 struct AsyncWait {
  uint8_t waitType;   // wait type, FWK_ADPT_WAIT_TYPE_EVENT: event wait
  uint32_t waitId;    // wait id, GE refresh
  uint32_t timeOut;   // reserved
  uint64_t reserved;
 };
 #pragma pack(pop)
--- a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_linux.h
+++ b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_linux.h
@@ -79,9 +79,6 @@ typedef long LONG;
 #define MMPA_THREAD_SCHED_OTHER SCHED_OTHER
 #define MMPA_THREAD_MIN_STACK_SIZE PTHREAD_STACK_MIN

 #define MMPA_PATH_SEPARATOR_STR "/"
 #define MMPA_PATH_SEPARATOR_CHAR '/'

 #define MM_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER

 #define MMPA_MAX_NI 19
--- a/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_win.h
+++ b/third_party/fwkacllib/inc/mmpa/sub_inc/mmpa_typedef_win.h
@@ -1,86 +1,83 @@
 /**
 * Copyright 2019-2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 // Constant definitions for the Windows flavour of the mmpa layer.
 // This header only defines macros; it declares no types or functions.
 #ifndef MMPA_TYPEDEF_WIN_H
 #define MMPA_TYPEDEF_WIN_H

 #ifdef __cplusplus
 #if __cplusplus
 extern "C" {
 #endif  // __cplusplus
 #endif  // __cplusplus

 // Boolean fallbacks, defined only when the includer has not provided them already.
 #ifndef FALSE
 #define FALSE 0
 #endif

 #ifndef TRUE
 #define TRUE 1
 #endif

 // EN_* status codes.
 #define EN_OK 0
 #define EN_ERR 1
 #define EN_ERROR (-1)
 #define EN_INVALID_PARAM (-2)
 #define EN_TIMEOUT (-3)

 // INVALID_SOCKET is not defined in this header; it must come from a header
 // included earlier (presumably the Windows socket headers - confirm).
 #define HANDLE_INVALID_VALUE (-1)
 #define INVALID_SOCKET_HANDLE INVALID_SOCKET
 #define MMPA_MEM_MAX_LEN (0x7fffffff)
 #define MMPA_PROCESS_ERROR (0x7fffffff)

 // Miscellaneous numeric constants and buffer sizes.
 // MAX_COMPUTERNAME_LENGTH is likewise expected from an earlier include.
 #define MMPA_ONE_THOUSAND 1000
 #define MMPA_COMPUTER_BEGIN_YEAR 1900
 #define SUMMER_TIME_OR_NOT (-1)
 #define MMPA_ZERO 0
 #define MMPA_VALUE_ONE 1
 #define MMPA_SOCKET_MAIN_EDITION 2
 #define MMPA_SOCKET_SECOND_EDITION 0
 #define MMPA_PIPE_BUF_SIZE 1024
 #define MMPA_MAX_SCANDIR_COUNT 1024
 #define MAX_IOVEC_SIZE 32
 #define MMPA_PIPE_COUNT 2
 #define MMPA_THREADNAME_SIZE 16
 #define MMPA_MIN_OS_NAME_SIZE (MAX_COMPUTERNAME_LENGTH + 1)
 #define MMPA_MIN_OS_VERSION_SIZE 64

 // *_NI levels (presumably process nice values - confirm) and the file limit.
 #define MMPA_MAX_NI 19
 #define MMPA_MIDDLE_NI 5
 #define MMPA_LOW_NI (-5)
 #define MMPA_MIN_NI (-20)
 #define MMPA_MAX_FILE 128

 // Path separator as a string literal and as a character literal (backslash).
 #define MMPA_PATH_SEPARATOR_STR "\\"
 #define MMPA_PATH_SEPARATOR_CHAR '\\'

 // *_THREAD_PIO levels (presumably thread priorities - confirm).
 #define MMPA_MAX_THREAD_PIO 99
 #define MMPA_MIDDLE_THREAD_PIO 66
 #define MMPA_LOW_THREAD_PIO 33
 #define MMPA_MIN_THREAD_PIO 1

 // All scheduling-policy macros and the minimum stack size expand to 0 here.
 #define MMPA_THREAD_SCHED_RR 0
 #define MMPA_THREAD_SCHED_FIFO 0
 #define MMPA_THREAD_SCHED_OTHER 0
 #define MMPA_THREAD_MIN_STACK_SIZE 0

 #define MM_MUTEX_INITIALIZER NULL

 #ifdef __cplusplus
 #if __cplusplus
 }
 #endif  // __cplusplus
 #endif  // __cplusplus
 #endif  // MMPA_TYPEDEF_WIN_H
 /**
 * Copyright 2019-2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 // Constant definitions for the Windows flavour of the mmpa layer.
 // This header only defines macros; it declares no types or functions.
 #ifndef MMPA_TYPEDEF_WIN_H
 #define MMPA_TYPEDEF_WIN_H

 #ifdef __cplusplus
 #if __cplusplus
 extern "C" {
 #endif  // __cplusplus
 #endif  // __cplusplus

 // Boolean fallbacks, defined only when the includer has not provided them already.
 #ifndef FALSE
 #define FALSE 0
 #endif

 #ifndef TRUE
 #define TRUE 1
 #endif

 // EN_* status codes.
 #define EN_OK 0
 #define EN_ERR 1
 #define EN_ERROR (-1)
 #define EN_INVALID_PARAM (-2)
 #define EN_TIMEOUT (-3)

 // INVALID_SOCKET is not defined in this header; it must come from a header
 // included earlier (presumably the Windows socket headers - confirm).
 #define HANDLE_INVALID_VALUE (-1)
 #define INVALID_SOCKET_HANDLE INVALID_SOCKET
 #define MMPA_MEM_MAX_LEN (0x7fffffff)
 #define MMPA_PROCESS_ERROR (0x7fffffff)

 // Miscellaneous numeric constants and buffer sizes.
 // MAX_COMPUTERNAME_LENGTH is likewise expected from an earlier include.
 #define MMPA_ONE_THOUSAND 1000
 #define MMPA_COMPUTER_BEGIN_YEAR 1900
 #define SUMMER_TIME_OR_NOT (-1)
 #define MMPA_ZERO 0
 #define MMPA_VALUE_ONE 1
 #define MMPA_SOCKET_MAIN_EDITION 2
 #define MMPA_SOCKET_SECOND_EDITION 0
 #define MMPA_PIPE_BUF_SIZE 1024
 #define MMPA_MAX_SCANDIR_COUNT 1024
 #define MAX_IOVEC_SIZE 32
 #define MMPA_PIPE_COUNT 2
 #define MMPA_THREADNAME_SIZE 16
 #define MMPA_MIN_OS_NAME_SIZE (MAX_COMPUTERNAME_LENGTH + 1)
 #define MMPA_MIN_OS_VERSION_SIZE 64

 // *_NI levels (presumably process nice values - confirm) and the file limit.
 #define MMPA_MAX_NI 19
 #define MMPA_MIDDLE_NI 5
 #define MMPA_LOW_NI (-5)
 #define MMPA_MIN_NI (-20)
 #define MMPA_MAX_FILE 128

 // *_THREAD_PIO levels (presumably thread priorities - confirm).
 #define MMPA_MAX_THREAD_PIO 99
 #define MMPA_MIDDLE_THREAD_PIO 66
 #define MMPA_LOW_THREAD_PIO 33
 #define MMPA_MIN_THREAD_PIO 1

 // All scheduling-policy macros and the minimum stack size expand to 0 here.
 #define MMPA_THREAD_SCHED_RR 0
 #define MMPA_THREAD_SCHED_FIFO 0
 #define MMPA_THREAD_SCHED_OTHER 0
 #define MMPA_THREAD_MIN_STACK_SIZE 0

 #define MM_MUTEX_INITIALIZER NULL

 #ifdef __cplusplus
 #if __cplusplus
 }
 #endif  // __cplusplus
 #endif  // __cplusplus
 #endif  // MMPA_TYPEDEF_WIN_H
--- a/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h
@@ -143,6 +143,74 @@ REG_OP(BatchNorm)
    .OP_END_FACTORY_REG(BatchNorm)

 /**
 * @brief After the mean and the reciprocal of standard deviation (invert_std) have been calculated
 * separately on each device, the per-device mean and invert_std data are normalized;
 * the total reciprocal of standard deviation (invert_std) is output and running_var is updated.

 * @par Inputs:
 * include:
 * @li mean_all: A Tensor. The mean of each device. Must be one of the following types: float16, float32.
 * @li invert_std_all: A Tensor. Reciprocal of the variances of each device. Must be one of the following types: float16, float32.
 * @li count_all: A Tensor. Number of data for each device. Must be one of the following types: float16, float32.
 * @li mean_broadcast: A Tensor. The overall average and broadcast. Must be one of the following types: float16, float32.
 * @li count_sum: A Tensor. General statistics. Must be one of the following types: float16, float32.
 * @li running_var: A Tensor. Runtime variance. Must be one of the following types: float16, float32. \n

 * @par Attributes:
 * Two Attributes, including:
 * @li momentum: An optional float. Defaults to 0.1. \n
 * @li epsilon: An optional float. Defaults to 0.001. \n
 * NOTE(review): the defaults above are the values registered by the ATTR lines below;
 * this text used to state 0.01 and 0.00001 - confirm which values are intended.

 * @par Outputs:
 * include:
 * @li invert_std: A Tensor. It's inverse of total variance.
 * @li running_var_update: A Tensor. It's moving variance of each device after the update. \n

 * @par Third-party framework compatibility
 * ReduceMeanWithCount and SyncBatchNormGatherStatsWithCounts and SyncBNTrainingUpdate
 * compatible with the Pytorch operator BatchNormGatherStatsWithCounts.
 */
 REG_OP(SyncBatchNormGatherStatsWithCounts)
    .INPUT(mean_all, TensorType({DT_FLOAT, DT_FLOAT16}))
    .INPUT(invert_std_all, TensorType({DT_FLOAT, DT_FLOAT16}))
    .INPUT(count_all, TensorType({DT_FLOAT, DT_FLOAT16}))
    .INPUT(mean_broadcast, TensorType({DT_FLOAT, DT_FLOAT16}))
    .INPUT(count_sum, TensorType({DT_FLOAT, DT_FLOAT16}))
    .INPUT(running_var, TensorType({DT_FLOAT, DT_FLOAT16}))
    .OUTPUT(invert_std, TensorType({DT_FLOAT, DT_FLOAT16}))
    .OUTPUT(running_var_update, TensorType({DT_FLOAT, DT_FLOAT16}))
    .ATTR(momentum, Float, 0.1)
    .ATTR(epsilon, Float, 0.001)
    .OP_END_FACTORY_REG(SyncBatchNormGatherStatsWithCounts)

 /**
 * @brief Updates running_mean.

 * @par Inputs:
 * include:
 * @li mean: A Tensor. The mean of each device. Must be one of the following types: float16, float32.
 * @li running_mean: A Tensor. Runtime Mean. Must be one of the following types: float16, float32. \n

 * @par Attributes:
 * One Attribute, including:
 * @li momentum: An optional float. Defaults to 0.1. \n
 * NOTE(review): 0.1 is the value registered by the ATTR line below; this text used to
 * state 0.01 - confirm which value is intended.

 * @par Outputs:
 * include:
 * @li running_mean_update: A Tensor. It's moving mean of each device after the update. \n

 * @par Third-party framework compatibility
 * ReduceMeanWithCount and SyncBatchNormGatherStatsWithCounts and SyncBNTrainingUpdate
 * compatible with the Pytorch operator BatchNormGatherStatsWithCounts.
 */
 REG_OP(SyncBNTrainingUpdate)
    .INPUT(mean, TensorType({DT_FLOAT, DT_FLOAT16}))
    .INPUT(running_mean, TensorType({DT_FLOAT, DT_FLOAT16}))
    .OUTPUT(running_mean_update, TensorType({DT_FLOAT, DT_FLOAT16}))
    .ATTR(momentum, Float, 0.1)
    .OP_END_FACTORY_REG(SyncBNTrainingUpdate)

 /**
 *@brief part of SyncBatchNormBackward . \n

 *@par Inputs:
--- a/third_party/fwkacllib/inc/ops/reduce_ops.h
+++ b/third_party/fwkacllib/inc/ops/reduce_ops.h
@@ -516,6 +516,34 @@ REG_OP(ReduceSumD)
    .OP_END_FACTORY_REG(ReduceSumD)

 /**
 *@brief Calculate the total mean based on the mean of each device . \n

 *@par Inputs:
 * Three inputs, including:
 *@li x: A Tensor. Must be one of the following types: float16, float32 .
 *@li count: A Tensor. Must be one of the following types: float16, float32 .
 *@li count_sum: A Tensor. Must be one of the following types: float16, float32 . \n

 *@par Attributes:
 *@li axes: A required 1D list or tuple of int32 or int64. Specifies the dimensions to reduce.
 *@li keep_dims: An optional bool. If "true", retains reduced dimensions with length 1. Defaults to "false" . \n

 *@par Outputs:
 *y: The reduced tensor. Has the same type and format as input "x" . \n

 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator Sum.
 * NOTE(review): the compatibility line above may have been copied from a sum-reduction
 * operator - confirm that it applies to this mean-with-count operator.
 */
 REG_OP(ReduceMeanWithCount)
    .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
    .INPUT(count, TensorType({DT_FLOAT, DT_FLOAT16}))
    .INPUT(count_sum, TensorType({DT_FLOAT, DT_FLOAT16}))
    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
    .REQUIRED_ATTR(axes, ListInt)
    .ATTR(keep_dims, Bool, false)
    .OP_END_FACTORY_REG(ReduceMeanWithCount)

 /**
 *@brief Calculates the "logical sum" of elements of a tensor in a dimension . \n

 *@par Inputs:
@@ -1363,6 +1391,64 @@ REG_OP(ReduceStdV2Update)
    .ATTR(unbiased, Bool, true)
    .ATTR(keepdim, Bool, false)
    .OP_END_FACTORY_REG(ReduceStdV2Update)

 /**
 *@brief Computes the log of the sum of the exponentials of elements across dimensions of a tensor.
 * Reduces "x" along the dimensions given in "axes".
 * Unless "keep_dims" is true, the rank of the tensor is reduced by 1 for each
 * entry in "axes". If "keep_dims" is true, the reduced dimensions
 * are retained with length 1.
 *
 *@par Inputs:
 * Two inputs, including:
 *@li x: A Tensor. Must be one of the following types:
 *     float32, float16, int32, int64, uint32, uint64, double
 *     (registered as TensorType::NumberType(); NOTE(review): confirm the list matches that set)
 *@li axes: A 1D list or tuple of int32 or int64. Specifies the dimensions to reduce . \n
 *
 *@par Attributes:
 *keep_dims: An optional bool. If "true", retains reduced dimensions with length 1. Defaults to "false" . \n
 *
 *@par Outputs:
 *y: The reduced tensor. Has the same type and format as input "x" . \n
 *
 *@par Third-party framework compatibility
 * Compatible with the Onnx operator ReduceLogSumExp.
 */
 REG_OP(ReduceLogSumExp)
    .INPUT(x, TensorType::NumberType())
    .INPUT(axes, TensorType::IndexNumberType())
    .OUTPUT(y, TensorType::NumberType())
    .ATTR(keep_dims, Bool, false)
    .OP_END_FACTORY_REG(ReduceLogSumExp)

 /**
 *@brief Computes the log of the sum of elements across dimensions of a tensor.
 * Reduces "x" along the dimensions given in "axes".
 * Unless "keep_dims" is true, the rank of the tensor is reduced by 1 for each
 * entry in "axes". If "keep_dims" is true, the reduced dimensions
 * are retained with length 1.
 *
 *@par Inputs:
 * Two inputs, including:
 *@li x: A Tensor. Must be one of the following types:
 *     float32, float16, int32, int64, uint32, uint64, double
 *     (registered as TensorType::NumberType(); NOTE(review): confirm the list matches that set)
 *@li axes: A 1D list or tuple of int32 or int64. Specifies the dimensions to reduce . \n
 *
 *@par Attributes:
 *keep_dims: An optional bool. If "true", retains reduced dimensions with length 1. Defaults to "false" . \n
 *
 *@par Outputs:
 *y: The reduced tensor. Has the same type and format as input "x" . \n
 *
 *@par Third-party framework compatibility
 * Compatible with the Onnx operator ReduceLogSum.
 */
 REG_OP(ReduceLogSum)
    .INPUT(x, TensorType::NumberType())
    .INPUT(axes, TensorType::IndexNumberType())
    .OUTPUT(y, TensorType::NumberType())
    .ATTR(keep_dims, Bool, false)
    .OP_END_FACTORY_REG(ReduceLogSum)
 } //namespace ge

 #endif  // OPS_BUILT_IN_OP_PROTO_INC_REDUCE_OPS_H_
--- a/third_party/fwkacllib/inc/toolchain/prof_callback.h
+++ b/third_party/fwkacllib/inc/toolchain/prof_callback.h
@@ -1,8 +1,17 @@
 /*
 * Copyright (c) Huawei Technologies Co., Ltd. 2019-2021. All rights reserved.
 * Description: handle perf data
 * Author: xp
 * Create: 2019-10-13
 /**
 * Copyright 2019-2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MSPROFILER_PROF_CALLBACK_H_
--- a/third_party/fwkacllib/inc/toolchain/prof_common.h
+++ b/third_party/fwkacllib/inc/toolchain/prof_common.h
@@ -0,0 +1,450 @@
 /*
 * Copyright (c) Huawei Technologies Co., Ltd. 2019-2021. All rights reserved.
 * Description: handle perf data
 * Author: Huawei Technologies Co., Ltd.
 * Create: 2019-10-13
 */
 #ifndef MSPROFILER_PROF_COMMON_H_
 #define MSPROFILER_PROF_COMMON_H_

 #ifdef __cplusplus
 extern "C" {
 #endif // __cplusplus

 #include <stdint.h>

 #define MSPROF_DATA_HEAD_MAGIC_NUM  0x5a5a

 // Values used for the `dataTag` field of the report records in this header.
 // Each reporting component is assigned a range of 20 consecutive values.
 enum MsprofDataTag {
     MSPROF_ACL_DATA_TAG = 0,            //acl data tag, range: 0~19
     MSPROF_GE_DATA_TAG_MODEL_LOAD = 20, //ge data tag, range: 20~39
     MSPROF_GE_DATA_TAG_FUSION = 21,
     MSPROF_GE_DATA_TAG_INFER = 22,
     MSPROF_GE_DATA_TAG_TASK = 23,
     MSPROF_GE_DATA_TAG_TENSOR = 24,
     MSPROF_GE_DATA_TAG_STEP = 25,
     MSPROF_GE_DATA_TAG_ID_MAP = 26,
     MSPROF_GE_DATA_TAG_HOST_SCH = 27,
     MSPROF_RUNTIME_DATA_TAG_API = 40,   //runtime data tag, range: 40~59
     MSPROF_RUNTIME_DATA_TAG_TRACK = 41,
     MSPROF_AICPU_DATA_TAG = 60,         //aicpu data tag, range: 60~79
     MSPROF_HCCL_DATA_TAG = 80,          //hccl data tag, range: 80~99
     MSPROF_DP_DATA_TAG = 100,           //dp data tag, range: 100~119
     MSPROF_MSPROFTX_DATA_TAG = 120,     //msproftx data tag, range: 120~139
     // NOTE(review): 65536 itself does not fit in uint16_t, so this looks like an
     // exclusive upper bound rather than a usable tag - confirm.
     MSPROF_DATA_TAG_MAX = 65536,        //data tag value type is uint16_t
 };

 /**
 * @brief struct of mixed data
 *
 * A value that is carried either as a 64-bit hash id or as an inline character
 * array; the `type` byte holds a MsprofMixDataType telling which union member applies.
 * Declared members sum to 1 + 7 + 120 = 128 bytes.
 */
 #define MSPROF_MIX_DATA_RESERVE_BYTES 7
 #define MSPROF_MIX_DATA_STRING_LEN 120
 enum MsprofMixDataType {
     MSPROF_MIX_DATA_HASH_ID = 0,
     MSPROF_MIX_DATA_STRING,
 };
 struct MsprofMixData {
     uint8_t type;  // MsprofMixDataType
     uint8_t rsv[MSPROF_MIX_DATA_RESERVE_BYTES];  // reserved bytes between `type` and the union
     union {
         uint64_t hashId;
         char dataStr[MSPROF_MIX_DATA_STRING_LEN];
     } data;
 };
 // Alias-declaration: C++-only syntax, even though the header is wrapped in extern "C".
 using MixData = struct MsprofMixData;

 /**
 * @brief profiling command info
 *
 * Carries a 128-bit switch value (split into low/high 64-bit words), a device
 * id list with room for MSPROF_MAX_DEV_NUM entries, a model id and a type.
 */
 #define MSPROF_MAX_DEV_NUM 64
 struct MsprofCommandHandle {
     uint64_t profSwitch;
     uint64_t profSwitchHi;
     uint32_t devNums;    // presumably the number of valid entries in devIdList - confirm
     uint32_t devIdList[MSPROF_MAX_DEV_NUM];
     uint32_t modelId;
     uint32_t type;
 };

 /**
 * @brief struct of data reported by acl
 *
 * Declared members sum to 128 bytes; `reserve` fills the tail of the record.
 */
 #define MSPROF_ACL_DATA_RESERVE_BYTES 32
 #define MSPROF_ACL_API_NAME_LEN 64
 // Values for MsprofAclProfData::apiType; numbering starts at 1.
 enum MsprofAclApiType {
     MSPROF_ACL_API_TYPE_OP = 1,
     MSPROF_ACL_API_TYPE_MODEL,
     MSPROF_ACL_API_TYPE_RUNTIME,
     MSPROF_ACL_API_TYPE_OTHERS,
 };
 struct MsprofAclProfData {
     // Header fields are pre-set through default member initializers (C++ only).
     uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
     uint16_t dataTag = MSPROF_ACL_DATA_TAG;
     uint32_t apiType;       // enum MsprofAclApiType
     uint64_t beginTime;
     uint64_t endTime;
     uint32_t processId;
     uint32_t threadId;
     char apiName[MSPROF_ACL_API_NAME_LEN];
     uint8_t  reserve[MSPROF_ACL_DATA_RESERVE_BYTES];
 };

 /**
 * @brief struct of data reported by GE
 *
 * Model-load record. Declared members sum to 256 bytes; `reserve` fills the tail.
 */
 #define MSPROF_GE_MODELLOAD_DATA_RESERVE_BYTES 104
 struct MsprofGeProfModelLoadData {
     // Header fields are pre-set through default member initializers (C++ only).
     uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
     uint16_t dataTag = MSPROF_GE_DATA_TAG_MODEL_LOAD;
     uint32_t modelId;
     MixData  modelName;     // hash id or inline string, see MsprofMixData
     uint64_t startTime;
     uint64_t endTime;
     uint8_t  reserve[MSPROF_GE_MODELLOAD_DATA_RESERVE_BYTES];
 };

 // GE fusion record: memory sizes of a fused op plus up to MSPROF_GE_FUSION_OP_NUM
 // 64-bit entries in `fusionOp`. Declared members sum to 256 bytes.
 #define MSPROF_GE_FUSION_DATA_RESERVE_BYTES 8
 #define MSPROF_GE_FUSION_OP_NUM 8
 struct MsprofGeProfFusionData {
     // Header fields are pre-set through default member initializers (C++ only).
     uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
     uint16_t dataTag = MSPROF_GE_DATA_TAG_FUSION;
     uint32_t modelId;
     MixData  fusionName;    // hash id or inline string, see MsprofMixData
     uint64_t inputMemSize;
     uint64_t outputMemSize;
     uint64_t weightMemSize;
     uint64_t workspaceMemSize;
     uint64_t totalMemSize;
     uint64_t fusionOpNum;   // presumably the number of valid entries in fusionOp - confirm
     uint64_t fusionOp[MSPROF_GE_FUSION_OP_NUM];
     uint8_t  reserve[MSPROF_GE_FUSION_DATA_RESERVE_BYTES];
 };

 // GE inference record: start/end timestamps for the input-data, inference and
 // output-data phases of one request. Declared members sum to 256 bytes.
 #define MSPROF_GE_INFER_DATA_RESERVE_BYTES 64
 struct MsprofGeProfInferData {
     // Header fields are pre-set through default member initializers (C++ only).
     uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
     uint16_t dataTag = MSPROF_GE_DATA_TAG_INFER;
     uint32_t modelId;
     MixData  modelName;     // hash id or inline string, see MsprofMixData
     uint32_t requestId;
     uint32_t threadId;
     uint64_t inputDataStartTime;
     uint64_t inputDataEndTime;
     uint64_t inferStartTime;
     uint64_t inferEndTime;
     uint64_t outputDataStartTime;
     uint64_t outputDataEndTime;
     uint8_t  reserve[MSPROF_GE_INFER_DATA_RESERVE_BYTES];
 };

 // GE task record and the helper types it uses.
 #define MSPROF_GE_TASK_DATA_RESERVE_BYTES 16
 #define MSPROF_GE_OP_TYPE_LEN 56
 // Values for MsprofGeProfTaskData::taskType.
 enum MsprofGeTaskType {
     MSPROF_GE_TASK_TYPE_AI_CORE = 0,
     MSPROF_GE_TASK_TYPE_AI_CPU,
     MSPROF_GE_TASK_TYPE_AIV,
 };
 // Values for MsprofGeProfTaskData::shapeType.
 enum MsprofGeShapeType {
     MSPROF_GE_SHAPE_TYPE_STATIC = 0,
     MSPROF_GE_SHAPE_TYPE_DYNAMIC,
 };
 // Same shape as MsprofMixData but with a shorter inline string
 // (MSPROF_GE_OP_TYPE_LEN bytes); declared members sum to 64 bytes.
 struct MsprofGeOpType {
     uint8_t type;  // MsprofMixDataType
     uint8_t rsv[MSPROF_MIX_DATA_RESERVE_BYTES];
     union {
         uint64_t hashId;
         char dataStr[MSPROF_GE_OP_TYPE_LEN];
     } data;
 };
 // Declared members sum to 256 bytes; `reserve` fills the tail.
 struct MsprofGeProfTaskData {
     // Header fields are pre-set through default member initializers (C++ only).
     uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
     uint16_t dataTag = MSPROF_GE_DATA_TAG_TASK;
     uint32_t taskType;      // MsprofGeTaskType
     MixData  opName;
     MsprofGeOpType opType;
     uint64_t curIterNum;
     uint64_t timeStamp;
     uint32_t shapeType;     // MsprofGeShapeType
     uint32_t blockDims;
     uint32_t modelId;
     uint32_t streamId;
     uint32_t taskId;
     uint32_t threadId;
     uint8_t  reserve[MSPROF_GE_TASK_DATA_RESERVE_BYTES];
 };

 // GE tensor record: describes up to MSPROF_GE_TENSOR_DATA_NUM tensors of a task.
 #define MSPROF_GE_TENSOR_DATA_RESERVE_BYTES 8
 #define MSPROF_GE_TENSOR_DATA_SHAPE_LEN 8
 #define MSPROF_GE_TENSOR_DATA_NUM 5
 // Values for MsprofGeTensorData::tensorType.
 enum MsprofGeTensorType {
     MSPROF_GE_TENSOR_TYPE_INPUT = 0,
     MSPROF_GE_TENSOR_TYPE_OUTPUT,
 };
 // One tensor description; the shape array holds MSPROF_GE_TENSOR_DATA_SHAPE_LEN entries.
 // Declared members sum to 44 bytes.
 struct MsprofGeTensorData {
     uint32_t tensorType;    // MsprofGeTensorType
     uint32_t format;
     uint32_t dataType;
     uint32_t shape[MSPROF_GE_TENSOR_DATA_SHAPE_LEN];
 };

 // Declared members sum to 256 bytes; `reserve` fills the tail.
 struct MsprofGeProfTensorData {
     // Header fields are pre-set through default member initializers (C++ only).
     uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
     uint16_t dataTag = MSPROF_GE_DATA_TAG_TENSOR;
     uint32_t modelId;
     uint64_t curIterNum;
     uint32_t streamId;
     uint32_t taskId;
     uint32_t tensorNum;     // presumably the number of valid entries in tensorData - confirm
     MsprofGeTensorData tensorData[MSPROF_GE_TENSOR_DATA_NUM];
     uint8_t  reserve[MSPROF_GE_TENSOR_DATA_RESERVE_BYTES];
 };

 // GE step record: marks the begin or end of a step (see MsprofGeStepTag).
 // Declared members sum to 64 bytes; `reserve` fills the tail.
 #define MSPROF_GE_STEP_DATA_RESERVE_BYTES 27
 // Values for MsprofGeProfStepData::tag.
 enum MsprofGeStepTag {
     MSPROF_GE_STEP_TAG_BEGIN = 0,
     MSPROF_GE_STEP_TAG_END,
 };
 struct MsprofGeProfStepData {
     // Header fields are pre-set through default member initializers (C++ only).
     uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
     uint16_t dataTag = MSPROF_GE_DATA_TAG_STEP;
     uint32_t modelId;
     uint32_t streamId;
     uint32_t taskId;
     uint64_t timeStamp;
     uint64_t curIterNum;
     uint32_t threadId;
     uint8_t  tag;           // MsprofGeStepTag
     uint8_t  reserve[MSPROF_GE_STEP_DATA_RESERVE_BYTES];
 };

 // GE id-map record: ties a graph id, model id and session id together with a
 // timestamp and a mode value. Declared members sum to 32 bytes.
 #define MSPROF_GE_ID_MAP_DATA_RESERVE_BYTES 6
 struct MsprofGeProfIdMapData {
     // Header fields are pre-set through default member initializers (C++ only).
     uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
     uint16_t dataTag = MSPROF_GE_DATA_TAG_ID_MAP;
     uint32_t graphId;
     uint32_t modelId;
     uint32_t sessionId;
     uint64_t timeStamp;
     uint16_t mode;
     uint8_t  reserve[MSPROF_GE_ID_MAP_DATA_RESERVE_BYTES];
 };

 // GE host-scheduling record: one element/event pair with its start and end time.
 // Declared members sum to 64 bytes; `reserve` fills the tail.
 #define MSPROF_GE_HOST_SCH_DATA_RESERVE_BYTES 24
 struct MsprofGeProfHostSchData {
     // Header fields are pre-set through default member initializers (C++ only).
     uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
     uint16_t dataTag = MSPROF_GE_DATA_TAG_HOST_SCH;
     uint32_t threadId;      // record in start event
     uint64_t element;
     uint64_t event;
     uint64_t startTime;     // record in start event
     uint64_t endTime;       // record in end event
     uint8_t  reserve[MSPROF_GE_HOST_SCH_DATA_RESERVE_BYTES];
 };

 /**
 * @brief struct of data reported by RunTime
 *
 * API-call record. Declared members sum to 256 bytes; `reserve` fills the tail.
 */
 #define MSPROF_RUNTIME_API_DATA_RESERVE_BYTES 106
 #define MSPROF_RUNTIME_TASK_ID_NUM 10
 #define MSPROF_RUNTIME_API_NAME_LEN 64
 struct MsprofRuntimeProfApiData {
     // Header fields are pre-set through default member initializers (C++ only).
     uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
     uint16_t dataTag = MSPROF_RUNTIME_DATA_TAG_API;
     uint32_t threadId;
     uint64_t entryTime;
     uint64_t exitTime;
     uint64_t dataSize;
     uint8_t  apiName[MSPROF_RUNTIME_API_NAME_LEN];  // note: uint8_t here, whereas the acl record uses char
     uint32_t retCode;
     uint32_t streamId;
     uint32_t taskNum;       // presumably the number of valid entries in taskId - confirm
     uint32_t taskId[MSPROF_RUNTIME_TASK_ID_NUM];
     uint16_t memcpyDirection;
     uint8_t  reserve[MSPROF_RUNTIME_API_DATA_RESERVE_BYTES];
 };

 // Runtime track record: a timestamped task of a given type on a stream.
 // Declared members sum to 64 bytes; `reserve` fills the tail.
 #define MSPROF_RUNTIME_TRACK_DATA_RESERVE_BYTES 10
 #define MSPROF_RUNTIME_TRACK_TASK_TYPE_LEN 32
 struct MsprofRuntimeProfTrackData {
     // Header fields are pre-set through default member initializers (C++ only).
     uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
     uint16_t dataTag = MSPROF_RUNTIME_DATA_TAG_TRACK;
     uint32_t threadId;
     uint64_t timeStamp;
     char taskType[MSPROF_RUNTIME_TRACK_TASK_TYPE_LEN];
     uint32_t taskId;
     uint16_t streamId;      // 16-bit here, whereas the API record stores a 32-bit streamId
     uint8_t  reserve[MSPROF_RUNTIME_TRACK_DATA_RESERVE_BYTES];
 };

 /**
 * @brief struct of data reported by AICPU
 *
 * Declared members sum to 128 bytes; `reserve` fills the tail.
 */
 #define MSPROF_AICPU_DATA_RESERVE_BYTES 9
 struct MsprofAicpuProfData {
     // Header fields are pre-set through default member initializers (C++ only).
     uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
     uint16_t dataTag = MSPROF_AICPU_DATA_TAG;
     uint16_t streamId;
     uint16_t taskId;
     uint64_t runStartTime;
     uint64_t runStartTick;
     uint64_t computeStartTime;
     uint64_t memcpyStartTime;
     uint64_t memcpyEndTime;
     uint64_t runEndTime;
     uint64_t runEndTick;
     uint32_t threadId;
     uint32_t deviceId;
     uint64_t submitTick;
     uint64_t scheduleTick;
     uint64_t tickBeforeRun;
     uint64_t tickAfterRun;
     uint32_t kernelType;
     uint32_t dispatchTime;
     uint32_t totalTime;
     uint16_t fftsThreadId;
     uint8_t  version;
     uint8_t  reserve[MSPROF_AICPU_DATA_RESERVE_BYTES];
 };

 /**
 * @brief struct of data reported by DP
 *
 * Declared members sum to 128 bytes; `reserve` fills the tail.
 */
 #define MSPROF_DP_DATA_RESERVE_BYTES 16
 #define MSPROF_DP_DATA_ACTION_LEN 16
 #define MSPROF_DP_DATA_SOURCE_LEN 64
 struct MsprofDpProfData {
     // Header fields are pre-set through default member initializers (C++ only).
     uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
     uint16_t dataTag = MSPROF_DP_DATA_TAG;
     uint32_t rsv;   // Ensure 8-byte alignment
     uint64_t timeStamp;
     char action[MSPROF_DP_DATA_ACTION_LEN];
     char source[MSPROF_DP_DATA_SOURCE_LEN];
     uint64_t index;
     uint64_t size;
     uint8_t  reserve[MSPROF_DP_DATA_RESERVE_BYTES];
 };

 /**
 * @brief struct of data reported by HCCL
 *
 * Every struct between "#pragma pack(4)" and the closing "#pragma pack()" is laid
 * out with member alignment capped at 4 bytes, so 64-bit and double members may
 * sit on 4-byte boundaries.
 */
 #pragma pack(4)
 // Arguments of a notify task.
 struct MsprofHcclProfNotify {
     uint32_t taskID;
     uint64_t notifyID;
     uint32_t stage;
     uint32_t remoteRank;
     uint32_t transportType;
     uint32_t role; // role {0: dst, 1:src}
     double durationEstimated;
 };

 // Arguments of a reduce task.
 struct MsprofHcclProfReduce {
     uint32_t taskID;
     uint64_t src;
     uint64_t dst;
     uint64_t size;
     uint32_t op;       // {0: sum, 1: mul, 2: max, 3: min}
     uint32_t dataType; // data type {0: INT8, 1: INT16, 2: INT32, 3: FP16, 4:FP32, 5:INT64, 6:UINT64}
     uint32_t linkType; // link type {0: 'OnChip', 1: 'HCCS', 2: 'PCIe', 3: 'RoCE'}
     uint32_t remoteRank;
     // NOTE(review): this mapping lists SDMA as 0 and RDMA as 1, while the RDMA and
     // Memcpy structs below list RDMA as 0 and SDMA as 1 - confirm which is correct.
     uint32_t transportType; //  transport type {0: SDMA, 1: RDMA, 2:LOCAL}
     uint32_t role;          // role {0: dst, 1:src}
     double durationEstimated;
 };

 // Arguments of an RDMA task.
 struct MsprofHcclProfRDMA {
     uint32_t taskID;
     uint64_t src;
     uint64_t dst;
     uint64_t size;
     uint64_t notifyID;
     uint32_t linkType; // link type {0: 'OnChip', 1: 'HCCS', 2: 'PCIe', 3: 'RoCE'}
     uint32_t remoteRank;
     uint32_t transportType; //  transport type {0: RDMA, 1:SDMA, 2:LOCAL}
     uint32_t role;          // role {0: dst, 1:src}
     uint32_t type;          // RDMA type {0: RDMASendNotify, 1:RDMASendPayload}
     double durationEstimated;
 };

 // Arguments of a memcpy task.
 struct MsprofHcclProfMemcpy {
     uint32_t taskID;
     uint64_t src;
     uint64_t dst;
     uint64_t size;
     uint64_t notifyID;
     uint32_t linkType; // link type {0: 'OnChip', 1: 'HCCS', 2: 'PCIe', 3: 'RoCE'}
     uint32_t remoteRank;
     uint32_t transportType; // transport type {0: RDMA, 1:SDMA, 2:LOCAL}
     uint32_t role;          // role {0: dst, 1:src}
     double durationEstimated;
 };

 // Rank and rank count for a stage/step entry.
 struct MsprofHcclProfStageStep {
     uint32_t rank;
     uint32_t rankSize;
 };

 // Flag entry; cclTag and groupName are stored as 64-bit values
 // (presumably hash ids - confirm).
 struct MsprofHcclProfFlag {
     uint64_t cclTag;
     uint64_t groupName;
     uint32_t localRank;
     uint32_t workFlowMode;
 };

 /**
 * @name MsprofHcclProfData
 * @brief struct of data reported by hccl
 *
 * Common header followed by a union holding the arguments of one task kind.
 * No discriminator field for the union is declared in this struct.
 */
 struct MsprofHcclProfData {
     // Header fields are pre-set through default member initializers (C++ only).
     uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
     uint16_t dataTag = MSPROF_HCCL_DATA_TAG;
     uint32_t planeID;
     uint32_t deviceID;
     uint32_t streamID;
     double ts;
     char name[16];
     union {
         MsprofHcclProfNotify notify;
         MsprofHcclProfReduce reduce;
         MsprofHcclProfStageStep stageStep;
         MsprofHcclProfMemcpy forMemcpy;
         MsprofHcclProfRDMA RDMA;
         MsprofHcclProfFlag flag;
     } args;
 };
 #pragma pack()

 /**
 * @name  MsprofStampInfo
 * @brief struct of data reported by msproftx
 *
 * Unlike the other records in this header, magicNumber and dataTag have no
 * default member initializers here, so the producer has to set them.
 * Declared members sum to 256 bytes; reserve0/reserve1 fill the tail.
 */
 struct MsprofStampInfo {
     uint16_t magicNumber;
     uint16_t dataTag;
     uint32_t processId;
     uint32_t threadId;
     uint32_t category;         //marker category
     uint32_t  eventType;
     int32_t payloadType;       // presumably selects which PayloadValue member is valid - confirm
     union PayloadValue         //payload info for marker
     {
         uint64_t ullValue;
         int64_t llValue;
         double dValue;
         uint32_t uiValue[2];
         int32_t iValue[2];
         float fValue[2];
     } payload;
     uint64_t startTime;
     uint64_t endTime;
     int32_t messageType;
     char message[128];
     uint8_t reserve0[4];
     uint8_t reserve1[72];
 };

 #ifdef __cplusplus
 }
 #endif

 #endif  // MSPROFILER_PROF_COMMON_H_