From f85064db077260757db51d71cc96d2e65baa8ac3 Mon Sep 17 00:00:00 2001 From: yanghaoran Date: Thu, 2 Dec 2021 21:00:03 +0800 Subject: [PATCH] add cce back --- third_party/fwkacllib/inc/cce/aicpu_engine.h | 63 + .../fwkacllib/inc/cce/aicpu_engine_struct.h | 56 + third_party/fwkacllib/inc/cce/blas_struct.h | 31 + third_party/fwkacllib/inc/cce/cce.h | 101 + third_party/fwkacllib/inc/cce/cce_def.hpp | 152 + third_party/fwkacllib/inc/cce/common/attr_list.hpp | 82 + third_party/fwkacllib/inc/cce/common/catch.hpp | 95 + third_party/fwkacllib/inc/cce/compiler_stub.h | 36 + third_party/fwkacllib/inc/cce/customize.h | 60 + third_party/fwkacllib/inc/cce/dnn.h | 23 + third_party/fwkacllib/inc/cce/dnn_base.h | 676 +++ third_party/fwkacllib/inc/cce/dnn_base_def.hpp | 994 ++++ third_party/fwkacllib/inc/cce/dnn_op.h | 4838 ++++++++++++++++++++ third_party/fwkacllib/inc/cce/dnn_struct.hpp | 23 + third_party/fwkacllib/inc/cce/dnn_struct_base.hpp | 894 ++++ third_party/fwkacllib/inc/cce/fwk_adpt_struct.h | 155 + third_party/fwkacllib/inc/cce/l2fusion_struct.hpp | 56 + .../fwkacllib/inc/cce/optimizer/fusion_engine.h | 65 + third_party/fwkacllib/inc/cce/taskdown_api.h | 54 + third_party/fwkacllib/inc/cce/taskdown_common.hpp | 108 + 20 files changed, 8562 insertions(+) create mode 100644 third_party/fwkacllib/inc/cce/aicpu_engine.h create mode 100644 third_party/fwkacllib/inc/cce/aicpu_engine_struct.h create mode 100644 third_party/fwkacllib/inc/cce/blas_struct.h create mode 100644 third_party/fwkacllib/inc/cce/cce.h create mode 100644 third_party/fwkacllib/inc/cce/cce_def.hpp create mode 100644 third_party/fwkacllib/inc/cce/common/attr_list.hpp create mode 100644 third_party/fwkacllib/inc/cce/common/catch.hpp create mode 100644 third_party/fwkacllib/inc/cce/compiler_stub.h create mode 100644 third_party/fwkacllib/inc/cce/customize.h create mode 100644 third_party/fwkacllib/inc/cce/dnn.h create mode 100644 third_party/fwkacllib/inc/cce/dnn_base.h create mode 100644 third_party/fwkacllib/inc/cce/dnn_base_def.hpp create mode 100644 third_party/fwkacllib/inc/cce/dnn_op.h create mode 100644 third_party/fwkacllib/inc/cce/dnn_struct.hpp create mode 100644 third_party/fwkacllib/inc/cce/dnn_struct_base.hpp create mode 100644 third_party/fwkacllib/inc/cce/fwk_adpt_struct.h create mode 100644 third_party/fwkacllib/inc/cce/l2fusion_struct.hpp create mode 100644 third_party/fwkacllib/inc/cce/optimizer/fusion_engine.h create mode 100644 third_party/fwkacllib/inc/cce/taskdown_api.h create mode 100644 third_party/fwkacllib/inc/cce/taskdown_common.hpp diff --git a/third_party/fwkacllib/inc/cce/aicpu_engine.h b/third_party/fwkacllib/inc/cce/aicpu_engine.h new file mode 100644 index 00000000..bc2e415f --- /dev/null +++ b/third_party/fwkacllib/inc/cce/aicpu_engine.h @@ -0,0 +1,63 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef AICPU_ENGINE_H__ +#define AICPU_ENGINE_H__ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + AE_STATUS_SUCCESS = 0, + AE_STATUS_BAD_PARAM = 1, + AE_STATUS_OPEN_SO_FAILED = 2, + AE_STATUS_GET_KERNEL_NAME_FAILED = 3, + AE_STATUS_INNER_ERROR = 4, + AE_STATUS_KERNEL_API_INNER_ERROR = 5, + AE_STATUS_END_OF_SEQUENCE = 6, + AE_STATUS_DUMP_FAILED = 7, + AE_STATUS_TASK_WAIT = 101, + AE_STATUS_RESERVED +} aeStatus_t; + +/** + * @ingroup aicpu engine + * @brief aeCallInterface: + * a interface to call a function in a op kernfel lib + * @param [in] addr void *, should be STR_KERNEL * format + * @return aeStatus_t + */ +aeStatus_t aeCallInterface(void *addr); + +/** + * @ingroup aicpu engine + * @brief aeBatchLoadKernelSo: + * a interface to load kernel so + * @param [in] loadSoNum load so number + * @param [in] soPaths load so paths + * @param [in] soNames load so names + * @return aeStatus_t + */ +aeStatus_t aeBatchLoadKernelSo(const uint32_t loadSoNum, const char *soPaths[], const char *soNames[]); + +#ifdef __cplusplus +} +#endif + +#endif // AICPU_ENGINE_H__ diff --git a/third_party/fwkacllib/inc/cce/aicpu_engine_struct.h b/third_party/fwkacllib/inc/cce/aicpu_engine_struct.h new file mode 100644 index 00000000..8c0c1847 --- /dev/null +++ b/third_party/fwkacllib/inc/cce/aicpu_engine_struct.h @@ -0,0 +1,56 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef AICPU_ENGINE_STRUCT_H__ +#define AICPU_ENGINE_STRUCT_H__ + +#include "fwk_adpt_struct.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + The different framwork we adapted for. +*/ +typedef enum { + FMK_KERNEL_TYPE_TF = 0, + FMK_KERNEL_TYPE_CF = 10, + FMK_KERNEL_TYPE_PT = 20, + FMK_KERNEL_TYPE_RESERVED +} FwkkernelType_t; + +#pragma pack(push, 1) +typedef struct { + uint32_t fwkKernelType; // FwkkernelType_t + union { + ::aicpu::FWKAdapter::FWKOperateParam fwk_kernel; + } fwkKernelBase; +} STR_FWK_OP_KERNEL; +#pragma pack(pop) + +#pragma pack(push, 1) +struct SessionInfo { + uint64_t sessionId; + uint64_t kernelId; + bool sessFlag; +}; +#pragma pack(pop) + +#ifdef __cplusplus +} +#endif +#endif // AICPU_ENGINE_STRUCT_H__ diff --git a/third_party/fwkacllib/inc/cce/blas_struct.h b/third_party/fwkacllib/inc/cce/blas_struct.h new file mode 100644 index 00000000..e0bcee4c --- /dev/null +++ b/third_party/fwkacllib/inc/cce/blas_struct.h @@ -0,0 +1,31 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CC_BLAS_STRUCT_API__ +#define CC_BLAS_STRUCT_API__ + +#include + +typedef enum { CCBLAS_FILL_MODE_LOWER = 0, CCBLAS_FILL_MODE_UPPER = 1 } ccblasFillMode_t; + +typedef enum { + CCBLAS_OP_N = 0, + CCBLAS_OP_T = 1, +} ccblasOperation_t; + +typedef enum { CCBLAS_DIAG_NON_UNIT = 0, CCBLAS_DIAG_UNIT = 1 } ccblasDiagType_t; + +#endif // CC_BLAS_STRUCT_API__ diff --git a/third_party/fwkacllib/inc/cce/cce.h b/third_party/fwkacllib/inc/cce/cce.h new file mode 100644 index 00000000..0cd9613a --- /dev/null +++ b/third_party/fwkacllib/inc/cce/cce.h @@ -0,0 +1,101 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CCE_H__ +#define CCE_H__ + +#include +#include "cce_def.hpp" + +namespace cce { + +/** + * @ingroup cce + * @brief create cc handler + * @param [in|out] handle point of cc handler + * @return ccStatus_t + */ +ccStatus_t ccCreate(ccHandle_t *handle); + +/** + * @ingroup cce + * @brief destroy cc handler + * @param [in] *handle cc handler + * @return ccStatus_t + */ +ccStatus_t ccDestroy(ccHandle_t *handle); + +/** + * @ingroup cce + * @brief bind stream with specified cc handler + * @param [in] handle cc handler + * @param [in] streamId stream + * @return ccStatus_t + */ +ccStatus_t ccSetStream(ccHandle_t handle, rtStream_t streamId); + +/** + * @ingroup cce + * @brief get the stream from cc handler + * @param [in] handle cc handler + * @param [in|out] streamId point of stream + * @return ccStatus_t + */ +ccStatus_t ccGetStream(ccHandle_t handle, rtStream_t *streamId); + +/** + * @ingroup cce + * @brief get the stream from cc handler + * @param [in] dataTypeTransMode mode of data type transform + * @param [in] inputData input data point + * @param [in] inputDataSize input data size + * @param [in|out] outputData output data point + * @param [in] outputDataSize output data size + * @return ccStatus_t + */ +ccStatus_t ccTransDataType(ccDataTypeTransMode_t dataTypeTransMode, const void *inputData, uint32_t inputDataSize, + void *outputData, const uint32_t outputDataSize); +/** + * @ingroup cce + * @brief cce sys init func + */ +void cceSysInit(); + +/** + * @ingroup cce + * @brief cce Log Start up func + */ +void cceLogStartup(); + +/** + * @ingroup cce + * @brief cce Log Shut down func + */ +void cceLogShutdown(); + +/** + * @ingroup cce + * @brief set the profiling on or off + * @param [in] const unsigned char* target: The engine gets it from ENV. Don't need care about it. + * @param const char* job_ctx: identifies profiling job + * @param [in] uint32_t flag: value: 0, on ; 1, off. + * @return ccStatus_t value: 0, success; 1, fail. 
+ */ +ccStatus_t CceProfilingConfig(const char *target, const char *job_ctx, uint32_t flag); + +}; // namespace cce + +#endif // CCE_H__ diff --git a/third_party/fwkacllib/inc/cce/cce_def.hpp b/third_party/fwkacllib/inc/cce/cce_def.hpp new file mode 100644 index 00000000..7b1a1b8a --- /dev/null +++ b/third_party/fwkacllib/inc/cce/cce_def.hpp @@ -0,0 +1,152 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CCE_DEF_H__ +#define CCE_DEF_H__ + +#include "runtime/rt.h" + +namespace cce { + +/** + * @ingroup cce + * @brief memory configure for fusion + */ +typedef struct TagCceFusionMemCfg { + uint64_t memAddr; /**< memAddr */ + uint32_t memSize; /**< memSize */ + uint32_t addrChangeFlag; /**< op data addr change flag. value:0,valid;1,not valid */ + uint32_t poolFlag; /**< mempool flag : value:0,is valid; value: 1, not valid */ + TagCceFusionMemCfg() { + memAddr = 0; + memSize = 0; + addrChangeFlag = 0; + poolFlag = 0; + } +} CceFusionMemCfg_t; +/** + * @ingroup cce + * @brief return value + */ +typedef enum tagCcStatus { + CC_STATUS_SUCCESS = 0, /**< succ */ + CC_STATUS_NOT_INITIALIZED = 1, /**< not init */ + CC_STATUS_ALLOC_FAILED = 2, /**< alloc mem failed */ + CC_STATUS_BAD_PARAM = 3, /**< para check failed */ + CC_STATUS_INTERNAL_ERROR = 4, /**< internal error */ + CC_STATUS_KERNEL_ERROR = 5, /**< kernel error */ + CC_STATUS_RUNTIME_ERROR = 6, /**< runtime error */ + CC_STATUS_NOT_SUPPORTED = 7, /**< unsupport error */ + CC_STATUS_INVALID_VALUE = 7, /**< invalid value error for blas*/ + CC_STATUS_RESERVED /**< just for check */ +} ccStatus_t; + +/** + * @ingroup cce + * @brief original data type + */ +typedef enum tagCcDataType { + CC_DATA_FLOAT = 0, /**< float type */ + CC_DATA_HALF, /**< fp16 type */ + CC_DATA_INT8, /**< int8 type */ + CC_DATA_INT32, /**< int32 type */ + CC_DATA_UINT8, /**< uint8 type */ + CC_DATA_HALF_UINT16_PROPOSAL, /** +#include + +#define ERROR_CODE() __catch_error_code +#define ERROR_LINE_NO() __catch_error_line_no +#define ERROR_PROC() __catch_error_line_no = __LINE__; + +#define PROC \ + uint32_t __catch_error_code = 0x7FFFFFCC; \ + uint32_t __catch_error_line_no = 0xFFFFFFFF; \ + { +#define END_PROC \ + } \ + __tabErrorCode: +#define THROW(errcode) \ + { \ + __catch_error_code = (errcode); \ + ERROR_PROC(); \ + goto __tabErrorCode; \ + } +#define EXEC(func) \ + { \ + if (0 != (__catch_error_code = (func))) THROW(__catch_error_code) \ + } +#define EXEC_EX1(func, error_code) \ + { \ + if (0 != (func)) THROW(error_code) \ + } +#define EXEC_EX(func, succRet, error_code) \ + { \ + if (succRet != (__catch_error_code = (func))) THROW(error_code) \ + } +#define ASSERT_EXEC(func, succRet) \ + { \ + if (succRet != (__catch_error_code = (func))) /*GO_ASSERT_FALSE();*/ \ + THROW(__catch_error_code) \ + } \ + } +#define NEW_ERROR_EXEC(errcode, func, succRet) \ + { \ + if (succRet != (func)) { \ + THROW(errcode) \ + } \ + } +#define JUDGE(errcode, expr) \ + { \ + if (!(expr)) { \ + 
THROW(errcode) \ + } \ + } +#define ASSERT_JUDGE(errcode, expr) \ + { \ + if (!(expr)) { /*GO_ASSERT_FALSE();*/ \ + THROW(errcode) \ + } \ + } +#define JUDGE_FALSE(errcode, expr) \ + { \ + if (expr) { \ + THROW(errcode) \ + } \ + } +#define JUDGE_CONTINUE(expr) \ + { \ + if (expr) { \ + continue; \ + } \ + } +#define CATCH_ERROR(errcode) if (__catch_error_code == (errcode)) { // ERROR_LOG(); +#define CATCH_ALL_ERROR { +#define END_CATCH_ERROR } +#define FINAL \ + __tabFinal: +#define END_FINAL /*GO_ASSERT_FALSE()*/ ; +#define GOTO_FINAL() goto __tabFinal; +#endif // CATCH_HPP_ diff --git a/third_party/fwkacllib/inc/cce/compiler_stub.h b/third_party/fwkacllib/inc/cce/compiler_stub.h new file mode 100644 index 00000000..00ea467e --- /dev/null +++ b/third_party/fwkacllib/inc/cce/compiler_stub.h @@ -0,0 +1,36 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef COMPILER_STUB_H__ +#define COMPILER_STUB_H__ + +namespace cce { + +/** + * @ingroup cce + * @brief compiler stub init func + */ +bool compilerStubInit(); + +/** + * @ingroup cce + * @brief compiler stub free func + */ +bool compilerStubFree(); + +}; // namespace cce + +#endif // COMPILER_STUB_H__ diff --git a/third_party/fwkacllib/inc/cce/customize.h b/third_party/fwkacllib/inc/cce/customize.h new file mode 100644 index 00000000..7dd97af1 --- /dev/null +++ b/third_party/fwkacllib/inc/cce/customize.h @@ -0,0 +1,60 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CC_CUSTOMIZE_API__ +#define CC_CUSTOMIZE_API__ + +#include + +#define CC_DEVICE_DIM_MAX 8 +typedef enum tagOpTensorFormat +{ + OP_TENSOR_FORMAT_NC1HWC0 = 0, + OP_TENSOR_FORMAT_ND, + OP_TENSOR_FORMAT_RESERVED, + +} opTensorFormat_t; + + +typedef enum tagOpDataType +{ + OP_DATA_FLOAT = 0, /**< float type */ + OP_DATA_HALF, /**< fp16 type */ + OP_DATA_INT8, /**< int8 type */ + OP_DATA_INT32, /**< int32 type */ + OP_DATA_UINT8, /**< uint8 type */ + OP_DATA_HALF_UINT16_PROPOSAL, /**dimCnt, xDesc->dimCnt) + * @param [in] num the number of outputs + * @param [in] beta scaling factors + * @param [in] yDescArr descriptors of output tensors + * @param [in|out] yArr output data array in device memory + * @return ccStatus_t + */ +ccStatus_t ccSplitForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + int32_t axis, uint32_t num, const void *beta, const ccTensorDescriptor_t yDescArr[], + void *yArr[]); + +/** + * @ingroup dnn + * @brief get the output dimensions info of split + * @param [in] xDesc descriptor of input tensor + * @param [in] axis the dimension along which to split. Must be in the range [-xDesc->dimCnt, xDesc->dimCnt) + * @param [in] num the number of outputs + * @param [in] sizes Optional, used to specify the sizes of each output tensor along split dim. The tensor x would + * be split evenly along split dim if sizes is NULL + * @param [in|out] nArr point to the first element of batch sizes + * @param [in|out] cArr point to the first element of channels + * @param [in|out] hArr point to the first element of heights of feature map + * @param [in|out] wArr point to the first element of widths of feature map + * @return ccStatus_t + */ +ccStatus_t ccGetSplitForwardOutputDim(const ccTensorDescriptor_t xDesc, int32_t axis, uint32_t num, + const uint32_t sizes[], uint32_t nArr[], uint32_t cArr[], uint32_t hArr[], + uint32_t wArr[]); + +/** + * @ingroup dnn + * @brief Get split output shape(s). + * @param [in] xDesc input tensor, support ND and NC1HWC0 + * @param [in] axis split axis, negtive axis will increased by dimCnt once time. + * @param [in] num splited nums. + * @param [in] sizes splited dim size on axis. if NULL was set, The input will be divided into num equally. + * @param [output] dimCnt splited dimCnt array. One to one correspondence with the splited output. + * @param [output] dim array of splited dim array. One to one correspondence with the splited output. 
+ * @param [in| dimlen length of dim(Pass in the length of the entire space pointed to by dim, + not just the length of the dim array, because dim is a level 2 array + dimlen = lengthof dim[][], not just lengthof dim[]) + * @return ccStatus_t + */ +ccStatus_t ccGetSplitForwardOutputDim(const ccTensorDescriptor_t xDesc, int32_t axis, uint32_t num, + const uint32_t sizes[], int32_t *dimCnt, int32_t *dim[], int32_t dimLen); + +/** + * @ingroup dnn + * @brief create weight compress info + * @param [in|out] compressInfo point to CompressInfo + * @return ccStatus_t + */ +ccStatus_t ccCreateWeightCompressInfo(ccWeightCompressInfo_t **compressInfo); + +/** + * @ingroup dnn + * @brief destory weight compress info + * @param [in] *compressInfo point to CompressInfo + * @return ccStatus_t + */ +ccStatus_t ccDestroyWeightCompressInfo(ccWeightCompressInfo_t **compressInfo); + +/** + * @ingroup dnn + * @brief create compress table + * @param [in|out] compressTab point to weight compress table + * @return ccStatus_t + */ +ccStatus_t ccCreateWeightCompressTab(ccWeightCompressTab_t **compressTab); + +/** + * @ingroup dnn + * @brief destory compress table + * @param [in] compressTab point to weight compress table + * @return ccStatus_t + */ +ccStatus_t ccDestroyWeightCompressTab(ccWeightCompressTab_t **compressTab); + +/** + * @ingroup dnn + * @brief get fc compress info + * @param [in] xDesc descriptor of input tensor + * @param [in] wDesc descriptor of weight tensor + * @param [in] biasDesc descriptor of bias tensor + * @param [in] dataTypeTransmode mode of data type transform + * @param [in] weightCompressInfo compress info, compute based on tiling method + * @param [in|out] outputSize output data size in byte + * @param [in|out] infoTabSize compress info table + * @return ccStatus_t + */ +ccStatus_t ccGetCompressedFcWeightInfo(const ccTensorDescriptor_t xDesc, const ccFilterDescriptor_t wDesc, + const ccTensorDescriptor_t biasDesc, ccDataTypeTransMode_t dataTypeTransmode, + ccWeightCompressInfo_t *weightCompressInfo, uint32_t *outputSize, + uint32_t *infoTabSize); +/** + * @ingroup dnn + * @brief compress fc + * @param [in] wDesc descriptor of weight tensor + * @param [in] w filter data in device memory + * @param [in] weightCompressInfo compress info, compute based on tiling method + * @param [in] dataTypeTransmode mode of data type transform + * @param [in|out] y output data in device memory + * @param [in] ySize transformed data size in byte + * @param [in|out] yCompressedSize compressed output data size in byte + * @param [in|out] infoTab compressed info table + * @param [in] infoTabSize compressed info table size in byte + * @return ccStatus_t + */ +ccStatus_t ccCompressWeight(const ccFilterDescriptor_t wDesc, const void *w, + const ccWeightCompressInfo_t *weightCompressInfo, ccDataTypeTransMode_t dataTypeTransmode, + ccFilterDescriptor_t yDesc, void *y, uint32_t ySize, uint32_t *yCompressedSize, + void *infoTab, uint32_t infoTabSize); + +/** + * @ingroup dnn + * @brief restore compressed fc data + * @param [in] x input data in device memory + * @param [in] xSizeInBytes input compressed weight data size in byte + * @param [in|out] y output data in device memory + * @param [in] ySizeInBytes output data size in byte + * @return ccStatus_t + */ +ccStatus_t ccRestoreCompressedWeight(const void *x, uint32_t xSizeInBytes, void *y, uint32_t ySizeInBytes, + rtMemcpyKind_t kind); + +/** + * @ingroup dnn + * @brief create quantize parameters struct + * @param [in|out] quantizeInfo descriptor of quantize 
parameters + * @return ccStatus_t + */ +ccStatus_t ccCreateQuantizeInfoTab(ccQuantizeDescriptor_t *quantizeInfo); + +/** + * @ingroup dnn + * @brief destroy quantize parameters struct + * @param [in] quantizeInfo descriptor of quantize parameters + * @return ccStatus_t + */ +ccStatus_t ccDestoryQuantizeInfoTab(ccQuantizeDescriptor_t *quantizeInfo); + +/** + * @ingroup dnn + * @brief set quantize parameters + * @param [in] quantizeInfo descriptor of quantize parameters + * @param [in] scaleValMode enmu type for quantize scale value type (normal or sqrt) + * @param [in] scale quantize scale value + * @param [in] offset quantize offset(when quantize algorithm is half offset or full offset,this should be + * configed) + * @param [in] offsetPad padding value for load3d (only for half offset or full offset) + * @return ccStatus_t + */ +ccStatus_t ccSetQuantizeFactors(ccQuantizeDescriptor_t quantizeInfo, ccScaleValueMode_t scaleValMode, + const uint16_t *scale, const uint16_t *offset, const uint8_t *offsetPad); + +/** + * @ingroup dnn + * @brief set Requantize parameters + * @param [in] quantizeInfo descriptor of quantize parameters + * @param [in] scaleValMode enmu type for requantize scale value type (normal or sqrt) + * @param [in] scale quantize scale value + * @param [in] offset quantize offset(when quantize algorithm is half offset or full offset,this should be + * configed) + * @param [in] offsetw offset for filter (only config for full offset quantize) + * @return ccStatus_t + */ +ccStatus_t ccSetReQuantizeFactors(ccQuantizeDescriptor_t quantizeInfo, ccScaleValueMode_t scaleValMode, + const uint16_t *scaleRq, const uint16_t *nextLayerOffset, const int32_t *offsetw); + +/** + * @ingroup dnn + * @brief set Dequantize parameters + * @param [in] quantizeInfo descriptor of quantize parameters + * @param [in] scaleValMode enmu type for dequantize scale value type (normal or sqrt) + * @param [in] scaleDq quantize scale value + * @param [in] offsetw offset for filter (only config for full offset quantize) + * @return ccStatus_t + */ +ccStatus_t ccSetDeQuantizeFactors(ccQuantizeDescriptor_t quantizeInfo, ccScaleValueMode_t scaleValMode, + const uint16_t *scaleDq, const int32_t *offsetw); + +/** + * @ingroup dnn + * @brief set convolution desciptor's quantize parameters + * @param [in] convDesc convolution descriptor + * @param [in] quantizeInfo descriptor of quantize parameters + * @return ccStatus_t + */ +ccStatus_t ccSetConvolutionQuantizeInfo(ccConvolutionDescriptor_t convDesc, const ccQuantizeDescriptor_t QuantizeInfo); + +/** + * @ingroup dnn + * @brief set convolution desciptor's all offset quantize parameters + * @param [in] convDesc convolution descriptor + * @param [in] offsetw descriptor of quantize parameters + * @param [in] scaleReq descriptor of quantize parameters + * @param [in] offset_d_next descriptor of quantize parameters + * @return ccStatus_t + */ +ccStatus_t ccSetAllOffsetQuantizeFactors(ccQuantizeDescriptor_t quantizeInfo, const uint8_t *offsetW, + const uint8_t *offsetD, const uint16_t *scaleReq, const uint16_t *offsetDNext); + +/** + * @ingroup dnn + * @brief set full connection desciptor's quantize parameters + * @param [in] fcDesc full connection descriptor + * @param [in] quantizeInfo descriptor of quantize parameters + * @return ccStatus_t + */ +ccStatus_t ccSetFullConnectionQuantizeInfo(ccFullConnectionDescriptor_t fcDesc, + const ccQuantizeDescriptor_t QuantizeInfo); + +/** + * @ingroup dnn + * @brief set pooling desciptor's quantize parameters + * @param [in] 
poolingDesc pooling descriptor + * @param [in] quantizeInfo descriptor of quantize parameters + * @return ccStatus_t + */ +ccStatus_t ccSetPoolingQuantizeInfo(ccPoolingDescriptor_t poolingDesc, const ccQuantizeDescriptor_t QuantizeInfo); + +/** + * @ingroup dnn + * @brief set full connection desciptor's info table + * @param [in] fcDesc full connection descriptor + * @param [in] infoTabSize table size + * @param [in] infoTab pointer to info table + * @return ccStatus_t + */ +ccStatus_t ccSetFullConnectionDescriptor(ccFullConnectionDescriptor_t fcDesc, uint32_t infoTabSize, const void *infoTab, + ccFullConnectFwdAlgo_t algo = CC_FULLCONNECT_FWD_ALGO_HALF); + +/** + * @ingroup dnn + * @brief set full connection desciptor's relu flag + * @param [in] fcDesc full connection descriptor + * @param [in] opType operation type for append at convolution operation + * @param [in] opDesc operation descritpor for the opType + * @return ccStatus_t + */ +ccStatus_t ccFullConnectionAppendOp(ccFullConnectionDescriptor_t fcDesc, tagCcOpType opType, const void *opDesc); + +/** + * @ingroup dnn + * @brief check aipp basic info + * @param [in] inputFormat format of input image + * @param [in] loadStartPosH vertical start position in source image + * @param [in] loadStartPosW horizontal start position in source image + * @param [in] srcImageSizeH vertical size of source image + * @param [in] srcImageSizeW horizontal size of source image + * @param [in] cpaddingValue C direction padding value + * @param [in] cscSwitch csc enable or not + * @param [in] rbuvSwapSwitch swap R/U and B/V position of the image + * @param [in] axSwapSwitch swap RGBA->ARGB, YUVA->AYUV + * @param [in] singleLineMode when set this bit to 1, only read 1 line. Under this case, vertical size configuration is + * not useful. 
+ * @return ccStatus_t + */ +ccStatus_t ccCheckConvolutionAippCommInfo(ccAippInputFormat_t inputFormat, int32_t loadStartPosW, int32_t loadStartPosH, + int32_t srcImageSizeW, int32_t srcImageSizeH, float cpaddingValue, + bool cscSwitch, bool rbuvSwapSwitch, bool axSwapSwitch, bool singleLineMode); + +/** + * @ingroup dnn + * @brief check aipp dtc info + * @param [in] dtcPixelMeanChnx Mean value for YUV or RGB data channel x + * @param [in] dtcPixelMinChnx Min value for YUV or RGB data channel x + * @param [in] dtcPixelVarReciChnx Reciprocal of variance or (max-min) for YUV or RGB data channel x + * @return ccStatus_t + */ +ccStatus_t ccCheckConvolutionAippDtcInfo(int32_t dtcPixelMeanChn0, int32_t dtcPixelMeanChn1, int32_t dtcPixelMeanChn2, + float dtcPixelMinChn0, float dtcPixelMinChn1, float dtcPixelMinChn2, + float dtcPixelVarReciChn0, float dtcPixelVarReciChn1, + float dtcPixelVarReciChn2); + +/** + * @ingroup dnn + * @brief check aipp pad info + * @param [in] paddingMode padding mode + * @param [in] leftPaddingSize left hblank/padding size + * @param [in] rightPaddingSize right hblank/padding size + * @param [in] topPaddingSize top padding size + * @param [in] bottomPaddingSize bottom padding size + * @return ccStatus_t + */ +ccStatus_t ccCheckConvolutionAippPadInfo(ccAippPaddingMode_t paddingMode, int32_t leftPaddingSize, + int32_t rightPaddingSize, int32_t topPaddingSize, int32_t bottomPaddingSize); + +/** + * @ingroup dnn + * @brief check aipp csc info + * @param [in] cscMatrixRmCn 3x3 CSC matrix for YUV to RGB or RGB to YUV, element of row m and column n + * @param [in] cscOutputBiasm output Bias for RGB to YUV, element of row m + * @param [in] cscInputBiasm input Bias for YUV to RGB, element of row m + * @return ccStatus_t + */ +ccStatus_t ccCheckConvolutionAippCscInfo(int32_t cscMatrixR0C0, int32_t cscMatrixR0C1, int32_t cscMatrixR0C2, + int32_t cscMatrixR1C0, int32_t cscMatrixR1C1, int32_t cscMatrixR1C2, + int32_t cscMatrixR2C0, int32_t cscMatrixR2C1, int32_t cscMatrixR2C2, + int32_t cscOutputBias0, int32_t cscOutputBias1, int32_t cscOutputBias2, + int32_t cscInputBias0, int32_t cscInputBias1, int32_t cscInputBias2); + +/** + * @ingroup dnn + * @brief check aipp scf info + * @param [in] scfSwitch scaling enable or not + * @param [in] scfInputW input width of scaling + * @param [in] scfInputH input height of scaling + * @param [in] scfOutputW output width of scaling + * @param [in] scfOutputH output height of scaling + * @return ccStatus_t + */ +ccStatus_t ccCheckConvolutionAippScfInfo(bool scfSwitch, int32_t scfInputW, int32_t scfInputH, int32_t scfOutputW, + int32_t scfOutputH); + +/** + * @ingroup dnn + * @brief check aipp param + * @param [in] convDesc descriptor of conv operator + * @param [in] xDesc input tensor info + * @param [in] yDesc output tensor info + * @return ccStatus_t + */ +ccStatus_t ccCheckConvFwdAippParam(const ccConvolutionDescriptor_t convDesc, const ccTensorDescriptor_t xDesc, + const ccTensorDescriptor_t yDesc); + +/** + * @ingroup dnn + * @brief init aipp basic info + * @param [in|out] convDesc descriptor of conv operator + * @param [in] inputFormat format of input image + * @param [in] loadStartPosH vertical start position in source image + * @param [in] loadStartPosW horizontal start position in source image + * @param [in] srcImageSizeH vertical size of source image + * @param [in] srcImageSizeW horizontal size of source image + * @param [in] cpaddingValue C direction padding value + * @param [in] cscSwitch csc enable or not + * @param [in] 
rbuvSwapSwitch swap R/U and B/V position of the image + * @param [in] axSwapSwitch swap RGBA->ARGB, YUVA->AYUV + * @param [in] singleLineMode when set this bit to 1, only read 1 line. Under this case, vertical size configuration is + * not useful. + * @return ccStatus_t + */ +ccStatus_t ccSetConvolutionAippCommInfo(ccConvolutionDescriptor_t convDesc, ccAippInputFormat_t inputFormat, + int32_t loadStartPosW, int32_t loadStartPosH, int32_t srcImageSizeW, + int32_t srcImageSizeH, float cpaddingValue, bool cscSwitch, bool rbuvSwapSwitch, + bool axSwapSwitch, bool singleLineMode); +/** + * @ingroup dnn + * @brief init aipp dtc info + * @param [in|out] convDesc descriptor of conv operator + * @param [in] dtcPixelMeanChnx Mean value for YUV or RGB data channel x + * @param [in] dtcPixelMinChnx Min value for YUV or RGB data channel x + * @param [in] dtcPixelVarReciChnx Reciprocal of variance or (max-min) for YUV or RGB data channel x + * @return ccStatus_t + */ +ccStatus_t ccSetConvolutionAippDtcInfo(ccConvolutionDescriptor_t convDesc, int32_t dtcPixelMeanChn0, + int32_t dtcPixelMeanChn1, int32_t dtcPixelMeanChn2, float dtcPixelMinChn0, + float dtcPixelMinChn1, float dtcPixelMinChn2, float dtcPixelVarReciChn0, + float dtcPixelVarReciChn1, float dtcPixelVarReciChn2); +/** + * @ingroup dnn + * @brief init aipp pad info + * @param [in|out] convDesc descriptor of conv operator + * @param [in] paddingMode padding mode + * @param [in] leftPaddingSize left hblank/padding size + * @param [in] rightPaddingSize right hblank/padding size + * @param [in] topPaddingSize top padding size + * @param [in] bottomPaddingSize bottom padding size + * @return ccStatus_t + */ +ccStatus_t ccSetConvolutionAippPadInfo(ccConvolutionDescriptor_t convDesc, ccAippPaddingMode_t paddingMode, + int32_t leftPaddingSize, int32_t rightPaddingSize, int32_t topPaddingSize, + int32_t bottomPaddingSize); + +/** + * @ingroup dnn + * @brief init aipp csc info + * @param [in|out] convDesc descriptor of conv operator + * @param [in] cscMatrixRmCn 3x3 CSC matrix for YUV to RGB or RGB to YUV, element of row m and column n + * @param [in] cscOutputBiasm output Bias for RGB to YUV, element of row m + * @param [in] cscInputBiasm input Bias for YUV to RGB, element of row m + * @return ccStatus_t + */ +ccStatus_t ccSetConvolutionAippCscInfo(ccConvolutionDescriptor_t convDesc, int32_t cscMatrixR0C0, int32_t cscMatrixR0C1, + int32_t cscMatrixR0C2, int32_t cscMatrixR1C0, int32_t cscMatrixR1C1, + int32_t cscMatrixR1C2, int32_t cscMatrixR2C0, int32_t cscMatrixR2C1, + int32_t cscMatrixR2C2, int32_t cscOutputBias0, int32_t cscOutputBias1, + int32_t cscOutputBias2, int32_t cscInputBias0, int32_t cscInputBias1, + int32_t cscInputBias2); + +/** + * @ingroup dnn + * @brief init aipp scf info + * @param [in|out] convDesc descriptor of conv operator + * @param [in] scfSwitch scaling enable or not + * @param [in] scfInputW input width of scaling + * @param [in] scfInputH input height of scaling + * @param [in] scfOutputW output width of scaling + * @param [in] scfOutputH output height of scaling + * @return ccStatus_t + */ +ccStatus_t ccSetConvolutionAippScfInfo(ccConvolutionDescriptor_t convDesc, bool scfSwitch, int32_t scfInputW, + int32_t scfInputH, int32_t scfOutputW, int32_t scfOutputH); + +/** + * @ingroup dnn + * @brief set dynamic aipp parameter address and enflag info + * @param [in|out] convDesc descriptor of conv operator + * @param [in] dyncParaAddr aipp parameter address + * @param [in] dyncAippFlag flag to show whether to use dynamic aipp + * 
@return ccStatus_t + */ +ccStatus_t ccSetConvolutionAippDyncParaAddr(ccConvolutionDescriptor_t convDesc, const void *dyncParaAddr, + bool dyncAippFlag, bool rotationFlag = false); + +/** + * @ingroup dnn + * @brief check dynamic aipp parameter + * @param [in] dyncParaAddr aipp parameter address + * @param [in] dataLength parameter lenght + * @param [in] convolutionDimW convDimW + * @param [in] convolutionDimH convDimH + * @return ccStatus_t + */ +ccStatus_t ccCheckDynamicAippParam(const void *dynamicParamAddr, uint32_t dataLength, int64_t convolutionDimW, + int64_t convolutionDimH); + +/*** @ingroup dnn + * @brief trans mean and var + * @param [in|out] mean' = bnScale/sqrt(var) + * @param [in|out] var' = -bnScale * mean / sqrt(var) + bnBias + * @return ccStatus_t + */ + +ccStatus_t ccTransBatchnormMeanAndVar(void *mean, void *var, const ccTensorDescriptor_t bnScaleBiasMeanVarDesc, + const void *alpha, const void *beta, void *bnScale, void *bnBias, double epsilon); + +/** + * @ingroup dnn + * @brief init deconvolution adj or targetShape info. + * @param [in] convDesc conv descriptor. + * @param [in] adjH, adjust H output. + * @param [in] adjW, adjust W output. + * @param [in] targetShape, values of output shape, if this pointer was set, ignore adj. + * @return ccStatus_t + */ +ccStatus_t ccSetDeconvolutionOutShapeInfo(ccConvolutionDescriptor_t convDesc, uint32_t adjSize, const uint32_t *adj, + uint32_t targetShapeSize, const uint32_t *targetShape); + +/** + * @ingroup dnn + * @brief gather elements according to the indices. + * @param [in] alpha reserved. + * @param [in] xDesc description of the tensor from which to gather elements. + * @param [in] x data point of the tensor from which to gather elements. + * @param [in] indicesDesc description of the tensor of indices. + * @param [in] indices data point of the tensor of indices. + * @param [in] beta reserved. + * @param [in] outputDesc description of the output tensor. + * @param [output] output data point of the output tensor. + * @return ccStatus_t + */ +ccStatus_t ccGatherNdForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const ccTensorDescriptor_t indicesDesc, const void *indices, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief get output shape of gather_nd. + * @param [in] xDesc description of the tensor from which to gather elements. + * @param [in] indicesDesc description of the tensor of indices. + * @param [output] n dim-size of n-dim. + * @param [output] c dim-size of c-dim. + * @param [output] h dim-size of h-dim. + * @param [output] w dim-size of w-dim. + * @param [output] realDimCnt real dim. + * @return ccStatus_t + */ +ccStatus_t ccGetGatherNdOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t indicesDesc, int32_t *n, + int32_t *c, int32_t *h, int32_t *w, int32_t *realDimCnt); +/** + * @ingroup dnn + * @brief get output shape of realdiv. + * @param [in] xDesc description of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [output] dimCnt dim nums. + * @param [output] dim dim size. + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetGatherNdOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t indicesDesc, + int32_t *dimCnt, int32_t *dim, int32_t dimLen); +/** + * @ingroup dnn + * @brief tile tensor by multiples. + * @param [in] alpha reserved. 
+ * @param [in] xDesc description of the tensor which to be tiled. + * @param [in] x data point of the tensor which to be tiled. + * @param [in] multiples tile coefficient of each dim. + * @param [in] beta reserved. + * @param [in] outputDesc description of the output tensor. + * @param [output] output data point of the output tensor. + * @return ccStatus_t + */ +ccStatus_t ccTileForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const ccIntArray_t *multiples, const void *beta, const ccTensorDescriptor_t outputDesc, + void *output); + +/** + * @ingroup dnn + * @brief get output shape of tile. + * @param [in] xDesc description of the dividend tensor. + * @param [in] multiples multiples of each dim. + * @param [in|out] dimCnt [point to the output dimCnt] + * @param [in|out] dim [arrays to save dims] + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetTileOutputDim(const ccTensorDescriptor_t xDesc, const ccIntArray_t *multiples, int32_t *dimCnt, + int32_t dim[], int32_t dimLen); + +/** + * @ingroup dnn + * @brief get output shape of tile. + * @param [in] xDesc description of the dividend tensor. + * @param [in] multiples multiples of each dim. + * @param [output] n dim-size of n-dim. + * @param [output] c dim-size of c-dim. + * @param [output] h dim-size of h-dim. + * @param [output] w dim-size of w-dim. + * @param [output] realDimCnt real dim. + * @return ccStatus_t + */ +ccStatus_t ccGetTileOutputDim(const ccTensorDescriptor_t xDesc, + // const ccIntArrayDescriptor_t multiples, + const ccIntArray_t *multiples, int32_t *n, int32_t *c, int32_t *h, int32_t *w, + int32_t *realDimCnt); +/** + * @ingroup dnn + * @brief get output shape of realdiv. + * @param [in] xDesc description of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [output] dimCnt dim nums. + * @param [output] dim dim size. + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetRealdivOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t yDesc, int32_t *dimCnt, + int32_t *dim, int32_t dimLen); + +/** + * @ingroup dnn + * @brief realdiv between two tensors. + * @param [in] alpha reserved. + * @param [in] xDesc description of the dividend tensor. + * @param [in] x data point of the dividend tensor. + * @param [in] yDesc description of the divisor tensor. + * @param [in] y data point of the divisor tensor. + * @param [in] beta reserved. + * @param [in] outputDesc description of the output tensor. + * @param [output] output data point of the output tensor. + * @return ccStatus_t + */ +ccStatus_t ccRealdivForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const ccTensorDescriptor_t yDesc, const void *y, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief get output shape of realdiv. + * @param [in] xDesc description of the dividend tensor. + * @param [in] yDesc description of the divisor tensor. + * @param [output] n dim-size of n-dim. + * @param [output] c dim-size of c-dim. + * @param [output] h dim-size of h-dim. + * @param [output] w dim-size of w-dim. + * @param [output] realDimCnt real dim. + * @return ccStatus_t + */ +ccStatus_t ccGetRealdivOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t yDesc, int32_t *n, + int32_t *c, int32_t *h, int32_t *w, int32_t *realDimCnt); + +/** + * @ingroup dnn + * @brief realdiv between two tensors. 
+ * @param [in] alpha reserved. + * @param [in] xDesc description of the left operator tensor. + * @param [in] x data point of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [in] y data point of the right operator tensor. + * @param [in] beta reserved. + * @param [in] outputDesc description of the output tensor. + * @param [output] output data point of the output tensor. + * @return ccStatus_t + */ +ccStatus_t ccFloordivForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const ccTensorDescriptor_t yDesc, const void *y, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief get output shape of realdiv. + * @param [in] xDesc description of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [output] realDimCnt real dim. + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetFloordivOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t yDesc, int32_t *dimCnt, + int32_t *dim, int32_t dimLen); + +/** + * @ingroup dnn + * @brief realdiv between two tensors. + * @param [in] alpha reserved. + * @param [in] xDesc description of the left operator tensor. + * @param [in] x data point of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [in] y data point of the right operator tensor. + * @param [in] beta reserved. + * @param [in] outputDesc description of the output tensor. + * @param [output] output data point of the output tensor. + * @return ccStatus_t + */ +ccStatus_t ccGreaterForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const ccTensorDescriptor_t yDesc, const void *y, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief get output shape of realdiv. + * @param [in] xDesc description of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [output] dimCnt dim nums. + * @param [output] dim dim size. + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetGreaterOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t yDesc, int32_t *dimCnt, + int32_t *dim, int32_t dimLen); + +/** + * @ingroup dnn + * @brief realdiv between two tensors. + * @param [in] alpha reserved. + * @param [in] xDesc description of the left operator tensor. + * @param [in] x data point of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [in] y data point of the right operator tensor. + * @param [in] beta reserved. + * @param [in] outputDesc description of the output tensor. + * @param [output] output data point of the output tensor. + * @return ccStatus_t + */ +ccStatus_t ccLessForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const ccTensorDescriptor_t yDesc, const void *y, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief get output shape of realdiv. + * @param [in] xDesc description of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [output] dimCnt dim nums. + * @param [output] dim dim size. 
+ * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetLessOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t yDesc, int32_t *dimCnt, + int32_t *dim, int32_t dimLen); + +/** + * @ingroup dnn + * @brief get output shape of LogicalOr. + * @param [in] xDesc description of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [output] dimCnt dim nums. + * @param [output] dim dim size. + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetLogicalOrOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t yDesc, int32_t *dimCnt, + int32_t *dim, int32_t dimLen); + +/** + * @ingroup dnn + * @brief get output shape of LogicalXor. + * @param [in] xDesc description of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [output] dimCnt dim nums. + * @param [output] dim dim size. + * @param [in] dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetLogicalXorOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t yDesc, int32_t *dimCnt, + int32_t *dim, int32_t dimLen); + +/** + * @ingroup dnn + * @brief sqrt forward: + * data type only support bool + * data format only support ND + * @param [in] handle cce handle + * @param [in] alpha common scale factor + * @param [in] xDesc descriptor of input data + * @param [in] x input data in device memory + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccLogicalNotForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const void *beta, const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief equal between two tensors. + * @param [in] alpha reserved. + * @param [in] xDesc description of the left operator tensor. + * @param [in] x data point of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [in] y data point of the right operator tensor. + * @param [in] beta reserved. + * @param [in] outputDesc description of the output tensor. + * @param [output] output data point of the output tensor. + * @return ccStatus_t + */ + +ccStatus_t ccEqualForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const ccTensorDescriptor_t yDesc, const void *y, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief dump data during inference, only for eng ver. + * @param [in] handle cce handle + * @return ccStatus_t + */ +ccStatus_t ccDataDumpForward(ccHandle_t handle, const void *buffer, const uint64_t bufLen, const uint32_t taskIndex); + +/** + * @ingroup dnn + * @brief logicaland between two tensors. + * @param [in] alpha reserved. + * @param [in] xDesc description of the left operator tensor. + * @param [in] x data point of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [in] y data point of the right operator tensor. + * @param [in] beta reserved. + * @param [in] outputDesc description of the output tensor. + * @param [output] output data point of the output tensor. 
+ * @return ccStatus_t + */ +ccStatus_t ccLogicalAndForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const ccTensorDescriptor_t yDesc, const void *y, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief logical or between two tensors. + * @param [in] alpha reserved. + * @param [in] xDesc description of the left operator tensor. + * @param [in] x data point of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [in] y data point of the right operator tensor. + * @param [in] beta reserved. + * @param [in] outputDesc description of the output tensor. + * @param [output] output data point of the output tensor. + * @return ccStatus_t + */ +ccStatus_t ccLogicalOrForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const ccTensorDescriptor_t yDesc, const void *y, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); +/** + * @ingroup dnn + * @brief logical Xor between two tensors(x ^ y = (x | y) & ~(x & y). + * @param [in] alpha reserved. + * @param [in] xDesc description of the left operator tensor. + * @param [in] x data point of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [in] y data point of the right operator tensor. + * @param [in] beta reserved. + * @param [in] outputDesc description of the output tensor. + * @param [output] output data point of the output tensor. + * @return ccStatus_t + */ +ccStatus_t ccLogicalXorForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const ccTensorDescriptor_t yDesc, const void *y, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief get output shape of equal. + * @param [in] xDesc description of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [output] dimCnt dim nums. + * @param [output] dim dim size. + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetEqualOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t yDesc, int32_t *dimCnt, + int32_t *dim, int32_t dimLen); +/** + * @ingroup dnn + * @brief get output shape of logicaland. + * @param [in] xDesc description of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [output] dimCnt dim nums. + * @param [output] dim dim size. + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetLogicalAndOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t yDesc, int32_t *dimCnt, + int32_t *dim, int32_t dimLen); +/** + * @ingroup dnn + * @brief realdiv between two tensors. + * @param [in] alpha reserved. + * @param [in] xDesc description of the left operator tensor. + * @param [in] x data point of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [in] y data point of the right operator tensor. + * @param [in] beta reserved. + * @param [in] outputDesc description of the output tensor. + * @param [output] output data point of the output tensor. 
+ * @return ccStatus_t + */ +ccStatus_t ccFloormodForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const ccTensorDescriptor_t yDesc, const void *y, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief get output shape of realdiv. + * @param [in] xDesc description of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [output] dimCnt dim nums. + * @param [output] dim dim size. + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetFloormodOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t yDesc, int32_t *dimCnt, + int32_t *dim, int32_t dimLen); + +/** + * @ingroup dnn + * @brief compare between two tensors. + * @param [in] alpha reserved. + * @param [in] xDesc description of the left operator tensor. + * @param [in] x data point of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [in] y data point of the right operator tensor. + * @param [in] beta reserved. + * @param [in] outputDesc description of the output tensor. + * @param [output] output data point of the output tensor. + * @return ccStatus_t + */ +ccStatus_t ccCompareForward(ccHandle_t handle, ccCompareType_t compareType, const void *alpha, + const ccTensorDescriptor_t xDesc, const void *x, const ccTensorDescriptor_t yDesc, + const void *y, const void *beta, const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief get output shape of realdiv. + * @param [in] xDesc description of the left operator tensor. + * @param [in] yDesc description of the right operator tensor. + * @param [output] dimCnt dim nums. + * @param [output] dim dim size. + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetCompareOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t yDesc, int32_t *dimCnt, + int32_t *dim, int32_t dimLen); + +/** + * @ingroup dnn + * @brief create descriptor of FillParam + * @param [in|out] fillParamDesc point to descriptor of fill param + * @return ccStatus_t + */ +ccStatus_t ccCreateFillParamDescriptor(ccFillParamDescriptor_t *fillParamDesc); + +/** + * @ingroup dnn + * @brief destroy descriptor of FillParam + * @param [in] *fillParamDesc point to descriptor of fill param + * @return ccStatus_t + */ +ccStatus_t ccDestroyFillParamDescriptor(ccFillParamDescriptor_t *fillParamDesc); + +/** + * @ingroup dnn + * @brief get output shape of broadcat operations. + * @param [in] inputNum input number of the operation tensors. + * @param [in] xDesc[] description of the input operation tensors list. + * @param [output] dimCnt dim-size of output tensor. + * @param [output] dim dim of output tensor. + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetMultiNdBroadcastOpOutputDim(const int32_t inputNum, const ccTensorDescriptor_t xDesc[], int32_t *dimCnt, + int32_t *dim, int32_t dimLen); + +/** + * @ingroup dnn + * @brief get output shape of maximultitensor. + * @param [in] inputNum the num of input operator tensors. + * @param [in] xDesc[] description of the input operator tensors list. + * @param [output] dimCnt dim count of output tensor. + * @param [output] dim array of output tensor. 
+ * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetMaxMultitensorOutputDim(const int32_t inputNum, const ccTensorDescriptor_t xDesc[], int32_t *dimCnt, + int32_t *dim, int32_t dimLen); + +/** + * @ingroup dnn + * @brief get output shape of minmultitensor. + * @param [in] inputNum the num of input operator tensors. + * @param [in] xDesc[] description of the input operator tensors list. + * @param [output] dimCnt dim count of output tensor. + * @param [output] dim array of output tensor. + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetMinMultitensorOutputDim(const int32_t inputNum, const ccTensorDescriptor_t xDesc[], int32_t *dimCnt, + int32_t *dim, int32_t dimLen); + +/** + * @ingroup dnn + * @brief MaxMultitensor forward: + * data type only support float float16 and int32 + * data format only support ND + * @param [in] handle cce handle + * @param [in] inputNum input tensor number + * @param [in] alpha common scale factor + * @param [in] xDesc[] descriptor of input tensors list + * @param [in] x[] input data in device memory list + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccMaxMultitensorForward(const ccHandle_t handle, const int32_t inputNum, const void *alpha, + const ccTensorDescriptor_t xDesc[], const void *x[], const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief MinMultitensor forward: + * data type only support float float16 and int32 + * data format only support ND + * @param [in] handle cce handle + * @param [in] inputNum input tensor number + * @param [in] alpha common scale factor + * @param [in] xDesc[] descriptor of input data list + * @param [in] x[] input data in device memory list + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccMinMultitensorForward(const ccHandle_t handle, const int32_t inputNum, const void *alpha, + const ccTensorDescriptor_t xDesc[], const void *x[], const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief create descriptor of StridedSlice + * @param [in|out] stridedSliceDesc point to descriptor of StridedSlice param + * @return ccStatus_t + */ +ccStatus_t ccCreateStridedSliceDescriptor(ccStridedSliceDescriptor_t *stridedSliceDesc); + +/** + * @ingroup dnn + * @brief destroy descriptor of StridedSlice + * @param [in] *stridedSliceDesc point to descriptor of StridedSlice param + * @return ccStatus_t + */ +ccStatus_t ccDestroyStridedSliceDescriptor(ccStridedSliceDescriptor_t *stridedSliceDesc); + +/** + * @ingroup dnn + * @brief init stridedSlice descriptor_t. 
+ * @param [out] stridedSliceDesc struct of stridedslice param + * @param [in] dimCnt dimension of the input tensor + * @param [in] begin slice begin(include) + * @param [in] end slice end index(not include) + * @param [in] strides slice stride + * @return ccStatus_t + */ +ccStatus_t ccSetStridedSliceDescriptor(ccStridedSliceDescriptor_t stridedSliceDesc, int32_t dimCnt, int32_t begin[], + int32_t end[], int32_t strides[]); + +/** + * @ingroup dnn + * @brief create descriptor of StridedSlice + * @param [in|out] stridedSliceDesc point to descriptor of StridedSlice attr + * @return ccStatus_t + */ +ccStatus_t ccCreateStridedSliceAttrsDescriptor(ccStridedSliceAttrsDescriptor_t *attrDesc); + +/** + * @ingroup dnn + * @brief destroy descriptor of StridedSlice + * @param [in] *stridedSliceDesc point to descriptor of StridedSlice attr + * @return ccStatus_t + */ +ccStatus_t ccDestroyStridedSliceAttrsDescriptor(ccStridedSliceAttrsDescriptor_t *attrDesc); + +/** + * @ingroup dnn + * @brief init stridedSlice mask attrs desescriptor. + * @param [out] attrDesc struct of stridedslice mask attrs + * @param [in] beginMask begin mask + * @param [in] endMask end mask + * @param [in] ellipsisMask ellipsis mask + * @param [in] newAxisMask new axis mask + * @param [in] shrinkAxisMask shrink axis mask + * @return ccStatus_t + */ +ccStatus_t ccSetStridedSliceAttrsDescriptor(ccStridedSliceAttrsDescriptor_t attrDesc, int32_t beginMask, + int32_t endMask, int32_t ellipsisMask, int32_t newAxisMask, + int32_t shrinkAxisMask); + +/** + * @ingroup dnn + * @brief Extracts a strided slice of a tensor. + * @param [in] xDesc descriptor of input data + * @param [in] stridedSliceDesc specifies the begin, end, strides of slice + * @param [in] attrDesc reserve for optional attributes. + * @param [out] n point to n size + * @param [out] c point to c size + * @param [out] h point to h size + * @param [out] w point to w size + * @return ccStatus_t + */ +ccStatus_t ccGetStridedSliceOutputDim(const ccTensorDescriptor_t xDesc, + const ccStridedSliceDescriptor_t stridedSliceDesc, + const ccStridedSliceAttrsDescriptor_t attrDesc, int32_t *n, int32_t *c, + int32_t *h, int32_t *w, int32_t *realDimCnt); + +/** + * @ingroup dnn + * @brief Extracts a strided slice of a tensor. + * @param [in] handle cce handle + * @param [in] stridedSliceDesc specifies the begin, end, strides of slice + * @param [in] attrDesc reserve for optional attributes. + * @param [in] alpha common scale factor + * @param [in] xDesc descriptor of input data + * @param [in] x input data in device memory + * @param [in] beta common scale factor + * @param [in] yDesc descriptor of output data + * @param [in|out] y output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccStridedSliceForward(ccHandle_t handle, const ccStridedSliceDescriptor_t stridedSliceDesc, + const ccStridedSliceAttrsDescriptor_t attrDesc, const void *alpha, + const ccTensorDescriptor_t xDesc, const void *x, const void *beta, + const ccTensorDescriptor_t yDesc, void *y); + +/** + * @ + * @brief get out put descrition of slice tensor. 
+ * @param [in] xDesc descriptor of input data + * @param [in] begin begin position of tensor + * @param [in] size size to slice + * @param [out] n point to n size + * @param [out] c point to c size + * @param [out] h point to h size + * @param [out] w point to w size + * @param [out] realDimCnt realdim count + * @return ccStatus_t + */ +ccStatus_t ccGetSliceOutputDim(const ccTensorDescriptor_t xDesc, const ccIntArray_t *begin, const ccIntArray_t *size, + int32_t *n, int32_t *c, int32_t *h, int32_t *w, int32_t *realDimCnt); + +/** + * @ingroup dnn + * @brief slice of a tensor. + * @param [in] handle cce handle + * @param [in] alpha common scale factor + * @param [in] xDesc descriptor of input data + * @param [in] x input data in device memory + * @param [in] begin begin position of tensor + * @param [in] size size to slice + * @param [in] beta common scale factor + * @param [in] yDesc descriptor of output data + * @param [in|out] y output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccSliceForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const ccIntArray_t *begin, const ccIntArray_t *size, const void *beta, + const ccTensorDescriptor_t yDesc, void *y); + +/** + * @ingroup dnn + * @brief gather forward computation + * @param [in] handle cce handle + * @param [in] paramsDesc descriptor of params tensor + * @param [in] params input data in device memory + * @param [in] indicesDesc descriptor of indices tensor + * @param [in] indices indices data in device memory + * @param [in] axis descriptor of roi tensor + * @param [in] alpha reserved + * @param [in] beta reserved + * @param [in] outputDesc descriptor of output tensor + * @param [out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccGatherForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t paramsDesc, + const void *params, const ccTensorDescriptor_t indicesDesc, const void *indices, + const int32_t axis, const void *beta, ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief gather output dim computation, for NC1HWC0 + * @param [in] paramsDesc descriptor of params tensor + * @param [in] indicesDesc descriptor of indices tensor + * @param [in] axis descriptor of roi tensor + * @param [out] n dim of n + * @param [out] c dim of c + * @param [out] h dim of h + * @param [out] w dim of w + * @param [out] realDimCnt real dim count + * @return ccStatus_t + */ +ccStatus_t ccGetGatherOutputDim(const ccTensorDescriptor_t paramsDesc, const ccTensorDescriptor_t indicesDesc, + int32_t axis, int32_t *n, int32_t *c, int32_t *h, int32_t *w, int32_t *realDimCnt); + +/** + * @ingroup dnn + * @brief gather output dim computation + * @param [in] paramsDesc descriptor of params tensor + * @param [in] indicesDesc descriptor of indices tensor + * @param [in] axis descriptor of roi tensor + * @param [out] dimCnt dimcnt of output + * @param [out] dim dim of output + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetGatherOutputDim(const ccTensorDescriptor_t paramsDesc, const ccTensorDescriptor_t indicesDesc, + int32_t axis, int32_t *dimCnt, int32_t dim[], int32_t dimLen); + +/** + * @ingroup dnn + * @brief exp forward computation + * @param [in] handle cce handle + * @param [in] expDesc descriptor of expParam + * @param [in] expParam a ternary array + * @param [in] alpha reserved parameter + * @param [in] xDesc descriptor of input tensor + * @param [in] x input data in device memory + * @param 
[in] beta reserved parameter + * @param [in] yDesc descriptor of output tensor + * @param [out] y output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccExpForward(ccHandle_t handle, const ccExpDescriptor_t expDesc, const void *expParam, const void *alpha, + const ccTensorDescriptor_t xDesc, const void *x, const void *beta, + const ccTensorDescriptor_t yDesc, void *y); + +/** + * @ingroup dnn + * @brief expm1 forward: + * data type only support float float16 and double + * data format only support ND + * @param [in] handle cce handle + * @param [in] alpha common scale factor + * @param [in] xDesc descriptor of input data + * @param [in] x input data in device memory + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccExpm1Forward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const void *beta, const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief log1p forward: + * data type only support float float16 and double + * data format only support ND + * @param [in] handle cce handle + * @param [in] alpha common scale factor + * @param [in] xDesc descriptor of input data + * @param [in] x input data in device memory + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccLog1pForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const void *beta, const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief init descriptor for parameter of exp function + * @param [in|out] powDesc descriptor of tensor + * @param [in] dataType data type in device + * @param [in] paramCnt number of parameters + * @return ccStatus_t + */ +ccStatus_t ccSetExpDescriptor(ccExpDescriptor_t expDesc, ccDataType_t dataType, uint32_t paramCnt); + +/** + * @ingroup dnn + * @brief exp forward computation + * @param [in] handle cce handle + * @param [in] logDesc descriptor of logParam + * @param [in] logParam a ternary array + * @param [in] alpha reserved parameter + * @param [in] xDesc descriptor of input tensor + * @param [in] x input data in device memory + * @param [in] beta reserved parameter + * @param [in] yDesc descriptor of output tensor + * @param [in] y output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccLogForward(ccHandle_t handle, const ccLogDescriptor_t logDesc, const void *logParam, const void *alpha, + const ccTensorDescriptor_t xDesc, const void *x, const void *beta, + const ccTensorDescriptor_t yDesc, void *y); + +/** + * @ingroup dnn + * @brief init descriptor for parameter of log function + * @param [in|out] logDesc descriptor of tensor + * @param [in] dataType data type in device + * @param [in] paramCnt number of parameters + * @return ccStatus_t + */ +ccStatus_t ccSetLogDescriptor(ccLogDescriptor_t logDesc, ccDataType_t dataType, uint32_t paramCnt); + +/** + * @ingroup dnn + * @brief pow forward computation + * @param [in] handle cce handle + * @param [in] powDesc descriptor of logParam + * @param [in] powParam a ternary array + * @param [in] alpha reserved parameter + * @param [in] xDesc descriptor of input tensor + * @param [in] x input data in device memory + * @param [in] beta reserved parameter + * @param [in] yDesc descriptor of input tensor + * 
@param [in] y input data in device memory + * @param [in] zDesc descriptor of output tensor + * @param [out] z output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccPowForward(ccHandle_t handle, const ccPowDescriptor_t powDesc, const void *powParam, const void *alpha, + const ccTensorDescriptor_t xDesc, const void *x, const ccTensorDescriptor_t yDesc, + const void *y, const void *beta, const ccTensorDescriptor_t zDesc, void *z); + +/** + * @brief init descriptor for parameter of pow function + * @param [in|out] powDesc descriptor of tensor + * @param [in] dataType data type in device + * @param [in] paramCnt number of parameters + * @return ccStatus_t + */ +ccStatus_t ccSetPowDescriptor(ccPowDescriptor_t powDesc, ccDataType_t dataType, uint32_t paramCnt); + +/** + * @ingroup dnn + * @brief non max suppression forward. + * @param [in] handle cce handle + * @param [in] nonmaxParaDesc descriptor of para + * @param [in] nonmaxPara input para in host memory + * @param [in] maxoutputsizex input para in host memory + * @param [in] alpha common scale factor + * @param [in] boxesDesc descriptor of input data boxesDesc + * @param [in] boxes input data boxes in device memory + * @param [in] scoresDesc descriptor of input data boxesDesc + * @param [in] scores input data scores in device memory + * @param [in] workSpaceSizeInBytes workspace size + * @param [in] workSpace input workspace in device memory + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccNonMaxSuppressionForward(ccHandle_t handle, const ccNonMaxSuppressionDescriptor_t nonmaxParaDesc, + const void *nonmaxPara, const int *maxoutputsize, const void *alpha, + const ccTensorDescriptor_t boxesDesc, const void *boxes, + const ccTensorDescriptor_t scoresDesc, const void *scores, + const uint32_t workSpaceSizeInBytes, void *workSpace, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); +/** + * @brief init descriptor for parameter of NonMaxSuppression function + * @param [in|out] powDesc descriptor of tensor + * @param [in] dataType data type in device + * @param [in] paramCnt number of parameters + * @return ccStatus_t + */ +ccStatus_t ccSetNonMaxSuppressionDescriptor(ccNonMaxSuppressionDescriptor_t nonMaxSuppressionDesc, + ccDataType_t dataType, uint32_t paramCnt); + +/** + * @ingroup dnn + * @brief get the output dimension info of resizeBilinear op. + * @param [in] xDesc descriptor of input data + * @param [in] resizeBilinearDesc descriptor of resize_bilinear operator + * @param [out] dimCnt + * @param [out] dim[] dim of output + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetResizeBilinearOutputDim(const ccTensorDescriptor_t xDesc, + const ccResizeBilinearDescriptor_t resizeBilinearDesc, int32_t *dimCnt, + int32_t dim[], int32_t dimLen); + +/** + * @ingroup dnn + * @brief get the output dimension info of interp op. + * @param [in] xDesc descriptor of input data + * @param [in] resizeBilinearDesc descriptor of resize_bilinear operator + * @param [out] dimCnt + * @param [out] dim[] dim of output + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetInterpOutputDim(const ccTensorDescriptor_t xDesc, const ccResizeBilinearDescriptor_t resizeBilinearDesc, + int32_t *dimCnt, int32_t dim[], int32_t dimLen); +/** + * @ingroup dnn + * @brief resize bilinear forward for t network. 
+ * @param [in] handle cce handle + * @param [in] resizeBilinearDesc descriptor of resize_bilinear operator + * @param [in] alpha common scale factor + * @param [in] xDesc descriptor of input data + * @param [in] x input data in device memory + * @param [in] beta common scale factor + * @param [in] yDesc descriptor of output data + * @param [in|out] y output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccResizeBilinearForward(ccHandle_t handle, const ccResizeBilinearDescriptor_t resizeBilinearDesc, + const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief resize bilinear forward for c network. + * @param [in] handle cce handle + * @param [in] resizeBilinearDesc descriptor of resize_bilinear operator + * @param [in] alpha common scale factor + * @param [in] xDesc descriptor of input data + * @param [in] x input data in device memory + * @param [in] beta common scale factor + * @param [in] yDesc descriptor of output data + * @param [in|out] y output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccInterpForward(ccHandle_t handle, const ccResizeBilinearDescriptor_t resizeBilinearDesc, const void *alpha, + const ccTensorDescriptor_t xDesc, const void *x, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief create descriptor of ResizeBilinear + * @param [in|out] resizeBilinearDesc point to descriptor of resizeBilinear attr + * @return ccStatus_t + */ +ccStatus_t ccCreateResizeBilinearDescriptor(ccResizeBilinearDescriptor_t *resizeBilinearDesc); + +/** + * @ingroup dnn + * @brief destroy descriptor of Interp + * @param [in|out] resizeBilinearDesc point to descriptor of resizeBilinear attr + * @return ccStatus_t + */ +ccStatus_t ccDestroyResizeBilinearDescriptor(ccResizeBilinearDescriptor_t *resizeBilinearDesc); + +/** + * @ingroup dnn + * @brief set descriptor of resizeBilinear. 
+ * @param [in|out] resizeBilinearDesc descriptor of resize_bilinear operator + * @param [in] resizeOutputDimMode way to decide output dimensions + * @param [in] alignCorners whether the centers of input and output are aligned + * @param [in] zoom_factor zoom factor + * @param [in] shrink_factor shrink factor + * @param [in] height height of output + * @param [in] width width of output + * @param [in] pad_begin padding at begin of input + * @param [in] pad_end padding at end of input + * @return ccStatus_t + */ +ccStatus_t ccSetResizeBilinearDescriptor(ccResizeBilinearDescriptor_t resizeBilinearDesc, + ccResizeOutputDimMode_t resizeOutputDimMode, bool alignCorners, + int32_t zoom_factor, int32_t shrink_factor, int32_t height, int32_t width, + int32_t pad_begin, int32_t pad_end); + +/** + * @ingroup dnn + * @brief fill forward computation + * @param [in] handle cce handle + * @param [in] fillParamDesc descriptor of fill parameter + * @param [in] alpha reserved + * @param [in] givenDesc descriptor of given tensor + * @param [in] givenData given data in device memory + * @param [in] workspace space for fill algorithm + * @param [in] workSpaceSizeInBytes space size in byte + * @param [in] beta reserved + * @param [in] outputDesc descriptor of output tensor + * @param [out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccFillForward(ccHandle_t handle, const ccFillParamDescriptor_t fillParamDesc, const void *alpha, + const ccTensorDescriptor_t givenDesc, const void *givenData, const void *workspace, + const uint32_t workSpaceSizeInBytes, const void *beta, const ccTensorDescriptor_t outputDesc, + void *output); + +/** + * @ingroup dnn + *[ccGetFillWorkspaceSize] + *@param fillType [fill type] + *@param givenDesc [given tensor descriptor] + *@param xDesc [input tensor descriptor] + *@param sizeInBytes [output size] + *@return ccStatus_t [status] + */ +ccStatus_t ccGetFillWorkspaceSize(const ccFillOpType_t fillType, const ccTensorDescriptor_t xDesc, + uint32_t *sizeInBytes); + +/** + *[ccCast] + *@param handle [cce handler] + *@param alpha [alpha] + *@param xDesc [tensor Description of tensor x] + *@param x [input tensor x] + *@param beta [beta + *@param yDesc [tensor Description of tensor y] + *@param y [output tensor y] + *@return ccStatus_t [status] + */ +ccStatus_t ccCast(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const void *beta, const ccTensorDescriptor_t yDesc, void *y); + +/** + * @ingroup dnn + * @brief round forward: + * data type only support float float16 and int32 + * data format only support ND + * @param [in] handle cce handle + * @param [in] alpha common scale factor + * @param [in] xDesc descriptor of input data + * @param [in] x input data in device memory + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccRoundForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const void *beta, const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief rint forward: + * data type only support float float16 + * data format only support ND + * @param [in] handle cce handle + * @param [in] alpha common scale factor + * @param [in] xDesc descriptor of input data + * @param [in] x input data in device memory + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param 
[in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccRintForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const void *beta, const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief sqrt forward: + * data type only support float float16 + * data format only support ND + * @param [in] handle cce handle + * @param [in] alpha common scale factor + * @param [in] xDesc descriptor of input data + * @param [in] x input data in device memory + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccSqrtForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const void *beta, const ccTensorDescriptor_t outputDesc, void *output); + +/** + *[ccCast] + *@param filterSrcInfo [cce filtersrc descriptor] + *@param filterSrc [filterSrc address] + *@param filterDstInfo [cce filterdst descriptor] + *@param filterDst [filterdst address] + *@param group [group] + *@param ySizeInBytes [fraczfilter size] + *@param outputDataType [datatype] + *@return ccStatus_t [status] + */ +ccStatus_t ccTransGroupConvFilterInt8(ccFilterDescriptor_t filterSrcInfo, const void *filterSrc, + ccFilterDescriptor_t filterDstInfo, void *filterDst, uint32_t group, + uint32_t ySizeInBytes, ccDataType_t outputDataType); + +/** + *[ccGetConcatOutputDim] + *@param xDesc[] [input tensor descriptor] + *@param axis [concat axis] + *@param inputNum [input tensor numbers] + *@param dim[] [output dim] + *@param [in| dimlen length of dim + *@return ccStatus_t [status] + */ +ccStatus_t ccGetConcatOutputDim(const ccTensorDescriptor_t xDesc[], int32_t axis, int32_t inputNum, int32_t *dimCnt, + int32_t dim[], int32_t dimLen); + +/** + * @ingroup dnn + * @brief get the output dimension info of reduce. + * @param [in] xDesc descriptor of input tensor + * @param [in] axis The dimensions to reduce + * @param [in] keepDims If true, retains reduced dimensions with length 1. + * @param [in|out] dimCnt point to the output dimCnt + * @param [in|out] dim arrays to save dims + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetReduceOutputDim(const ccTensorDescriptor_t xDesc, const ccIntArray_t *axis, bool keepDims, + int32_t *dimCnt, int32_t dim[], int32_t dimLen); + +/** + * @ingroup dnn + * @brief reduce sum forward computation + * @param [in] handle cce handle + * @param [in] axis The dimensions to reduce + * @param [in] keepDims If true, retains reduced dimensions with length 1. + * @param [in] alpha scaling factors + * @param [in] xDesc descriptor of input tensor + * @param [in] x input data in device memory + * @param [in] beta bias factors + * @param [in] outputDesc descriptor of output tensor + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccReduceSumForward(ccHandle_t handle, const ccIntArray_t *axis, bool keepDims, const void *alpha, + const ccTensorDescriptor_t xDesc, const void *x, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief reduce max forward computation + * @param [in] handle cce handle + * @param [in] axis The dimensions to reduce + * @param [in] keepDims If true, retains reduced dimensions with length 1. 
+ * @param [in] alpha scaling factors + * @param [in] xDesc descriptor of input tensor + * @param [in] x input data in device memory + * @param [in] beta bias factors + * @param [in] outputDesc descriptor of output tensor + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccReduceMaxForward(ccHandle_t handle, const ccIntArray_t *axis, bool keepDims, const void *alpha, + const ccTensorDescriptor_t xDesc, const void *x, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief reduce min forward computation + * @param [in] handle cce handle + * @param [in] axis The dimensions to reduce + * @param [in] keepDims If true, retains reduced dimensions with length 1. + * @param [in] alpha scaling factors + * @param [in] xDesc descriptor of input tensor + * @param [in] x input data in device memory + * @param [in] beta bias factors + * @param [in] outputDesc descriptor of output tensor + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccReduceMinForward(ccHandle_t handle, const ccIntArray_t *axis, bool keepDims, const void *alpha, + const ccTensorDescriptor_t xDesc, const void *x, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief reduce mean forward computation + * @param [in] handle cce handle + * @param [in] axis The dimensions to reduce + * @param [in] keepDims If true, retains reduced dimensions with length 1. + * @param [in] alpha scaling factors + * @param [in] xDesc descriptor of input tensor + * @param [in] x input data in device memory + * @param [in] beta bias factors + * @param [in] outputDesc descriptor of output tensor + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccReduceMeanForward(ccHandle_t handle, const ccIntArray_t *axis, bool keepDims, const void *alpha, + const ccTensorDescriptor_t xDesc, const void *x, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief reduce prod forward computation + * @param [in] handle cce handle + * @param [in] axis The dimensions to reduce + * @param [in] keepDims If true, retains reduced dimensions with length 1. + * @param [in] alpha scaling factors + * @param [in] xDesc descriptor of input tensor + * @param [in] x input data in device memory + * @param [in] beta bias factors + * @param [in] outputDesc descriptor of output tensor + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccReduceProdForward(ccHandle_t handle, const ccIntArray_t *axis, bool keepDims, const void *alpha, + const ccTensorDescriptor_t xDesc, const void *x, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief reduce all forward computation + * @param [in] handle cce handle + * @param [in] axis The dimensions to reduce + * @param [in] keepDims If true, retains reduced dimensions with length 1. 
+ * @param [in] alpha scaling factors + * @param [in] xDesc descriptor of input tensor + * @param [in] x input data in device memory + * @param [in] beta bias factors + * @param [in] outputDesc descriptor of output tensor + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccReduceAllForward(ccHandle_t handle, const ccIntArray_t *axis, bool keepDims, const void *alpha, + const ccTensorDescriptor_t xDesc, const void *x, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + *@brief print times stats + *@return ccStatus_t [status] + */ +ccStatus_t ccPrintTimeStat(); + +/** + * @ingroup dnn + * @brief reduce abs sum forward computation + * @param [in] handle cce handle + * @param [in] axis The dimensions to reduce + * @param [in] keepDims If true, retains reduced dimensions with length 1. + * @param [in] alpha scaling factors + * @param [in] xDesc descriptor of input tensor + * @param [in] x input data in device memory + * @param [in] beta bias factors + * @param [in] outputDesc descriptor of output tensor + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccReduceAbsSumForward(ccHandle_t handle, const ccIntArray_t *axis, const bool keepDims, const void *alpha, + const ccTensorDescriptor_t xDesc, const void *x, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief reduce square sum forward computation + * @param [in] handle cce handle + * @param [in] axis The dimensions to reduce + * @param [in] keepDims If true, retains reduced dimensions with length 1. + * @param [in] alpha scaling factors + * @param [in] xDesc descriptor of input tensor + * @param [in] x input data in device memory + * @param [in] beta bias factors + * @param [in] outputDesc descriptor of output tensor + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccReduceSquareSumForward(ccHandle_t handle, const ccIntArray_t *axis, const bool keepDims, const void *alpha, + const ccTensorDescriptor_t xDesc, const void *x, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief get the output dimension info of crop and resize + * @param [in] imageDesc descriptor of images + * @param [in] boxesDesc descriptor of boxes + * @param [in] boxidxDesc descriptor of boxidx + * @param [in] resizeHeight resize height + * @param [in] resizeWidth resize width + * @param [out] dimCnt dimcnt of output + * @param [out] dim dim of output + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetCropAndResizeOutputDim(const ccTensorDescriptor_t imageDesc, const ccTensorDescriptor_t boxesDesc, + const ccTensorDescriptor_t boxidxDesc, const int32_t resizeHeight, + const int32_t resizeWidth, int32_t *dimCnt, int32_t dim[], int32_t dimLen); + +/** + * @ingroup dnn + * @brief crop and resize forward. 
+ * @param [in] handle cce handle + * @param [in] alpha common scale factor + * @param [in] imageDesc descriptor of images + * @param [in] image input data in device memory + * @param [in] boxesDesc descriptor of boxes + * @param [in] boxes input data in device memory + * @param [in] boxidxDesc descriptor of boxidx + * @param [in] boxidx input data in device memory + * @param [in] method enum of resize method + * @param [in] extrapolationValue Value used for extrapolation, when applicable + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccCropAndResizeForward(ccHandle_t handle, const ccResizeMethod_t method, const float extrapolationValue, + const void *alpha, const ccTensorDescriptor_t imageDesc, const void *image, + const ccTensorDescriptor_t boxesDesc, const void *boxes, + const ccTensorDescriptor_t boxidxDesc, const void *boxidx, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief select forward computation + * @param [in] handle cce handle + * @param [in] alpha reserved + * @param [in] condDesc descriptor of cond tensor + * @param [in] cond cond data in device memory + * @param [in] xDesc descriptor of x tensor + * @param [in] x x data in device memory + * @param [in] yDesc descriptor of y tensor + * @param [in] y y data in device memory + * @param [in] beta reserved + * @param [in] outputDesc descriptor of output tensor + * @param [out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccSelect(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t condDesc, const void *cond, + const ccTensorDescriptor_t xDesc, const void *x, const ccTensorDescriptor_t yDesc, const void *y, + const void *beta, const ccTensorDescriptor_t outDesc, void *out); + +/** + * @ingroup dnn + * @brief get the output dimension info of where + * @param [in] xDesc descriptor of input tensor + * @param [in|out] dimCnt point to the output dimCnt + * @param [in|out] dim arrays to save dims + * @return ccStatus_t + */ +ccStatus_t ccGetWhereOutputDim(const ccTensorDescriptor_t xDesc, int32_t *dimCnt, int32_t *dim, int32_t dimLen); + +/** + * @ingroup dnn + * @brief where forward computation + * @param [in] handle cce handle + * @param [in] alpha reserved + * @param [in] condDesc descriptor of cond tensor + * @param [in] cond cond data in device memory + * @param [in] xDesc descriptor of x tensor + * @param [in] x x data in device memory + * @param [in] yDesc descriptor of y tensor + * @param [out] y y data in device memory + * @return ccStatus_t + */ +ccStatus_t ccWhere(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const void *beta, const ccTensorDescriptor_t yDesc, void *y); + +/** + * @ingroup dnn + * @brief reverse forward. 
+ * @param [in] handle cce handle + * @param [in] axis dim that need reverse + * @param [in] alpha common scale factor + * @param [in] xDesc descriptor of input data + * @param [in] x input data in device memory + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccReverseForward(ccHandle_t handle, const ccIntArray_t *axis, const void *alpha, + const ccTensorDescriptor_t xDesc, const void *x, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief floor forward: + * data type only support float float16 + * data format only support ND + * @param [in] handle cce handle + * @param [in] alpha common scale factor + * @param [in] xDesc descriptor of input data + * @param [in] x input data in device memory + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccFloorForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const void *beta, const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief ceil forward: + * data type only support float float16 + * data format only support ND + * @param [in] handle cce handle + * @param [in] alpha common scale factor + * @param [in] xDesc descriptor of input data + * @param [in] x input data in device memory + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccCeilForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const void *beta, const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief get the output dimension info of truncate mod + * @param [in] xDesc descriptor of input tensor + * @param [in] yDesc descriptor of input tensor + * @param [out] dimCnt [dim count of the output tensor] + * @param [out] dim[] [shape of the output tensor] + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetTruncatemodOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t yDesc, + int32_t *dimCnt, int32_t dim[], int32_t dimLen); + +/** + * @ingroup dnn + * @brief truncate mod forward computation + * @param [in] handle cce handle + * @param [in] alpha scaling factors + * @param [in] xDesc descriptor of input tensor + * @param [in] x input data in device memory + * @param [in] yDesc descriptor of input tensor + * @param [in] y input data in device memory + * @param [in] beta bias factors + * @param [in] outputDesc descriptor of output tensor + * @param [out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccTruncatemodForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const ccTensorDescriptor_t yDesc, const void *y, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); +/** + * @ingroup dnn + * @brief Spatial Pyramid Pooling + * @param [in] handle cce handle + * @param [in] alpha reserved + * @param [in] xDesc descriptor of input tensor + * @param [in] x input data in device memory + * @param [in] workspace temp workspace + * @param [in] workspaceSizeInBytes temp workspace size + * @param [in] pyramidHeight pyramid height + * @param [in] 
poolingMode pooling mode + * @param [in] beta reserved + * @param [in] outputDesc descriptor of output tensor + * @param [out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccSPPForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + void *workspace, const uint32_t workspaceSizeInBytes, const uint32_t pyramidHeight, + const ccPoolingMode_t poolingMode, const void *beta, const ccTensorDescriptor_t outputDesc, + void *output); +/** + * @ingroup dnn + * @brief Get Spatial Pyramid Pooling output dim + * @param [in] xDesc descriptor of input tensor + * @param [in] pyramidHeight pyramid height + * @param [in] dimLen length of dim + * @param [out] dimCnt output tensor dim cnt + * @param [out] dim output tensor dim + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetSPPOutputDim(const ccTensorDescriptor_t xDesc, const uint32_t pyramidHeight, int32_t *dimCnt, + int32_t dim[], const int32_t dimLen); +/** + * @ingroup dnn + * @brief Get Spatial Pyramid Pooling workspace size + * @param [in] xDesc descriptor of input tensor + * @param [in] pyramidHeight pyramid height + * @param [out] workspaceSizeInBytes workspace size + * @return ccStatus_t + */ +ccStatus_t ccGetSPPWorkspaceSize(const ccTensorDescriptor_t xDesc, const uint32_t pyramidHeight, + uint32_t *workspaceSizeInBytes); + +/** + * @ingroup dnn + * @brief BNLL forward computation + * @param [in] handle cce handle + * @param [in] alpha scaling factors + * @param [in] xDesc descriptor of input tensor + * @param [in] x input data in device memory + * @param [in] beta bias factors + * @param [in] outputDesc descriptor of output tensor + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccBNLLForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const void *beta, const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief bias forward. + * @param [in] handle cce handle + * @param [in] axis axis + * @param [in] alpha common scale factor + * @param [in] xDesc descriptor of input data x + * @param [in] x input data x in device memory + * @param [in] biasDesc descriptor of input data bias + * @param [in] bias input data bias in device memory + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccBiasForward(ccHandle_t handle, const int axis, const void *alpha, const ccTensorDescriptor_t xDesc, + const void *x, const ccTensorDescriptor_t biasDesc, const void *bias, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief threshold forward computation + * @param [in] handle cce handle + * @param [in] threshold threshold + * @param [in] alpha scaling factors + * @param [in] xDesc descriptor of input tensor + * @param [in] x input data in device memory + * @param [in] beta bias factors + * @param [in] outputDesc descriptor of output tensor + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccThresholdForward(ccHandle_t handle, const void *threshold, const void *alpha, + const ccTensorDescriptor_t xDesc, const void *x, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief shufflechannel forward. 
+ * @param [in] handle cce handle + * @param [in] alpha common scale factor + * @param [in] group number of groups + * @param [in] xDesc descriptor of input data + * @param [in] x input data in device memory + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +// TODO AICPU: please add shufflechannel custom params and comment +ccStatus_t ccShuffleChannelForward(ccHandle_t handle, const void *alpha, uint32_t group, + const ccTensorDescriptor_t xDesc, const void *x, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief mvn forward. + * @param [in] handle cce handle + * @param [in] acrossChannel across channel. true: across, false: not + * @param [in] normalizeVariance normalizeVariance. true: normalizeVariance, false: not + * @param [in] alpha common scale factor + * @param [in] xDesc descriptor of input data + * @param [in] x input data in device memory + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccMVNForward(ccHandle_t handle, bool acrossChannel, bool normalizeVariance, const void *alpha, + const ccTensorDescriptor_t xDesc, const void *x, void *workSpace, uint32_t workSpaceSizeInBytes, + const void *beta, const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief get the workspace size of mvn + * @param [in] xDesc descriptor of input data + * @param [in] acrossChannel across channel. true: across, false: not + * @param [in|out] sizeInBytes Workspace size need for whole computation + */ +ccStatus_t ccGetMVNWorkspaceSize(const ccTensorDescriptor_t xDesc, bool acrossChannel, uint32_t *sizeInBytes); + +/** + * @ingroup dnn + * @brief heatmap2coord forward output is hotspot value and corresponding coordinates + * @param [in] handle cce handle + * @param [in] alpha common scale factor + * @param [in] xDesc descriptor of input data + * @param [in] x input data in device memory + * @param [in] coordh calibration high + * @param [in] coordw calibration wide + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccHeatmap2coordForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + int32_t coordh, int32_t coordw, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); +/** + * @ingroup dnn + * @brief get the output dimension info of heatmap2coord + * @param [in] xDesc descriptor of input tensor + * @param [in|out] dimCnt point to the output dimCnt + * @param [in|out] dim arrays to save dims + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetHeatmap2coordOutputDim(const ccTensorDescriptor_t xDesc, int32_t *dimCnt, int32_t *dim, int32_t dimLen); + +/** + * @ingroup dnn + * @brief swish forward. 
+ * @param [in] handle cce handle + * @param [in] scale param of swish function, y = x / (1 + sigmoid(scale * x)) + * @param [in] alpha common scale factor + * @param [in] xDesc descriptor of input data + * @param [in] x input data in device memory + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ + +ccStatus_t ccSwishForward(ccHandle_t handle, const float scale, const void *alpha, const ccTensorDescriptor_t xDesc, + const void *x, const void *beta, const ccTensorDescriptor_t outputDesc, void *output); + +ccStatus_t ccTeForward(ccHandle_t handle, const void *stubFunc, uint32_t coreDim, const void *args, uint32_t argsSize, + const rtL2Ctrl_t *l2ctrl, int32_t inputNum, const ccTensorDescriptor_t xDesc[], const void *x[], + int32_t outputNum, const ccTensorDescriptor_t yDesc[], void *y[], bool isAiCore); + +#ifndef DAVINCI_LITE +ccStatus_t ccAiCpuCustomizeForward(ccHandle_t handle, aicpu_run_func stubFunc, opTensor_t *xOpDesc[], void *x[], + int32_t inputNum, opTensor_t *yOpDesc[], void *y[], void *op_attr_handle, + int32_t outputNum, const ccTensorDescriptor_t xDesc[], + const ccTensorDescriptor_t yDesc[], const void *op_attr_str, uint32_t op_attr_size); +#endif +/** + * @ingroup dnn + * @brief embedding lookup forward. + * @param [in] handle cce handle + * @param [in] alpha common scale factor + * @param [in] xDesc descriptor of input data x + * @param [in] x input data x in device memory + * @param [in] idxDesc descriptor of input data idx + * @param [in] idx input data idx in device memory + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccEmbeddingLookupForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, + const void *x, const ccTensorDescriptor_t idxDesc, const void *idx, + const void *beta, const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup + * @brief embedding lookup forward. 
+ * @param [in] handle cce handle + * @param [in] alpha common scale factor + * @param [in] inputNum inputNum + * @param [in] xDesc[] descriptor array of input data x + * @param [in] x[] input data x array in device memory + * @param [in] workSpace workSpace addr + * @param [in] workSpaceSizeInBytes workSpace size + * @param [in] idxDesc descriptor of input data idx + * @param [in] idx input data idx in device memory + * @param [in] partitionStrategy partitionStrategy + * @param [in] maxNorm addr of maxNorm + * @param [in] beta common scale factor + * @param [in] outputDesc descriptor of output data + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccEmbeddingLookupForward(ccHandle_t handle, const void *alpha, const int32_t inputNum, + const ccTensorDescriptor_t xDesc[], const void *x[], void *workSpace, + const uint32_t workSpaceSizeInBytes, const ccTensorDescriptor_t idxDesc, + const void *idx, ccPartitionStrategy_t partitionStrategy, const void *maxNorm, + const void *beta, const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + *[ccGetEmbeddingLookupOutputDim] + *@param inputNum [input tensor numbers] + *@param xDesc[] [input tensor descriptor] + *@param idxDesc [idx tensor descriptor] + *@param dimCnt [output dim count] + *@param dim[] [output dim] + *@param [in| dimlen length of dim + *@return ccStatus_t [status] + */ +ccStatus_t ccGetEmbeddingLookupOutputDim(const int32_t inputNum, const ccTensorDescriptor_t xDesc[], + const ccTensorDescriptor_t idxDesc, int32_t *dimCnt, int32_t dim[], + int32_t dimLen); + +/** + * @ingroup dnn + *[ccGetEmbeddingLookupWorkspaceSize] + *@param inputNum [input tensor numbers] + *@param idxDesc [input tensor descriptor] + *@param isMaxNormExist [isMaxNormExist] + *@param sizeInBytes [output size] + *@return ccStatus_t [status] + */ +ccStatus_t ccGetEmbeddingLookupWorkspaceSize(const int32_t inputNum, const ccTensorDescriptor_t idxDesc, + const bool isMaxNormExist, uint32_t *sizeInBytes); + +/** + * @ingroup dnn + * @brief check if it is the first layer of resnet50 and semecefc + * @param [in] tensorDesc descriptor of input tensor. + * @param [in] convDesc conv descriptor. + * @param [in] filterDesc descriptor of weight tensor. 
+ * @return ccStatus_t + */ +ccStatus_t c04DescParamCheck(const ccTensorDescriptor_t tensorDesc, const ccConvolutionDescriptor_t convDesc, + const ccFilterDescriptor_t filterDesc); + +#ifndef DAVINCI_LITE +/** + * @ingroup dnn + * @brief convolution forward computation + * @param [in] handle cce handle + * @param [in] convDesc descriptor of convolution operator + * @param [in] alpha scaling factors + * @param [in] beta scaling factors + * @param [in] xDesc x descriptor of input tensor + * @param [in] x x data in device memory + * @param [in] dyDesc descriptor of dy + * @param [in] dy dy data in device memory + * @param [in] dwDesc descriptor of dwDesc + * @param [out] dw dw data in device memory + * @param [in] algo algorithm of convolution forward + * @param [in] workSpace temp space, maybe NULL if no need temp space + * @param [in] workSpaceSizeInBytes sizeof workspace + * @return ccStatus_t + */ +ccStatus_t ccConvolutionBackwardFilter(ccHandle_t handle, const ccConvolutionDescriptor_t convDesc, void *alpha, + void *beta, const ccTensorDescriptor_t xDesc, const void *x, + const ccTensorDescriptor_t dyDesc, const void *dy, + const ccFilterDescriptor_t dwDesc, void *dw, ccConvolutionBwdAlgo_t algo, + void *workSpace, uint32_t workSpaceSizeInBytes); +#endif + +/** + * @ingroup dnn + * @brief get the temp space size of convolution forward computation, maybe no need temp space + * @param [in] handle cce handle + * @param [in] dyDesc descriptor of input tensor dy + * @param [in] convDesc descriptor of convolution operator + * @param [in] xDesc descriptor of input tensor + * @param [in] dwDesc descriptor of filter + * @param [in] algo algorithm of convolution forward + * @param [in|out] sizeInBytes temp space size need for specified algorithm + * @return ccStatus_t + */ +ccStatus_t ccGetConvolutionBackwardFilterWorkspaceSize(ccHandle_t handle, const ccTensorDescriptor_t dyDesc, + const ccConvolutionDescriptor_t convDesc, + const ccTensorDescriptor_t xDesc, + const ccFilterDescriptor_t dwDesc, ccConvolutionBwdAlgo_t algo, + uint32_t *sizeInBytes); + +#ifndef DAVINCI_LITE +ccStatus_t ccBatchNormalizationBackward(ccHandle_t handle, ccBatchNormMode_t mode, const void *alphaDataDiff, + const void *betaDataDiff, const void *alphaParamDiff, const void *betaParamDiff, + const ccTensorDescriptor_t xDesc, const void *x, + const ccTensorDescriptor_t dyDesc, const void *dy, + const ccTensorDescriptor_t dxDesc, void *dx, + const ccTensorDescriptor_t bnScaleBiasDiffDesc, const void *bnScale, + void *resultBnScaleDiff, void *resultBnBiasDiff, const void *workSpace, + const uint32_t workSpaceSizeInBytes, double epsilon, const void *SaveMean, + const void *SaveInvVariance); +#endif + +ccStatus_t ccGetBatchNormalizationBackwardWorkspaceSize(ccHandle_t handle, ccBatchNormMode_t mode, + ccTensorDescriptor_t xDesc, ccTensorDescriptor_t dyDesc, + ccTensorDescriptor_t dxDesc, + ccTensorDescriptor_t bnScaleBiasDesc, uint32_t *sizeInBytes); + +#ifndef DAVINCI_LITE +ccStatus_t ccBatchNormalizationForwardTraining(ccHandle_t handle, ccBatchNormMode_t mode, const void *alpha, + const void *beta, const ccTensorDescriptor_t xDesc, const void *x, + const ccTensorDescriptor_t yDesc, void *y, + const ccTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale, + const void *bnBias, double exponentialAverageFactor, + void *resultRunningMean, void *resultRunningVariance, void *workSpace, + uint32_t workSpaceSizeInBytes, double epsilon, void *resultSaveMean, + void *resultSaveInvVariance, const bool isTraining); +#endif + 
+ccStatus_t ccGetBatchNormalizationForwardTrainingWorkspaceSize(ccHandle_t handle, ccBatchNormMode_t mode, + ccTensorDescriptor_t xDesc, ccTensorDescriptor_t yDesc, + const ccTensorDescriptor_t bnScaleBiasMeanVarDesc, + uint32_t *sizeInBytes); + +/** + * @ingroup dnn + * @brief generate an random normal Tensor use given on/off scale. + * @param [in] handle Stream handle. + * @param [in] alpha reserved. + * @param [in] meanDesc Mean description of one-hot position. + * @param [in] mean Data pointer of mean. + * @param [in] scaleDesc On/off scale description. + * @param [in] scale Data pointer of on/off scale. + * @param [in] seed random seed used to generate random number + * @param [in] seed2 random seed used to generate random number + * @param [in] beta reserved. + * @param [in] outputDesc Description of the generated one-hot tensor. + * @param [output] output Data pointer of output. + * @return ccStatus_t + */ +ccStatus_t ccRandomNormalForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t meanDesc, + const void *mean, const ccTensorDescriptor_t scaleDesc, const void *scale, + const int64_t seed1, const int64_t seed2, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/** + * @ingroup dnn + * @brief generate random uniform tensor. + * @param [in] handle Stream handle. + * @param [in] alpha reserved. + * @param [in] minvalDesc Mean description of one-hot position. + * @param [in] minval Data pointer of mean. + * @param [in] maxvalDesc On/off scale description. + * @param [in] maxval Data pointer of on/off scale. + * @param [in] seed random seed used to generate random number + * @param [in] seed2 random seed used to generate random number + * @param [in] beta reserved. + * @param [in] outputDesc Description of the generated one-hot tensor. + * @param [output] output Data pointer of output. 
+ * @return ccStatus_t + */ +ccStatus_t ccRandomUniformForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t minvalDesc, + const void *minval, const ccTensorDescriptor_t maxvalDesc, const void *maxval, + const int64_t seed1, const int64_t seed2, const void *beta, + const ccTensorDescriptor_t outputDesc, void *output); + +/**^M + * @ingroup dnn^M\r 10932 + * @brief generate BatchMatMul tensor.^M\r 10933 + * @param [in] handle Stream handle.^M\r 10934 + * @param [in] alpha reserved.^M\r 10935 + * @param [in] xDesc tensorA Desc.^M\r 10936 + * @param [in] x Data pointer of tensorA.^M\r 10937 + * @param [in] yDesc tensorB Desc.^M\r 10938 + * @param [in] y Data pointer of tensorB.^M\r 10939 + * @param [in] beta reserved.^M\r 10940 + * @param [in] adj_x tensorA transpose flag^M\r 10941 + * @param [in] adj_y tensorB transpose flag^M\r 10942 + * @param [in] outpDesc Description of the tensor output .^M\r 10943 + * @param [output] out Data pointer of output.^M\r 10944 + * @return ccStatus_t^M + */ +ccStatus_t ccBatchMatMulForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const ccTensorDescriptor_t yDesc, const void *y, const void *beta, const bool adj_x, + const bool adj_y, const ccTensorDescriptor_t outDesc, void *out); + +ccStatus_t ccGetBatchMatMulOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t yDesc, bool adj_x, + bool adj_y, int32_t *dimCnt, int32_t dim[], int32_t dimLen); + +/** + * @ingroup dnn + * @brief generator conv int8 all offset factor + * @param [in] para the struct for scale and offset of input, filter and output + * @param [in|out] offsetW offset of filter + * @param [in|out] offsetPad offset of input + * @param [in|out] scaledQrq scale computing result of input , filter and output + * @param [in|out] nextoffsetq offset of output + * @return ccStatus_t + */ +ccStatus_t ccGenQuantAllOffsetFactor(const ccQuantAllOffsetPara_t *para, uint8_t &offsetW, uint8_t &offsetPad, + uint16_t &scaledQrq, uint16_t &nextoffsetq); + +/** + * @ingroup dnn + * @brief get conv int8 all offset fracZ size + * @param [in] filterDesc descriptor of filter tensor + * @param [in|out] conv int8 all offset fracZ size + * @param [in] groupNum group conv num + * @return ccStatus_t + */ +ccStatus_t ccSetGroupConvScene(const ccFilterDescriptor_t tensorDesc, ccConvolutionDescriptor_t convDesc); + +ccStatus_t ccGetInt8AllOffsetFilterFracZSizeInBytes(const ccFilterDescriptor_t filterSrcDesc, + const ccFilterDescriptor_t filterDesc, uint32_t &size, + uint32_t groupNum); + +/** + * @ingroup dnn + * @brief transform filter in conv int8 all offset scene + * @param [in] filterSrcInfo descriptor of filter tensor before fracZ transform + * @param [in] filterSrc filter addr before fracZ transform + * @param [in] filterDstInfo descriptor of filter tensor after fracZ transform + * @param [in] filterDst filter addr after fracZ transform + * @param [in] quantPara the struct for scale and offset of input, filter and output + * @param [in] ySizeInBytes filter size after fracZ transform + * @param [in|out] outputDataType output data type + * @param [in] groupNum group conv num + * @return ccStatus_t + */ +ccStatus_t ccTransFilterInt8AllOffset(ccFilterDescriptor_t filterSrcInfo, const void *filterSrc, + ccFilterDescriptor_t filterDstInfo, void *filterDst, + const ccQuantAllOffsetPara_t *quantPara, uint32_t ySizeInBytes, + ccDataType_t outputDataType, uint32_t groupNum); + +/** + * @ingroup dnn + * @brief transform bias in conv int8 all offset 
scene + * @param [in] filterDesc descriptor of filter tensor + * @param [in] biasDesc descriptor of bias tensor + * @param [in] quantPara the struct for scale and offset of input, filter and output + * @param [in] w filter addr + * @param [in] bias bias addr + * @return ccStatus_t + */ +ccStatus_t ccTransInt8AllOffsetBias(const ccFilterDescriptor_t filterDesc, const ccTensorDescriptor_t biasDesc, + const ccQuantAllOffsetPara_t *quantPara, const void *w, const void *bias); + +/** + * @ingroup dnn + * @get dequantize + * @param [in] handle handle id + * @param [in] alpha alpha addr + * @param [in] xDesc the input Desc descriptor + * @param [in] x x data addr + * @param [in] beta beta data addr + * @param [in] yDesc the output Desc descriptor + * @param [in] y y data addr + * @return ccStatus_t + */ +ccStatus_t ccDequantizeCoreForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, + const void *x, const void *beta, const ccTensorDescriptor_t yDesc, void *y); +/** + * @ingroup dnn + * @get quantize + * @param [in] handle handle id + * @param [in] alpha alpha addr + * @param [in] xDesc the input Desc descriptor + * @param [in] x x data addr + * @param [in] beta beta data addr + * @param [in] yDesc the output Desc descriptor + * @param [in] y y data addr + * @return ccStatus_t + */ +ccStatus_t ccQuantizeCoreForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const void *beta, const ccTensorDescriptor_t yDesc, void *y); + +#ifndef DAVINCI_LITE +ccStatus_t ccActivationBackward(ccHandle_t handle, const ccActivationDescriptor_t activationDesc, const void *alpha, + const ccTensorDescriptor_t dyDesc, const void *dy, const ccTensorDescriptor_t xDesc, + const void *x, const void *beta, const ccTensorDescriptor_t dxDesc, void *dx); +#endif + +ccStatus_t ccL2LossForward(ccHandle_t handle, const ccL2LossDescriptor_t l2lossDesc, const void *alpha, + const ccTensorDescriptor_t xDesc, const void *x, const void *beta, + const ccTensorDescriptor_t yDesc, void *y); + +/** + * @ingroup dnn + * @brief get the output dimension info of top k v2 + * @param [in] xDesc descriptor of input tensor x + * @param [in] yDesc descriptor of input tensor y + * @param [in|out] dimCnt point to the output dimCnt + * @param [in|out] dim arrays to save dims + * @param [in| dimlen length of dim + * @return ccStatus_t + */ +ccStatus_t ccGetTopKV2OutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t kDesc, const void *k, + const int64_t axis, int32_t *dimCnt, int32_t dim[], int32_t dimLen); + +/** + * @ingroup dnn + * @brief top k v2 forward computation + * @param [in] handle cce handle + * @param [in] alpha scaling factors + * @param [in] xDesc descriptor of input tensor x + * @param [in] x input data x in device memory + * @param [in] yDesc descriptor of input tensor y + * @param [in] y input data y in device memory + * @param [in] beta bias factors + * @param [in] outputDesc descriptor of output tensor + * @param [in|out] output output data in device memory + * @return ccStatus_t + */ +ccStatus_t ccTopKV2Forward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x, + const ccTensorDescriptor_t kDesc, const void *k, const void *beta, const bool sorted, + const int64_t axis, void *workSpace, const uint32_t workSpaceSizeInBytes, + const ccTensorDescriptor_t outputValuesDesc, void *outputValues, + const ccTensorDescriptor_t outputIndicesDesc, void *outputIndices); + +/** + * @ingroup dnn + * @brief get the workspace 
+/**
+ * @ingroup dnn
+ * @brief get the workspace size of top k v2
+ * @param [in] xDesc descriptor of input tensor x
+ * @param [in] kDesc descriptor of input tensor k
+ * @param [in] indiceDesc descriptor of output indices tensor
+ * @param [in] k data pointer of k
+ * @param [in] axis the axis to sort along
+ * @param [in|out] sizeInBytes point to workspace size
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetTopKV2ForwardWorkspaceSize(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t kDesc,
+                                           const ccTensorDescriptor_t indiceDesc, const void *k, const int64_t axis,
+                                           uint32_t *sizeInBytes);
+
+/**
+ * @ingroup dnn
+ * @brief Get unsorted segment reduction output dim
+ * @param [in] xDesc descriptor of input tensor
+ * @param [in] segmentIdsDesc descriptor of input segmentIds tensor
+ * @param [in] segmentsNum output slice num
+ * @param [out] dimCnt output tensor dim cnt
+ * @param [out] dim output tensor dim
+ * @param [in] dimLen length of dim
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetUnsortedSegmentReductionOutputDim(const ccTensorDescriptor_t xDesc,
+                                                  const ccTensorDescriptor_t segmentIdsDesc, int32_t segmentsNum,
+                                                  int32_t *dimCnt, int32_t dim[], int32_t dimLen);
+
+/**
+ * @ingroup dnn
+ * @brief unsorted segment sum forward computation
+ * @param [in] handle cce handle
+ * @param [in] alpha scaling factors
+ * @param [in] xDesc descriptor of input tensor
+ * @param [in] x input data in device memory
+ * @param [in] segmentIdsDesc descriptor of input segmentIds tensor
+ * @param [in] segmentIds input segmentIds data in device memory
+ * @param [in] segmentsNum output slice num
+ * @param [in] beta bias factors
+ * @param [in] outputDesc descriptor of output tensor
+ * @param [in|out] output output data in device memory
+ * @return ccStatus_t
+ */
+ccStatus_t ccUnsortedSegmentSumForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc,
+                                       const void *x, const ccTensorDescriptor_t segmentIdsDesc, const void *segmentIds,
+                                       const int32_t segmentsNum, const void *beta,
+                                       const ccTensorDescriptor_t outputDesc, void *output);
+
+/**
+ * @ingroup dnn
+ * @brief reverse sequence forward computation
+ * @param [in] handle cce handle
+ * @param [in] alpha scaling factors
+ * @param [in] inputDesc descriptor of input tensor
+ * @param [in] input input data in device memory
+ * @param [in] seqLengthsDesc descriptor of input seqLengths tensor
+ * @param [in] seqLengths input seqLengths data in device memory
+ * @param [in] seqAxis the axis along which elements are reversed
+ * @param [in] batchAxis the axis along which the reversal is batched
+ * @param [in] beta bias factors
+ * @param [in] outputDesc descriptor of output tensor
+ * @param [in|out] output output data in device memory
+ * @return ccStatus_t
+ */
+ccStatus_t ccReverseSequenceForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t inputDesc,
+                                    const void *input, const ccTensorDescriptor_t seqLengthsDesc,
+                                    const void *seqLengths, int64_t seqAxis, int64_t batchAxis, const void *beta,
+                                    const ccTensorDescriptor_t outputDesc, void *output);
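+/*
+ * Usage sketch (editorial addition, assumed calling pattern; not taken from
+ * this header): derive the output shape first, build outputDesc from it, then
+ * run the reduction. Descriptors and device pointers are placeholders;
+ * CC_DIM_MAX comes from dnn_struct_base.hpp in this patch.
+ *
+ *   int32_t dimCnt = 0;
+ *   int32_t dim[CC_DIM_MAX] = {0};
+ *   (void)ccGetUnsortedSegmentReductionOutputDim(xDesc, segmentIdsDesc, segmentsNum,
+ *                                                &dimCnt, dim, CC_DIM_MAX);
+ *   // ... create outputDesc from (dimCnt, dim) and allocate output ...
+ *   (void)ccUnsortedSegmentSumForward(handle, alpha, xDesc, x, segmentIdsDesc, segmentIds,
+ *                                     segmentsNum, beta, outputDesc, output);
+ */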
+/**
+ * @ingroup dnn
+ * @brief compute element-wise equality of two tensors.
+ * @param [in] alpha reserved.
+ * @param [in] xDesc description of the left operator tensor.
+ * @param [in] x data point of the left operator tensor.
+ * @param [in] yDesc description of the right operator tensor.
+ * @param [in] y data point of the right operator tensor.
+ * @param [in] beta reserved.
+ * @param [in] outputDesc description of the output tensor.
+ * @param [output] output data point of the output tensor.
+ * @return ccStatus_t
+ */
+ccStatus_t ccEqualForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x,
+                          const ccTensorDescriptor_t yDesc, const void *y, const void *beta,
+                          const ccTensorDescriptor_t outputDesc, void *output);
+
+/**
+ * @ingroup dnn
+ * @brief get output shape of equal.
+ * @param [in] xDesc description of the left operator tensor.
+ * @param [in] yDesc description of the right operator tensor.
+ * @param [out] dimCnt output tensor dim cnt
+ * @param [out] dim output tensor dim
+ * @param [in] dimLen length of dim
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetEqualOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t yDesc, int32_t *dimCnt,
+                               int32_t *dim, int32_t dimLen);
+
+/**
+ * @ingroup dnn
+ * @brief invert permutation forward computation
+ * @param [in] handle cce handle
+ * @param [in] alpha scaling factors
+ * @param [in] xDesc descriptor of input tensor
+ * @param [in] x input data in device memory
+ * @param [in] beta bias factors
+ * @param [in] outputDesc descriptor of output tensor
+ * @param [in|out] output output data in device memory
+ * @return ccStatus_t
+ */
+ccStatus_t ccInvertPermutationForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc,
+                                      const void *x, const void *beta, const ccTensorDescriptor_t outputDesc,
+                                      void *output);
+
+/**
+ * @ingroup dnn
+ * @brief get the workspace size of non max suppression
+ * @param [in] handle descriptor of handle
+ * @param [in] scoresDesc descriptor of input tensor scoresDesc
+ * @param [in] boxesDesc descriptor of input tensor boxesDesc
+ * @param [in|out] sizeInBytes point to workspace size
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetNonMaxSuppressionWorkspaceSize(ccHandle_t handle, const ccTensorDescriptor_t scoresDesc,
+                                               const ccTensorDescriptor_t boxesDesc, uint32_t *sizeInBytes);
+
+/**
+ * @ingroup dnn
+ * @brief get the output dim of non max suppression
+ * @param [in] scoresDesc descriptor of input tensor scoresDesc
+ * @param [in] maxOutPutSize the max size of output
+ * @param [in|out] dimCnt point to the count of dim
+ * @param [in|out] dim[] the array of output dim
+ * @param [in] dimLen length of dim
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetNonMaxSuppressionOutputDim(const ccTensorDescriptor_t scoresDesc, const int32_t maxOutPutSize,
+                                           int32_t *dimCnt, int32_t dim[], int32_t dimLen);
+
+/**
+ * @ingroup dnn
+ * @brief multinomial forward.
+ * @param [in] handle cce handle
+ * @param [in] alpha common scale factor
+ * @param [in] xDesc descriptor of input data
+ * @param [in] x input data in device memory
+ * @param [in] numSamples number of independent samples to draw for each row slice
+ * @param [in] seed1 used to create a random seed for the distribution
+ * @param [in] seed2 used to create a random seed for the distribution
+ * @param [in] workSpace work space for inter access
+ * @param [in] workSpaceSizeInBytes work space size
+ * @param [in] beta common scale factor
+ * @param [in] outputDesc descriptor of output data
+ * @param [in|out] output output data in device memory
+ * @return ccStatus_t
+ */
+ccStatus_t ccMultinomialForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x,
+                                int32_t numSamples, int64_t seed1, int64_t seed2, void *workSpace,
+                                uint32_t workSpaceSizeInBytes, const void *beta, const ccTensorDescriptor_t outputDesc,
+                                void *output);
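+/*
+ * Usage sketch (editorial addition, assumed calling pattern; not taken from
+ * this header): size the workspace with ccGetMultinomialWorkspaceSize,
+ * declared below, then draw the samples. All names are placeholders.
+ *
+ *   uint32_t wsSize = 0;
+ *   (void)ccGetMultinomialWorkspaceSize(xDesc, &wsSize);
+ *   void *workSpace = devMalloc(wsSize);
+ *   (void)ccMultinomialForward(handle, alpha, xDesc, x, numSamples, seed1, seed2,
+ *                              workSpace, wsSize, beta, outputDesc, output);
+ */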
+/**
+ * @ingroup dnn
+ * @brief get output dim of generated one-hot tensor.
+ * @param [in] indicesDesc Indices description of one-hot position.
+ * @param [in] depth depth of the one-hot dimension.
+ * @param [in] axis the axis to fill with on/off values.
+ * @param [output] dimCnt point to the output dimCnt.
+ * @param [output] dim arrays to save dims.
+ * @param [in] dimLen length of dim
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetOneHotOutputDim(const ccTensorDescriptor_t indicesDesc, int32_t depth, int32_t axis, int32_t *dimCnt,
+                                int32_t *dim, int32_t dimLen);
+
+/**
+ * @ingroup dnn
+ * @brief generate a one-hot tensor using the given on/off values.
+ * @param [in] handle Stream handle.
+ * @param [in] alpha reserved.
+ * @param [in] indicesDesc Indices description of one-hot position.
+ * @param [in] indices Data pointer of indices.
+ * @param [in] onDesc On value description.
+ * @param [in] on Data pointer of on value.
+ * @param [in] offDesc Off value description.
+ * @param [in] off Data pointer of off value.
+ * @param [in] depth depth of the one-hot dimension.
+ * @param [in] axis the axis to fill with on/off values.
+ * @param [in] beta reserved.
+ * @param [in] outputDesc Description of the generated one-hot tensor.
+ * @param [output] output Data pointer of output.
+ * @return ccStatus_t
+ */
+ccStatus_t ccOneHotForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t indicesDesc,
+                           const void *indices, const ccTensorDescriptor_t onDesc, const void *on,
+                           const ccTensorDescriptor_t offDesc, const void *off, const int32_t depth, const int32_t axis,
+                           const void *beta, const ccTensorDescriptor_t outputDesc, void *output);
+/**
+ * @ingroup dnn
+ * @brief get the workspace size of multinomial
+ * @param [in] xDesc descriptor of input tensor
+ * @param [out] sizeInBytes workspace size in bytes
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetMultinomialWorkspaceSize(const ccTensorDescriptor_t xDesc, uint32_t *sizeInBytes);
+/**
+ * @ingroup dnn
+ * @brief get the output dimension info of multinomial
+ * @param [in] xDesc descriptor of input tensor
+ * @param [in] numSample number of independent samples to draw for each row slice
+ * @param [in|out] dimCnt point to the output dimCnt
+ * @param [in|out] dim arrays to save dims
+ * @param [in] dimLen length of dim
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetMultinomialOutputDim(const ccTensorDescriptor_t xDesc, int32_t numSample, int32_t *dimCnt,
+                                     int32_t dim[], int32_t dimLen);
+/**
+ * @ingroup dnn
+ * @brief get the output dimension info of BiasAddBackward
+ * @param [in] dyDesc descriptor of input tensor
+ * @param [in|out] n outputTensor [N]CHW
+ * @param [in|out] c outputTensor N[C]HW
+ * @param [in|out] h outputTensor NC[H]W
+ * @param [in|out] w outputTensor NCH[W]
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetBiasAddBackwardOutputDim(const ccTensorDescriptor_t dyDesc, int32_t *n, int32_t *c, int32_t *h,
+                                         int32_t *w);
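+/*
+ * Usage sketch (editorial addition, assumed calling pattern; not taken from
+ * this header): compute the output shape, build outputDesc from it, then
+ * generate the one-hot tensor. Descriptors and buffers are placeholders.
+ *
+ *   int32_t dimCnt = 0;
+ *   int32_t dim[CC_DIM_MAX] = {0};
+ *   (void)ccGetOneHotOutputDim(indicesDesc, depth, axis, &dimCnt, dim, CC_DIM_MAX);
+ *   // ... create outputDesc from (dimCnt, dim) ...
+ *   (void)ccOneHotForward(handle, alpha, indicesDesc, indices, onDesc, on,
+ *                         offDesc, off, depth, axis, beta, outputDesc, output);
+ */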
+/**
+ * @ingroup dnn
+ * @brief biasadd backward.
+ * @param [in] handle cce handle
+ * @param [in] alpha common scale factor
+ * @param [in] dyDesc descriptor of input data
+ * @param [in] dy input data in device memory
+ * @param [in] beta common scale factor
+ * @param [in] dbDesc descriptor of output data
+ * @param [in|out] db output data in device memory
+ * @return ccStatus_t
+ */
+#ifndef DAVINCI_LITE
+ccStatus_t ccBiasAddBackward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t dyDesc, const void *dy,
+                             const void *beta, const ccTensorDescriptor_t dbDesc, void *db);
+
+ccStatus_t ccMaxPoolWithArgmaxForward(ccHandle_t handle, const ccPoolingDescriptor_t poolingDesc, const void *alpha,
+                                      const ccTensorDescriptor_t xDesc, const void *x, const void *beta,
+                                      const ccTensorDescriptor_t yDesc, void *y, const ccTensorDescriptor_t argMaskDesc,
+                                      void *argMask);
+#endif
+
+ccStatus_t ccCreatePoolingMaskDescriptor(ccTensorDescriptor_t *poolingMaskDesc);
+
+ccStatus_t ccDestroyPoolingMaskDescriptor(ccTensorDescriptor_t *poolingMaskDesc);
+
+ccStatus_t ccSetPoolingMaskTensorDescriptor(ccTensorDescriptor_t poolingMaskDesc, ccTensorFormat_t format,
+                                            ccDataType_t dataType, int32_t n, int32_t c, int32_t h, int32_t w,
+                                            int32_t windowH, int32_t windowW);
+
+ccStatus_t ccGetPoolingMaskTensorSizeInBytes(ccTensorDescriptor_t poolingMaskDesc, uint32_t *size);
+
+/**
+ * @ingroup dnn
+ * @brief get the mask output dimension info of maxpooling training forward
+ * @param [in] poolingDesc descriptor of pooling operator
+ * @param [in] xDesc descriptor of input tensor
+ * @param [in|out] n point to batch size
+ * @param [in|out] c point to channels
+ * @param [in|out] h point to height of feature map
+ * @param [in|out] w point to width of feature map
+ * @param [in|out] windowH point to height of window
+ * @param [in|out] windowW point to width of window
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetPoolingMaskDim(const ccPoolingDescriptor_t poolingDesc, const ccTensorDescriptor_t xDesc, int32_t *n,
+                               int32_t *c, int32_t *h, int32_t *w, int32_t *windowH, int32_t *windowW);
+
+#ifndef DAVINCI_LITE
+ccStatus_t ccSoftmaxCrossEntropyLoss(ccHandle_t handle, ccSoftmaxAlgo_t algo, ccSoftmaxMode_t mode,
+                                     ccCrossEntropyMode_t ceMode, const void *alpha, const void *scale,
+                                     const ccTensorDescriptor_t logitsDesc, const void *logits,
+                                     const ccTensorDescriptor_t labelsDesc, const void *labels, const void *labelSmooth,
+                                     const void *beta, const ccTensorDescriptor_t lossDesc, void *loss);
+
+ccStatus_t ccSoftmaxCrossEntropyDx(ccHandle_t handle, ccSoftmaxAlgo_t algo, ccSoftmaxMode_t mode,
+                                   ccCrossEntropyMode_t ceMode, const void *alpha, const void *scale,
+                                   const ccTensorDescriptor_t logitsDesc, const void *logits,
+                                   const ccTensorDescriptor_t labelsDesc, const void *labels, const void *labelSmooth,
+                                   const void *beta, const ccTensorDescriptor_t dxDesc, void *dx);
+
+ccStatus_t ccAvgPoolingBackward(ccHandle_t handle, const ccPoolingDescriptor_t poolingDesc, const void *alpha,
+                                const ccTensorDescriptor_t dyDesc, const void *dy, const void *beta,
+                                const ccTensorDescriptor_t dxDesc, const void *dx);
+
+ccStatus_t ccTrainingAssignOp(ccHandle_t handle, const ccAssignOpMode_t assignOpDesc, const void *alpha,
+                              const void *beta, const ccTensorDescriptor_t aDesc, void *a,
+                              const ccTensorDescriptor_t bDesc, const void *b);
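+/*
+ * Usage sketch (editorial addition, assumed calling pattern; not taken from
+ * this header): the pooling-mask descriptor follows the usual create / set /
+ * query-size / destroy lifecycle. CC_TENSOR_NC1HWC0 and CC_DATA_HALF are
+ * assumed enumerators of ccTensorFormat_t and ccDataType_t; the dims are
+ * placeholders.
+ *
+ *   ccTensorDescriptor_t maskDesc = nullptr;
+ *   (void)ccCreatePoolingMaskDescriptor(&maskDesc);
+ *   (void)ccSetPoolingMaskTensorDescriptor(maskDesc, CC_TENSOR_NC1HWC0, CC_DATA_HALF,
+ *                                          n, c, h, w, windowH, windowW);
+ *   uint32_t maskBytes = 0;
+ *   (void)ccGetPoolingMaskTensorSizeInBytes(maskDesc, &maskBytes);
+ *   (void)ccDestroyPoolingMaskDescriptor(&maskDesc);
+ */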
+/**
+ * @ingroup dnn
+ * @brief momentum optimizer for variable update
+ * @param [in] handle cce handle
+ * @param [in] inputDesc descriptor of input tensors: gradient, accumulation, variable
+ * @param [in] gradient gradient input
+ * @param [in|out] accumulation accumulation input and updated output
+ * @param [in|out] variable variable input and updated output
+ * @param [in] algo indicates whether FP16 output is needed
+ * @param [in] momentum scalar to control accumulation
+ * @param [in] learningRate scalar
+ * @param [in] lossScaleReciprocal scalar
+ * @param [in] workSpace additional memory address
+ * @param [in] workSpaceSizeInBytes additional memory size
+ * @param [out] variableUpdatedFP16Desc descriptor of FP16 output tensor: variableUpdatedFP16
+ * @param [out] variableUpdatedFP16 variableUpdatedFP16
+ * @return ccStatus_t
+ */
+ccStatus_t ccApplyMomentum(ccHandle_t handle, const ccTensorDescriptor_t inputDesc, const void *gradient,
+                           void *accumulation, void *variable, const ccMomentumAlgo_t algo, const void *momentum,
+                           const void *learningRate, const void *lossScaleReciprocal, void *workSpace,
+                           const uint32_t workSpaceSizeInBytes, const ccTensorDescriptor_t variableUpdatedFP16Desc,
+                           void *variableUpdatedFP16);
+
+ccStatus_t ccSsdClassifyLossTrain(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t labelDesc,
+                                  const void *label, const ccTensorDescriptor_t greaterConstDesc,
+                                  const void *greaterConst, const ccTensorDescriptor_t subConstDesc,
+                                  const void *subConst, const ccTensorDescriptor_t sparseDesc, const void *sparse,
+                                  const void *beta, const ccTensorDescriptor_t castoutDesc, const void *castout,
+                                  const ccTensorDescriptor_t muloutDesc, const void *mulout);
+
+#endif
+
+/**
+ * @ingroup dnn
+ * @brief get the workspace size of applymomentum
+ * @param [in] inputDesc descriptor of input tensor
+ * @param [in|out] sizeInBytes point to workspace size
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetApplyMomentumWorkspaceSize(const ccTensorDescriptor_t inputDesc, uint32_t *sizeInBytes);
+#ifndef DAVINCI_LITE
+ccStatus_t ccHwck2FracZ(ccHandle_t handle, const ccFilterDescriptor_t xDesc, const void *x,
+                        const ccFilterDescriptor_t yDesc, void *y);
+
+ccStatus_t ccFracZ2Hwck(ccHandle_t handle, const ccFilterDescriptor_t xDesc, const void *x,
+                        const ccFilterDescriptor_t yDesc, void *y);
+ccStatus_t ccAddNForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const int32_t inputNum,
+                         const void *x[], const void *beta, void *workSpace, uint32_t workSpaceSizeInBytes,
+                         const ccTensorDescriptor_t yDesc, void *y);
+#endif
+ccStatus_t ccGetAddNForwardWorkspaceSize(ccHandle_t handle, const ccTensorDescriptor_t xDesc, const int32_t inputNum,
+                                         const ccTensorDescriptor_t yDesc, uint32_t *sizeInBytes);
+ccStatus_t ccGetAddNForwardOutputDim(const ccTensorDescriptor_t xDesc, int32_t *dimCnt, int32_t *dim, int32_t dimLen);
+ccStatus_t ccAddTrainForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x,
+                             const ccTensorDescriptor_t wDesc, const void *w, const void *beta, void *workSpace,
+                             uint32_t workSpaceSizeInBytes, const ccTensorDescriptor_t yDesc, void *y);
+ccStatus_t ccGetAddTrainForwardWorkspaceSize(ccHandle_t handle, const ccTensorDescriptor_t xDesc,
+                                             const ccTensorDescriptor_t wDesc, const ccTensorDescriptor_t yDesc,
+                                             uint32_t *sizeInBytes);
+ccStatus_t ccGetAddTrainForwardOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t wDesc,
+                                         int32_t *dimCnt, int32_t dim[], int32_t dimLen);
+ccStatus_t ccMulTrainForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x,
+                             const ccTensorDescriptor_t wDesc, const void *w, const void *beta, void *workSpace,
+                             uint32_t workSpaceSizeInBytes, const ccTensorDescriptor_t yDesc, void *y);
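+/*
+ * Usage sketch (editorial addition, assumed calling pattern; not taken from
+ * this header): size the workspace via ccGetApplyMomentumWorkspaceSize,
+ * declared above, then update the variable in place. The scalar types are
+ * assumed to be float; all other names are placeholders.
+ *
+ *   float momentumVal = 0.9f, lr = 0.01f, lossScaleRecip = 1.0f;
+ *   uint32_t wsSize = 0;
+ *   (void)ccGetApplyMomentumWorkspaceSize(inputDesc, &wsSize);
+ *   void *workSpace = devMalloc(wsSize);
+ *   (void)ccApplyMomentum(handle, inputDesc, gradient, accumulation, variable, algo,
+ *                         &momentumVal, &lr, &lossScaleRecip, workSpace, wsSize,
+ *                         fp16Desc, fp16Out);
+ */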
+ccStatus_t ccGetMulTrainForwardWorkspaceSize(ccHandle_t handle, const ccTensorDescriptor_t xDesc,
+                                             const ccTensorDescriptor_t wDesc, const ccTensorDescriptor_t yDesc,
+                                             uint32_t *sizeInBytes);
+ccStatus_t ccGetMulTrainForwardOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t wDesc,
+                                         int32_t *dimCnt, int32_t dim[], int32_t dimLen);
+
+/**
+ * @ingroup dnn
+ * @brief get workspace size
+ * @param [in] xDesc descriptor of input tensor
+ * @param [in|out] sizeInBytes workspace size
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetRandomShuffleWorkspaceSize(const ccTensorDescriptor_t xDesc, uint32_t *sizeInBytes);
+
+/**
+ * @ingroup dnn
+ * @brief random shuffle forward computation
+ * @param [in] handle cce handle
+ * @param [in] alpha common scale factor
+ * @param [in] xDesc descriptor of input data
+ * @param [in] x input data in device memory
+ * @param [in] workspace temporary space
+ * @param [in] workspaceSizeInBytes temporary space size
+ * @param [in] seed1 random seed used to generate random number
+ * @param [in] seed2 random seed used to generate random number
+ * @param [in] beta common scale factor
+ * @param [in] outputDesc descriptor of output data
+ * @param [in|out] output output data in device memory
+ * @return ccStatus_t
+ */
+ccStatus_t ccRandomShuffleForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x,
+                                  void *workspace, const uint32_t workspaceSizeInBytes, const int64_t seed1,
+                                  const int64_t seed2, const void *beta, const ccTensorDescriptor_t outputDesc,
+                                  void *output);
+/**
+ * @ingroup dnn
+ * @brief sin forward:
+ *        data types supported: float, float16, double
+ *        data format supported: ND
+ * @param [in] handle cce handle
+ * @param [in] alpha common scale factor
+ * @param [in] xDesc descriptor of input data
+ * @param [in] input input data in device memory
+ * @param [in] beta common scale factor
+ * @param [in] outputDesc descriptor of output data
+ * @param [in|out] output output data in device memory
+ * @return ccStatus_t
+ */
+ccStatus_t ccSinForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *input,
+                        const void *beta, const ccTensorDescriptor_t outputDesc, void *output);
+
+/**
+ * @ingroup dnn
+ * @brief cos forward:
+ *        data types supported: float, float16, double
+ *        data format supported: ND
+ * @param [in] handle cce handle
+ * @param [in] alpha common scale factor
+ * @param [in] xDesc descriptor of input data
+ * @param [in] input input data in device memory
+ * @param [in] beta common scale factor
+ * @param [in] outputDesc descriptor of output data
+ * @param [in|out] output output data in device memory
+ * @return ccStatus_t
+ */
+ccStatus_t ccCosForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *input,
+                        const void *beta, const ccTensorDescriptor_t outputDesc, void *output);
+
+/**
+ * @ingroup dnn
+ * @brief tan forward:
+ *        data types supported: float, float16, double
+ *        data format supported: ND
+ * @param [in] handle cce handle
+ * @param [in] alpha common scale factor
+ * @param [in] xDesc descriptor of input data
+ * @param [in] input input data in device memory
+ * @param [in] beta common scale factor
+ * @param [in] outputDesc descriptor of output data
+ * @param [in|out] output output data in device memory
+ * @return ccStatus_t
+ */
+ccStatus_t ccTanForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *input,
+                        const void *beta, const ccTensorDescriptor_t outputDesc, void *output);
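+/*
+ * Usage sketch (editorial addition; an assumption, not stated in this
+ * header): the trigonometric forwards appear to share the cuDNN-style
+ * scaling convention output = alpha * op(input) + beta * output, so a unit
+ * alpha and zero beta would leave the raw result untouched. The handle and
+ * descriptors are placeholders.
+ *
+ *   float alphaF = 1.0f, betaF = 0.0f;
+ *   (void)ccSinForward(handle, &alphaF, xDesc, x, &betaF, yDesc, y);
+ *   (void)ccCosForward(handle, &alphaF, xDesc, x, &betaF, yDesc, y);
+ */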
+/**
+ * @ingroup dnn
+ * @brief get the output dimension info of unstack
+ * @param [in] xDesc descriptor of input tensor
+ * @param [in] axis the axis to unstack along
+ * @param [in|out] dimCnt point to the output dimCnt
+ * @param [in|out] dim arrays to save dims
+ * @param [in] dimLen length of dim
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetUnstackOutputDim(const ccTensorDescriptor_t xDesc, int32_t axis, int32_t *dimCnt, int32_t dim[],
+                                 int32_t dimLen);
+
+/**
+ * @ingroup dnn
+ * @brief unstack forward.
+ * @param [in] handle cce handle
+ * @param [in] alpha common scale factor
+ * @param [in] xDesc descriptor of input data
+ * @param [in] x input data in device memory
+ * @param [in] num the length of the dimension axis
+ * @param [in] axis the axis to unstack along
+ * @param [in] beta common scale factor
+ * @param [in] outputDesc descriptor of output data
+ * @param [in|out] output output data in device memory
+ * @return ccStatus_t
+ */
+ccStatus_t ccUnstackForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x,
+                            int32_t num, int32_t axis, const void *beta, const ccTensorDescriptor_t outputDesc,
+                            void *output[]);
+
+ccStatus_t ccResizeNearestNeighborCpuForward(ccHandle_t handle, const ccResizeNearestNeighborDescriptor_t resizeDesc,
+                                             const void *alpha, const ccTensorDescriptor_t xDesc, const void *x,
+                                             const void *beta, const ccTensorDescriptor_t outputDesc, void *output);
+/**
+ * @ingroup dnn
+ * @brief get the output dimension info of resize nearest neighbor
+ * @param [in] resizeDesc descriptor of resize
+ * @param [in] xDesc descriptor of input tensor
+ * @param [in|out] dimCnt point to the output dimCnt
+ * @param [in|out] dim arrays to save dims
+ * @param [in] dimLen length of dim
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetResizeNearestNeighborOutputDim(const ccResizeNearestNeighborDescriptor_t resizeDesc,
+                                               const ccTensorDescriptor_t xDesc, int32_t *dimCnt, int32_t dim[],
+                                               int32_t dimLen);
+
+/**
+ * @ingroup dnn
+ * @brief create descriptor of ResizeNearestNeighbor
+ * @param [in|out] resizeDesc point to descriptor of ResizeNearestNeighbor attr
+ * @return ccStatus_t
+ */
+ccStatus_t ccCreateResizeNearestNeighborDescriptor(ccResizeNearestNeighborDescriptor_t *resizeDesc);
+
+/**
+ * @ingroup dnn
+ * @brief destroy descriptor of ResizeNearestNeighbor
+ * @param [in|out] resizeDesc point to descriptor of ResizeNearestNeighbor attr
+ * @return ccStatus_t
+ */
+ccStatus_t ccDestroyResizeNearestNeighborDescriptor(ccResizeNearestNeighborDescriptor_t *resizeDesc);
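+/*
+ * Usage sketch (editorial addition, assumed calling pattern; not taken from
+ * this header): create the resize descriptor, set it with the setter declared
+ * below, derive the output shape, run the CPU forward, then destroy the
+ * descriptor. outH, outW and the tensor names are placeholders.
+ *
+ *   ccResizeNearestNeighborDescriptor_t resizeDesc = nullptr;
+ *   (void)ccCreateResizeNearestNeighborDescriptor(&resizeDesc);
+ *   (void)ccSetResizeNearestNeighborDescriptor(resizeDesc, false, outH, outW);
+ *   int32_t dimCnt = 0, dim[CC_DIM_MAX] = {0};
+ *   (void)ccGetResizeNearestNeighborOutputDim(resizeDesc, xDesc, &dimCnt, dim, CC_DIM_MAX);
+ *   (void)ccResizeNearestNeighborCpuForward(handle, resizeDesc, alpha, xDesc, x,
+ *                                           beta, outputDesc, output);
+ *   (void)ccDestroyResizeNearestNeighborDescriptor(&resizeDesc);
+ */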
+/**
+ * @ingroup dnn
+ * @brief set descriptor of ResizeNearestNeighbor.
+ * @param [in|out] resizeDesc descriptor of resize nearest neighbor operator
+ * @param [in] alignCorners whether the centers of input and output are aligned
+ * @param [in] height height of output
+ * @param [in] width width of output
+ * @return ccStatus_t
+ */
+ccStatus_t ccSetResizeNearestNeighborDescriptor(ccResizeNearestNeighborDescriptor_t resizeDesc, bool alignCorners,
+                                                int32_t height, int32_t width);
+
+/**
+ * @ingroup dnn
+ * @brief get the output dimension info of pad
+ * @param [in] xDesc descriptor of input tensor x
+ * @param [in] padDesc descriptor of input paddings
+ * @param [in|out] dimCnt point to the output dimCnt
+ * @param [in|out] dim arrays to save dims
+ * @param [in] dimLen length of dim
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetPadV2OutputDim(const ccTensorDescriptor_t xDesc, const ccPadV2Descriptor_t padDesc, int32_t *dimCnt,
+                               int32_t dim[], int32_t dimLen);
+
+ccStatus_t ccPadV2CpuForward(ccHandle_t handle, const ccPadV2Descriptor_t padDesc, const void *alpha,
+                             const ccTensorDescriptor_t xDesc, const void *x, const void *beta,
+                             const ccTensorDescriptor_t outputDesc, void *output);
+
+/**
+ * @ingroup dnn
+ * @brief create descriptor of parameters for padv2 function
+ * @param [in|out] padDesc point to descriptor of parameters for padv2 function
+ * @return ccStatus_t
+ */
+ccStatus_t ccCreatePadV2Descriptor(ccPadV2Descriptor_t *padDesc);
+
+/**
+ * @ingroup dnn
+ * @brief destroy descriptor of parameters for padv2 function
+ * @param [in|out] padDesc point to descriptor of parameters for padv2 function
+ * @return ccStatus_t
+ */
+ccStatus_t ccDestroyPadV2Descriptor(ccPadV2Descriptor_t *padDesc);
+
+/**
+ * @brief init descriptor for parameter of padv2 function
+ * @param [in|out] padDesc descriptor of pad
+ * @param [in] padShapeCnt padshape count
+ * @param [in] padShapeLow padshape low
+ * @param [in] padShapeHigh padshape high
+ * @param [in] padMode pad mode
+ * @param [in] padValue pad value ptr
+ * @param [in] padValueType pad value data type
+ * @return ccStatus_t
+ */
+ccStatus_t ccSetPadV2Descriptor(ccPadV2Descriptor_t padDesc, const int32_t padShapeCnt, const int32_t padShapeLow[],
+                                const int32_t padShapeHigh[], const ccPadMode_t padMode, const void *padValue,
+                                const ccDataType_t padValueType);
+/**
+ * @ingroup dnn
+ * @brief create descriptor of batchToSpace
+ * @param [in|out] batchToSpaceDesc point to descriptor of batchToSpace
+ * @return ccStatus_t
+ */
+ccStatus_t ccCreateBatchToSpaceDescriptor(ccBatchToSpaceDescriptor_t *batchToSpaceDesc);
+
+/**
+ * @ingroup dnn
+ * @brief set batchToSpace descriptor
+ * @param [in|out] paramsDesc descriptor of batchToSpace
+ * @param [in] blockShape blockShape of batchToSpace
+ * @param [in] crops crops of batchToSpace
+ * @param [in] blockShapeLength blockShapeLength of batchToSpace
+ * @return ccStatus_t
+ */
+ccStatus_t ccSetBatchToSpaceDescriptor(ccBatchToSpaceDescriptor_t paramsDesc, const int32_t *blockShape,
+                                       const int32_t *crops, const int32_t blockShapeLength);
+
+/**
+ * @ingroup dnn
+ * @brief get batchToSpace descriptor
+ * @param [in] paramsDesc descriptor of batchToSpace
+ * @param [out] blockShape blockShape of batchToSpace
+ * @param [out] crops crops of batchToSpace
+ * @param [out] blockShapeLength blockShapeLength of batchToSpace
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetBatchToSpaceDescriptor(const ccBatchToSpaceDescriptor_t paramsDesc, int32_t *blockShape, int32_t *crops,
+                                       int32_t *blockShapeLength);
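+/*
+ * Usage sketch (editorial addition, assumed calling pattern; not taken from
+ * this header): the padv2 descriptor lifecycle mirrors the other descriptors
+ * in this header. padValue points at a host scalar matching padValueType;
+ * CC_DATA_FLOAT is an assumed ccDataType_t enumerator, and the shape arrays
+ * and padMode are placeholders.
+ *
+ *   ccPadV2Descriptor_t padDesc = nullptr;
+ *   (void)ccCreatePadV2Descriptor(&padDesc);
+ *   float padValue = 0.0f;
+ *   (void)ccSetPadV2Descriptor(padDesc, padShapeCnt, padShapeLow, padShapeHigh,
+ *                              padMode, &padValue, CC_DATA_FLOAT);
+ *   int32_t dimCnt = 0, dim[CC_DIM_MAX] = {0};
+ *   (void)ccGetPadV2OutputDim(xDesc, padDesc, &dimCnt, dim, CC_DIM_MAX);
+ *   (void)ccPadV2CpuForward(handle, padDesc, alpha, xDesc, x, beta, outputDesc, output);
+ *   (void)ccDestroyPadV2Descriptor(&padDesc);
+ */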
+/**
+ * @ingroup dnn
+ * @brief destroy descriptor of batchToSpace
+ * @param [in] *batchToSpaceDesc descriptor of batchToSpace
+ * @return ccStatus_t
+ */
+ccStatus_t ccDestroyBatchToSpaceDescriptor(ccBatchToSpaceDescriptor_t *batchToSpaceDesc);
+
+/**
+ * @ingroup dnn
+ * @brief get the output dimension info of batch to space
+ * @param [in] xDesc descriptor of input tensor
+ * @param [in|out] dimCnt point to the output dimCnt
+ * @param [in|out] dim arrays to save dims
+ * @param [in] dimLen length of dim
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetBatchToSpaceOutputDim(const ccTensorDescriptor_t xDesc,
+                                      const ccBatchToSpaceDescriptor_t batchToSpaceDesc, int32_t *dimCnt, int32_t dim[],
+                                      int32_t dimLen);
+
+/**
+ * @ingroup dnn
+ * @brief batch to space forward computation
+ * @param [in] handle cce handle
+ * @param [in] paramsDesc descriptor of input params
+ * @param [in] alpha scaling factors
+ * @param [in] xDesc descriptor of input tensor
+ * @param [in] x input data in device memory
+ * @param [in] beta bias factors
+ * @param [in] outputDesc descriptor of output tensor
+ * @param [in|out] output output data in device memory
+ * @return ccStatus_t
+ */
+ccStatus_t ccBatchToSpaceForward(ccHandle_t handle, const ccBatchToSpaceDescriptor_t paramsDesc, const void *alpha,
+                                 const ccTensorDescriptor_t xDesc, const void *x, const void *beta,
+                                 const ccTensorDescriptor_t outputDesc, void *output);
+
+/**
+ * @ingroup dnn
+ * @brief create descriptor of spaceToBatch
+ * @param [in|out] spaceToBatchDesc point to descriptor of spaceToBatch
+ * @return ccStatus_t
+ */
+ccStatus_t ccCreateSpaceToBatchDescriptor(ccSpaceToBatchDescriptor_t *spaceToBatchDesc);
+
+/**
+ * @ingroup dnn
+ * @brief set spaceToBatch descriptor
+ * @param [in|out] paramsDesc descriptor of spaceToBatch
+ * @param [in] blockShape blockShape of spaceToBatch
+ * @param [in] paddings paddings of spaceToBatch
+ * @param [in] blockShapeLength blockShapeLength of spaceToBatch
+ * @return ccStatus_t
+ */
+ccStatus_t ccSetSpaceToBatchDescriptor(ccSpaceToBatchDescriptor_t paramsDesc, const int32_t *blockShape,
+                                       const int32_t *paddings, const int32_t blockShapeLength);
+
+/**
+ * @ingroup dnn
+ * @brief get spaceToBatch descriptor
+ * @param [in] paramsDesc descriptor of spaceToBatch
+ * @param [out] blockShape blockShape of spaceToBatch
+ * @param [out] paddings paddings of spaceToBatch
+ * @param [out] blockShapeLength blockShapeLength of spaceToBatch
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetSpaceToBatchDescriptor(const ccSpaceToBatchDescriptor_t paramsDesc, int32_t *blockShape,
+                                       int32_t *paddings, int32_t *blockShapeLength);
+
+/**
+ * @ingroup dnn
+ * @brief destroy descriptor of spaceToBatch
+ * @param [in] *spaceToBatchDesc descriptor of spaceToBatch
+ * @return ccStatus_t
+ */
+ccStatus_t ccDestroySpaceToBatchDescriptor(ccSpaceToBatchDescriptor_t *spaceToBatchDesc);
+
+/**
+ * @ingroup dnn
+ * @brief get the output dimension info of space to batch
+ * @param [in] xDesc descriptor of input tensor
+ * @param [in|out] dimCnt point to the output dimCnt
+ * @param [in|out] dim arrays to save dims
+ * @param [in] dimLen length of dim
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetSpaceToBatchOutputDim(const ccTensorDescriptor_t xDesc,
+                                      const ccSpaceToBatchDescriptor_t spaceToBatchDesc, int32_t *dimCnt, int32_t dim[],
+                                      int32_t dimLen);
+/**
+ * @ingroup dnn
+ * @brief space to batch forward computation
+ * @param [in] handle cce handle
+ * @param [in] paramsDesc descriptor of input params
+ * @param [in] alpha scaling factors
+ * @param [in] xDesc descriptor of input tensor
+ * @param [in] x input data in device memory
+ * @param [in] beta bias factors
+ * @param [in] outputDesc descriptor of output tensor
+ * @param [in|out] output output data in device memory
+ * @return ccStatus_t
+ */
+ccStatus_t ccSpaceToBatchForward(ccHandle_t handle, const ccSpaceToBatchDescriptor_t paramsDesc, const void *alpha,
+                                 const ccTensorDescriptor_t xDesc, const void *x, const void *beta,
+                                 const ccTensorDescriptor_t outputDesc, void *output);
+
+ccStatus_t ccTransFilterDesc2TensorDesc(ccFilterDescriptor_t wDesc, ccTensorDescriptor_t tensorDesc);
+
+/**
+ * @ingroup dnn
+ * @brief get the output dimension info of extractImagePatches
+ * @param [in] xDesc descriptor of input tensor x
+ * @param [in] ksizes ksizes array
+ * @param [in] strides strides array
+ * @param [in] rates rates array
+ * @param [in] padding padding type
+ * @param [in|out] dimCnt point to the output dimCnt
+ * @param [in|out] dim arrays to save dims
+ * @param [in] dimLen length of dim
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetExtractImagePatchesOutputDim(const ccTensorDescriptor_t xDesc, const ccIntArray_t *ksizes,
+                                             const ccIntArray_t *strides, const ccIntArray_t *rates,
+                                             const ccExtractImagePatchesPadType_t padding, int32_t *dimCnt,
+                                             int32_t dim[], const int32_t dimLen);
+
+/**
+ * @ingroup dnn
+ * @brief cum forward.
+ * @param [in] handle cce handle
+ * @param [in] alpha common scale factor
+ * @param [in] xDesc descriptor of input data, dimCnt:1~8
+ * @param [in] x input data in device memory
+ * @param [in] axisDesc descriptor of the axis scalar, dimCnt:0
+ * @param [in] axis which axis to cum calc, device memory
+ * @param [in] beta common scale factor
+ * @param [in] opType calc type, e.g. sum, prod
+ * @param [in] exclusive cum flag, true or false
+ * @param [in] reverse cum flag, true or false
+ * @param [in] outputDesc descriptor of output data
+ * @param [in|out] output output data in device memory
+ * @return ccStatus_t
+ */
+ccStatus_t ccCumForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x,
+                        const ccTensorDescriptor_t axisDesc, const void *axis, const void *beta, const CumOpType opType,
+                        const bool exclusive, const bool reverse, const ccTensorDescriptor_t outputDesc, void *output);
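+/*
+ * Usage sketch (editorial addition, assumed calling pattern; not taken from
+ * this header): an inclusive, non-reversed cumulative sum along the axis held
+ * in device memory. CC_CUM_OP_SUM is an assumed CumOpType enumerator; the
+ * handle, descriptors and buffers are placeholders.
+ *
+ *   (void)ccCumForward(handle, alpha, xDesc, x, axisDesc, axis, beta,
+ *                      CC_CUM_OP_SUM, false, false, outputDesc, output);
+ */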
+/**
+ * @ingroup dnn
+ * @brief ExtractImagePatches forward.
+ * @param [in] handle cce handle
+ * @param [in] ksizes ksizes array
+ * @param [in] strides strides array
+ * @param [in] rates rates array
+ * @param [in] padding padding type
+ * @param [in] alpha common scale factor
+ * @param [in] xDesc descriptor of input data x
+ * @param [in] x input data x in device memory
+ * @param [in] beta common scale factor
+ * @param [in] outputDesc descriptor of output data
+ * @param [in|out] output output data in device memory
+ * @return ccStatus_t
+ */
+ccStatus_t ccExtractImagePatchesForward(ccHandle_t handle, const ccIntArray_t *ksizes, const ccIntArray_t *strides,
+                                        const ccIntArray_t *rates, const ccExtractImagePatchesPadType_t padding,
+                                        const void *alpha, const ccTensorDescriptor_t xDesc, const void *x,
+                                        const void *beta, const ccTensorDescriptor_t outputDesc, void *output);
+
+/**
+ * @brief get argmax output dim info
+ * @param [in] argDesc argmaxmin descriptor
+ * @param [in] xDesc descriptor of input tensor
+ * @param [in|out] dimCnt output dim count
+ * @param [in|out] dim output dim
+ * @param [in] dimLen length of dim
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetArgMaxOutputDim(const ccArgmaxminDescriptor_t argDesc, const ccTensorDescriptor_t xDesc,
+                                int32_t *dimCnt, int32_t dim[], int32_t dimLen);
+
+/**
+ * @ingroup dnn
+ * @brief argmax forward computation
+ * @param [in] handle cce handle
+ * @param [in] argDesc argmaxmin descriptor
+ * @param [in] alpha scaling factors
+ * @param [in] xDesc descriptor of input tensor
+ * @param [in] x input data in device memory
+ * @param [in] workSpace workspace pointer
+ * @param [in] workSpaceSizeInBytes workspace size in bytes
+ * @param [in] beta bias factors
+ * @param [in] outputDesc descriptor of output tensor
+ * @param [in|out] output output data in device memory
+ * @return ccStatus_t
+ */
+ccStatus_t ccArgMaxForward(ccHandle_t handle, const ccArgmaxminDescriptor_t argDesc, const void *alpha,
+                           const ccTensorDescriptor_t xDesc, const void *x, void *workSpace,
+                           const uint32_t workSpaceSizeInBytes, const void *beta, const ccTensorDescriptor_t outputDesc,
+                           void *output);
+
+/**
+ * @ingroup dnn
+ * @brief get the workspace size of argmax
+ * @param [in] argDesc descriptor of tagCcArgmaxmin
+ * @param [in] xDesc descriptor of input tensor
+ * @param [in|out] sizeInBytes workspace size
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetArgMaxWorkspaceSize(const ccArgmaxminDescriptor_t argDesc, const ccTensorDescriptor_t xDesc,
+                                    uint32_t *sizeInBytes);
+
+/**
+ * @ingroup dnn
+ * @brief create descriptor of Argmaxmin
+ * @param [in|out] argDesc point to descriptor of Argmaxmin attr
+ * @return ccStatus_t
+ */
+ccStatus_t ccCreateArgmaxminDescriptor(ccArgmaxminDescriptor_t *argDesc);
+
+/**
+ * @ingroup dnn
+ * @brief destroy descriptor of Argmaxmin
+ * @param [in|out] argDesc point to descriptor of Argmaxmin attr
+ * @return ccStatus_t
+ */
+ccStatus_t ccDestroyArgmaxminDescriptor(ccArgmaxminDescriptor_t *argDesc);
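+/*
+ * Usage sketch (editorial addition, assumed calling pattern; not taken from
+ * this header): create and fill the argmax/argmin descriptor with the setter
+ * declared below (its trailing parameters are defaulted), size the workspace,
+ * run the forward, then clean up. axisType, axis and the buffers are
+ * placeholders.
+ *
+ *   ccArgmaxminDescriptor_t argDesc = nullptr;
+ *   (void)ccCreateArgmaxminDescriptor(&argDesc);
+ *   (void)ccSetArgmaxminDescriptor(argDesc, axisType, false, 1, axis, false);
+ *   uint32_t wsSize = 0;
+ *   (void)ccGetArgMaxWorkspaceSize(argDesc, xDesc, &wsSize);
+ *   void *workSpace = devMalloc(wsSize);
+ *   (void)ccArgMaxForward(handle, argDesc, alpha, xDesc, x, workSpace, wsSize,
+ *                         beta, outputDesc, output);
+ *   (void)ccDestroyArgmaxminDescriptor(&argDesc);
+ */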
+/**
+ * @ingroup dnn
+ * @brief set descriptor of Argmaxmin
+ * @param [in|out] argDesc descriptor of tagCcArgmaxmin
+ * @param [in] axisType axis type
+ * @param [in] outMaxVal whether to return the maximum value
+ * @param [in] topK number that returns the maximum index or maximum value
+ * @param [in] axis Describes which axis of the input Tensor to reduce across
+ * @param [in] keepDims whether to keep reduced dim
+ * @param [in] reduceSize the num of elements to be reduced to get topK elements, reduceSize=-1 means the total num
+ *             of elements in axis dimension
+ * @param [in] reduceDStride the stride for reduce operation, reduceDStride=1 means the layout of target data is
+ *             continuous
+ * @return ccStatus_t
+ */
+ccStatus_t ccSetArgmaxminDescriptor(ccArgmaxminDescriptor_t argDesc, int32_t axisType, bool outMaxVal, int64_t topK,
+                                    int64_t axis, bool keepDims, int64_t reduceSize = -1, int64_t reduceDStride = 1);
+
+ccStatus_t ccArgMinForward(ccHandle_t handle, const ccArgmaxminDescriptor_t argDesc, const void *alpha,
+                           const ccTensorDescriptor_t xDesc, const void *x, const void *beta,
+                           const ccTensorDescriptor_t outputDesc, void *output);
+
+ccStatus_t ccGetArgMinOutputDim(const ccArgmaxminDescriptor_t argDesc, const ccTensorDescriptor_t xDesc,
+                                int32_t *dimCnt, int32_t dim[], const int32_t dimLen);
+/**
+ * @ingroup dnn
+ * @brief lsh projection forward computation
+ * @param [in] handle cce handle
+ * @param [in] alpha scaling factors
+ * @param [in] hashDesc descriptor of input tensor hashDesc
+ * @param [in] hash input data hash in device memory
+ * @param [in] weightDesc descriptor of input tensor weightDesc
+ * @param [in] weight input data weight in device memory
+ * @param [in] inputDesc descriptor of input tensor inputDesc
+ * @param [in] input input data in device memory
+ * @param [in] type 1: SPARSE, 2: DENSE
+ * @param [in] beta bias factors
+ * @param [in] workSpace workSpace data in device memory
+ * @param [in] workSpaceSizeInBytes workSpace length
+ * @param [in] outputDesc descriptor of output tensor
+ * @param [in|out] output output data in device memory
+ * @return ccStatus_t
+ */
+ccStatus_t ccLshProjectionForward(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t hashDesc,
+                                  const void *hash, const ccTensorDescriptor_t weightDesc, const void *weight,
+                                  const ccTensorDescriptor_t inputDesc, const void *input, const LSHProjectionType type,
+                                  const void *beta, void *workSpace, const uint32_t workSpaceSizeInBytes,
+                                  const ccTensorDescriptor_t outputDesc, void *output);
+/**
+ * @ingroup dnn
+ * @brief get the workspace size of lsh projection
+ * @param [in] inputDesc descriptor of input tensor input
+ * @param [in] hashDataType data type of hash
+ * @param [in|out] sizeInBytes workspace size
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetLshProjectionForwardWorkspaceSize(const ccTensorDescriptor_t inputDesc, const ccDataType_t hashDataType,
+                                                  uint32_t *sizeInBytes);
+/**
+ * @ingroup dnn
+ * @brief get the output dimension info of LshProjection
+ * @param [in] hashDesc descriptor of hash
+ * @param [in] type type of mode
+ * @param [in|out] dimCnt point to the output dimCnt
+ * @param [in|out] dim arrays to save dims
+ * @param [in] dimLen dim length
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetLshProjectionOutputDim(const ccTensorDescriptor_t hashDesc, const LSHProjectionType type,
+                                       int32_t *dimCnt, int32_t dim[], const int32_t dimLen);
+/**
+ * @ingroup dnn
+ * @brief get the weight dimension info of LshProjection
+ * @param [in] inputDesc descriptor of input
+ * @param [in|out] dimCnt point to the weight dimCnt
+ * @param [in|out] dim arrays to save dims
+ * @param [in] dimLen dim length
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetLshProjectionWeightDim(const ccTensorDescriptor_t inputDesc, int32_t *dimCnt, int32_t dim[],
+                                       const int32_t dimLen);
+/**
+ * @ingroup dnn
+ * @brief upsample forward computation
+ * @param [in] handle cce handle
+ * @param [in] upsamplePara input para in host memory
+ * @param [in] alpha common scale factor
+ * @param [in] bottomDesc descriptor of input data bottom
+ * @param [in] bottom input data bottom in device memory
+ * @param [in] bottomMaskDesc descriptor of input data bottomMask
+ * @param [in] bottomMask input data bottomMask in device memory
+ * @param [in] beta common scale factor
+ * @param [in] outputDesc descriptor of output data
+ * @param [in|out] output output data in device memory
+ * @return ccStatus_t
+ */
+ccStatus_t ccUpsampleForward(ccHandle_t handle, const ccUpsampleParaDescriptor_t upsamplePara, const void *alpha,
+                             const ccTensorDescriptor_t bottomDesc, const void *bottom,
+                             const ccTensorDescriptor_t bottomMaskDesc, const void *bottomMask, const void *beta,
+                             const ccTensorDescriptor_t outputDesc, void *output);
+
+/**
+ * @brief create descriptor for parameter of upsample function
+ * @param [in|out] upsampleDesc descriptor of upsamplepara
+ * @return ccStatus_t
+ */
+ccStatus_t ccCreateUpsampleDescriptor(ccUpsampleParaDescriptor_t *upsampleDesc);
+
+/**
+ * @brief destroy descriptor for parameter of upsample function
+ * @param [in|out] upsampleDesc descriptor of upsamplepara
+ * @return ccStatus_t
+ */
+ccStatus_t ccDestroyUpsampleDescriptor(ccUpsampleParaDescriptor_t *upsampleDesc);
+
+/**
+ * @brief set descriptor for parameter of upsample function
+ * @param [in|out] upsampleDesc descriptor of upsamplepara
+ * @param [in] scale the scale of height and width
+ * @param [in] scaleHeight the scale of height
+ * @param [in] scaleWidth the scale of width
+ * @param [in] upsampleHeight the height of output
+ * @param [in] upsampleWidth the width of output
+ * @param [in] padOutHeight whether to pad the output height
+ * @param [in] padOutWidth whether to pad the output width
+ * @return ccStatus_t
+ */
+ccStatus_t ccSetUpsampleDescriptor(ccUpsampleParaDescriptor_t upsampleDesc, const int32_t scale,
+                                   const int32_t scaleHeight, const int32_t scaleWidth, const int32_t upsampleHeight,
+                                   const int32_t upsampleWidth, const bool padOutHeight, const bool padOutWidth);
+/**
+ * @ingroup dnn
+ * @brief get the output dimension info of upsample
+ * @param [in] upsamplePara para of upsample
+ * @param [in] bottomDesc descriptor of input bottom tensor
+ * @param [in|out] dimCnt point to the output dimCnt
+ * @param [in|out] dim arrays to save dims
+ * @param [in] dimLen the len of dim array
+ * @return ccStatus_t
+ */
+ccStatus_t ccGetUpsampleOutputDim(const ccUpsampleParaDescriptor_t upsamplePara, const ccTensorDescriptor_t bottomDesc,
+                                  int32_t *dimCnt, int32_t dim[], const int32_t dimLen);
+
+#ifndef DAVINCI_LITE
+ccStatus_t ccMatmul(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x,
+                    const ccTensorDescriptor_t wDesc, const void *w, const ccTensorDescriptor_t biasDesc,
+                    const void *bias, const ccFullConnectFwdAlgo_t algo, void *workSpace,
+                    const uint32_t workSpaceSizeInBytes, const void *beta, const ccTensorDescriptor_t yDesc, void *y,
+                    const bool transposeA, const bool transposeB);
+ccStatus_t ccGetMatmulOutputDim(const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t wDesc, int32_t *n,
+                                int32_t *c, int32_t *h, int32_t *w, bool transposeA, bool transposeB);
+ccStatus_t ccGetMatmulWorkspaceSize(ccHandle_t handle, const ccFullConnectFwdAlgo_t algo,
+                                    const ccTensorDescriptor_t xDesc, const ccTensorDescriptor_t wDesc,
+                                    const ccTensorDescriptor_t yDesc, uint32_t *sizeInBytes, bool transposeA,
+                                    bool transposeB);
+#endif
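+/*
+ * Usage sketch (editorial addition, assumed calling pattern; not taken from
+ * this header): derive the output NCHW shape, size the workspace for the
+ * chosen algorithm, then run the matmul. algo, the descriptors and the
+ * buffers (wData for the weights) are placeholders.
+ *
+ *   int32_t n = 0, c = 0, h = 0, w = 0;
+ *   (void)ccGetMatmulOutputDim(xDesc, wDesc, &n, &c, &h, &w, false, true);
+ *   uint32_t wsSize = 0;
+ *   (void)ccGetMatmulWorkspaceSize(handle, algo, xDesc, wDesc, yDesc, &wsSize, false, true);
+ *   void *workSpace = devMalloc(wsSize);
+ *   (void)ccMatmul(handle, alpha, xDesc, x, wDesc, wData, biasDesc, bias, algo,
+ *                  workSpace, wsSize, beta, yDesc, y, false, true);
+ */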
+/**
+ * @ingroup dnn
+ * @brief gather_v2 function
+ * @param [in] handle cce handle
+ * @param [in] alpha common scale factor
+ * @param [in] paramsDesc descriptor
+ * @param [in] params device memory
+ * @param [in] indicesDesc descriptor
+ * @param [in] indices device memory
+ * @param [in] axisDesc descriptor
+ * @param [in] axis device memory
+ * @param [in] beta common scale factor
+ * @param [in] outputDesc descriptor
+ * @param [in|out] output device memory
+ * @return ccStatus_t
+ */
+ccStatus_t ccGatherV2(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t paramsDesc, const void *params,
+                      const ccTensorDescriptor_t indicesDesc, const void *indices, const ccTensorDescriptor_t axisDesc,
+                      const void *axis, const void *beta, const ccTensorDescriptor_t outputDesc, const void *output);
+
+/**
+ * @ingroup dnn
+ * @brief memory_clear function
+ * @param [in] handle cce handle
+ * @param [in] addrSpaceSizeInBytes addr space size
+ * @param [in|out] addr device memory
+ * @return ccStatus_t
+ */
+ccStatus_t ccMemoryClear(ccHandle_t handle, const uint64_t addrSpaceSizeInBytes, const void *addr);
+
+/**
+ * @ingroup dnn
+ * @brief check whether the input contains non-finite values (overflow check)
+ * @param [in] handle cce handle
+ * @param [in] alpha scaling factors
+ * @param [in] xDesc descriptor of input tensor
+ * @param [in] x input data in device memory
+ * @param [in] yDesc descriptor of output tensor
+ * @param [in|out] y output data in device memory
+ * @param [in] beta scaling factors
+ * @return ccStatus_t
+ */
+ccStatus_t ccIsFinite(ccHandle_t handle, const void *alpha, const ccTensorDescriptor_t xDesc, const void *x,
+                      const ccTensorDescriptor_t yDesc, const void *y, const void *beta);
+}  // namespace cce
+
+#endif  // DNN_OP_H__
diff --git a/third_party/fwkacllib/inc/cce/dnn_struct.hpp b/third_party/fwkacllib/inc/cce/dnn_struct.hpp
new file mode 100644
index 00000000..96566074
--- /dev/null
+++ b/third_party/fwkacllib/inc/cce/dnn_struct.hpp
@@ -0,0 +1,23 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DNN_STRUCT_HPP__
+#define DNN_STRUCT_HPP__
+
+#include "dnn.h"
+#include "dnn_struct_base.hpp"
+
+#endif  // DNN_STRUCT_HPP__
diff --git a/third_party/fwkacllib/inc/cce/dnn_struct_base.hpp b/third_party/fwkacllib/inc/cce/dnn_struct_base.hpp
new file mode 100644
index 00000000..dd75e9ea
--- /dev/null
+++ b/third_party/fwkacllib/inc/cce/dnn_struct_base.hpp
@@ -0,0 +1,894 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DNN_STRUCT_BASE_HPP__
+#define DNN_STRUCT_BASE_HPP__
+
+#include "cce/cce_def.hpp"
+
+namespace cce {
+
+/**
+ * @ingroup dnn
+ * @brief max number of dimensions
+ */
+#define CC_DIM_MAX (8)
+
+/**
+ * @ingroup dnn
+ * @brief max number of dimensions when use NC1HWC0 format
+ */
+#define CC_REALDIM_MAX (4)
+
+/**
+ * @ingroup dnn
+ * @brief max input count of MscnnBoxOutput
+ */
+#define CC_MAX_INPUT_CNT (10)
+
+/**
+ * @ingroup dnn
+ * @brief image dimensions of aipp input
+ */
+#define CC_AIPP_IMG_DIM (2)
+
+/**
+ * @ingroup dnn
+ * @brief image channel number of aipp input
+ */
+#define CC_AIPP_IMG_CHN_NUM (4)
+
+/**
+ * @ingroup dnn
+ * @brief element number of aipp color space conversion matrix
+ */
+#define CC_AIPP_CSC_MATRIX_DIM (9)
+
+/**
+ * @ingroup dnn
+ * @brief element number of aipp color space conversion bias
+ */
+#define CC_AIPP_CSC_BIAS_DIM (3)
+
+/**
+ * @ingroup dnn
+ * @brief parameter number of op exp/log/pow
+ */
+#define PARAM_CNT_THREE (3)
+
+/**
+ * @ingroup dnn
+ * @brief parameter number of op nonmaxsuppression
+ */
+#define PARAM_CNT_TWO (2)
+#define DIMCNT_NUMBER_ONE (1)
+#define DIMCNT_NUMBER_TWO (2)
+#define DIMCNT_NUMBER_FOUR (4)
+
+#define COMMON_FORMAT_NCHW_N_INDEX (0)
+#define COMMON_FORMAT_NCHW_C_INDEX (1)
+#define COMMON_FORMAT_NCHW_H_INDEX (2)
+#define COMMON_FORMAT_NCHW_W_INDEX (3)
+
+/**
+ * @ingroup dnn
+ * @brief parameter number of op upsample
+ */
+#define UPSAMPLE_SCAL_DEFAULT_TWO (2)
+#define UPSAMPLE_ILLEGAL_VALUE_1 (1)
+
+/**
+ * @ingroup dnn
+ * @brief struct define of StridedSlice required params.
+ */
+typedef struct tagCcStridedSlice {
+  uint32_t dimCnt;
+  int32_t begin[CC_DIM_MAX];
+  int32_t end[CC_DIM_MAX];
+  int32_t strides[CC_DIM_MAX];
+} ccStridedSlice_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of Strided_slice attrs
+ */
+typedef struct tagCcStridedSliceAttrs {
+  uint32_t beginMask;
+  uint32_t endMask;
+  uint32_t ellipsisMask;
+  uint32_t newAxisMask;
+  uint32_t shrinkAxisMask;
+} ccStridedSliceAttrs_t;
+
+/**
+ * @ingroup dnn
+ * @brief params of batchToSpace
+ */
+typedef struct tagCcBatchToSpace {
+  int32_t blockShapeLength;
+  int32_t blockShape[CC_DIM_MAX];
+  int32_t crops[2 * CC_DIM_MAX];
+} ccBatchToSpace_t;
+
+/**
+ * @ingroup dnn
+ * @brief params of spaceToBatch
+ */
+typedef struct tagCcSpaceToBatch {
+  int32_t blockShapeLength;
+  int32_t blockShape[CC_DIM_MAX];
+  int32_t paddings[2 * CC_DIM_MAX];
+} ccSpaceToBatch_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of tensor
+ */
+typedef struct tagCcTensor {
+  ccTensorFormat_t format;
+  ccDataType_t dataType;
+  int32_t dimCnt;
+  int32_t realDimCnt;
+  uint32_t dataSize;
+  int32_t dim[CC_DIM_MAX];
+  int32_t stride[CC_DIM_MAX];
+  ccVecQuantizePara_t vecQuantizePara;
+} ccTensor_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of filter tensor
+ */
+typedef struct tagCcFilter {
+  ccTensorFormat_t format;
+  ccDataType_t dataType;
+  int32_t dimCnt;
+  uint32_t dataSize;
+  int32_t dim[CC_DIM_MAX];
+} ccFilter_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of convolution operator
+ */
+typedef struct tagCcConvolution {
+  ccConvolutionMode_t mode;
+  ccPaddingMode_t padMode;
+  int32_t dimCnt;
+  int32_t padding[2 * (CC_DIM_MAX - 2)];
+  int32_t filterStride[CC_DIM_MAX - 2];
+  int32_t dilation[CC_DIM_MAX - 2];
+  int32_t group;
+  ccQuantizeDescriptor_t quantInfo;
+  ccConvolutionAipp_t aippInfo;
+  int32_t adj[CC_DIM_MAX - 2];
+  int32_t targetShape[CC_DIM_MAX - 2];
+  int32_t beforePadding[2 * (CC_DIM_MAX - 2)];  // pad before conv
+  uint32_t reluFlag;
+  int64_t concatBatchSize;
+} ccConvolution_t;
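+/*
+ * Usage sketch (editorial addition; an assumption, not taken from this
+ * header): these POD structs are filled field-by-field by the caller. For
+ * example, a StridedSlice over a 2-D tensor, assuming the TensorFlow-style
+ * begin/end/strides semantics the field names suggest:
+ *
+ *   ccStridedSlice_t slice = {};
+ *   slice.dimCnt = 2;
+ *   slice.begin[0] = 0;  slice.end[0] = 4;  slice.strides[0] = 1;
+ *   slice.begin[1] = 0;  slice.end[1] = 8;  slice.strides[1] = 1;
+ */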
+
+#define ccCorrelation_t ccConvolution_t
+typedef struct tagCcFullConnection_t {
+  ccQuantizeDescriptor_t quantInfo;
+  uint32_t infoTabSize;
+  const void *infoTab;
+  bool reluFlag;
+  ccFullConnectFwdAlgo_t algo;
+} ccFullConnection_t;
+
+typedef struct tagCcConcatFour2Five_t {
+  uint32_t branchNum;  // how many branches for box or class
+  uint32_t classNum;   // box branch's classNum is four, class branch's classNum is class number
+} ccConcatFour2Five_t;
+
+typedef struct tagCcTransdata_t {
+  uint64_t scaleQAddr;
+  uint8_t scaleQValueMode;
+  uint64_t offsetQAddr;
+  uint8_t quantAlgo;
+  uint8_t quantize8bitFlag;
+} ccTransdata_t;
+/**
+ * @ingroup dnn
+ * @brief struct define of pooling operator
+ */
+typedef struct tagCcPooling {
+  ccPoolingMode_t mode;
+  ccPaddingMode_t padMode;
+  ccNanPropagation_t maxpoolingNanOpt;
+  int32_t dimCnt;
+  int32_t windowDim[CC_DIM_MAX - 2];
+  int32_t padding[CC_DIM_MAX - 2];
+  int32_t stride[CC_DIM_MAX - 2];
+  int32_t dataMode;
+  int32_t ceilMode;
+  ccQuantizeDescriptor_t quantInfo;
+  ccPooingFwdAlgo_t algo;
+} ccPooling_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of activation operator
+ */
+typedef struct tagCcActivation {
+  ccActivationMode_t mode;
+  ccNanPropagation_t reluNanOpt;
+  double coef; /* ceiling for clipped RELU, alpha for ELU */
+  ccActivationPara_u activationPara;
+} ccActivation_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of svdf operator
+ */
+typedef struct tagCcSvdf {
+  ccTensorFormat_t format;
+  ccDataType_t dataType;
+  uint32_t batches;
+  uint32_t features;
+  uint32_t rank;
+  uint32_t inputSize;
+  uint32_t memorySize;
+} ccSvdf_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of hashtable lookup operator
+ */
+typedef struct tagCcHashTableLookup {
+  ccTensorFormat_t format;
+  ccDataType_t lookupType;
+  ccDataType_t keyType;
+  ccDataType_t valueType;
+  ccDataType_t outputType;
+  ccDataType_t hitsType;
+  uint32_t lookups;
+  uint32_t keys;
+  uint32_t rows;
+  uint32_t features;
+  uint16_t valueScale;
+  uint16_t outputScale;
+  uint16_t valueOffset;
+  uint16_t outputOffset;
+} ccHashTableLookup_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of prelu operator
+ */
+typedef struct tagCcPRelu {
+  ccNanPropagation_t reluNanOpt;
+  int32_t slopeCount;
+  bool channelShared;
+} ccPRelu_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of crop operator
+ */
+typedef struct tagCcCrop {
+  int32_t startAxis;
+  int32_t offset[CC_DIM_MAX];
+  int32_t offsetCnt;
+} ccCrop_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of SpatialTransformer operator
+ */
+typedef struct tagCcSpatialTransformer {
+  ccSamplerType_t samplerType;
+  ccDataType_t dataType;
+  int32_t dimCnt;
+  uint64_t dim[CC_DIM_MAX];
+  uint64_t alignCorner;
+} ccSpatialTransformer_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of ShiftTransformer operator
+ */
+typedef struct tagCcShiftTransformer {
+  ccSamplerType_t samplerType;
+  double xPreDefined;
+  double yPreDefined;
+  bool xShift;
+  bool yShift;
+  int32_t gridH;
+  int32_t gridW;
+} ccShiftTransformer_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of FasterRcnnProposal operator
+ */
+typedef struct tagCcFasterRcnnProposal {
+  int32_t preNMStopK;
+  int32_t postNMStopK;
+  float nmsTresh;
+  float minSize;
+  float featStride;
+  float baseSize;
+  int32_t ratioCnt;
+  int32_t scaleCnt;
+  float *ratio;
+  float *scale;
+  int32_t imgH;
+  int32_t imgW;
+} ccFasterRcnnProposal_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of LRN operator
+ */
+typedef struct tagCcLRN {
+  ccLRNMode_t lrnMode;
+  int32_t lrnN;
+  double lrnAlpha;
+  double lrnBeta;
+  double lrnK;
+} ccLRN_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of instanceNorm
+ */
+typedef struct tagCcInstancenorm {
+  ccInstanceNormMode_t mode;
+  double epsilon;
+} ccInstancenorm_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of assignOp operator
+ */
+typedef struct tagCcAssignOp {
+  ccAssignOpMode_t assignOpMode;
+} ccAssignOp_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of arcSinCos operator
+ */
+typedef struct tagCcArcSinCos {
+  ccArcSinCosMode_t arcSinCosMode;
+} ccArcSinCos_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of Detectpostprocess operator
+ */
+typedef struct tagCcDetectpostprocess {
+  int32_t numClasses;
+  float confThreshold;
+  float nmsThreshold;
+  int32_t outTopK;
+  float bboxRegWeightsDx;
+  float bboxRegWeightsDy;
+  float bboxRegWeightsDw;
+  float bboxRegWeightsDh;
+} ccDetectpostprocess_t;
+/**
+ * @ingroup dnn
+ * @brief struct define of FasterRcnnDetectionOutput operator
+ */
+typedef struct tagCcFasterRcnnDetectionOutput {
+  int32_t numClasses;
+  float nmsThreshold;
+  float postConfThreshold;
+  int32_t imgH;
+  int32_t imgW;
+  int32_t batchSize;
+} ccFasterRcnnDetectionOutput_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of SsdDetectionOutput operator
+ */
+typedef struct tagCcSsdDetectionOutput {
+  int32_t numClasses;
+  int32_t backgroundLabelId;
+  double preConfThreshold;
+  int32_t preTopK;
+  double nmsThreshold;
+  double nmsEta;
+  ccBoxCodeType_t codeType;
+  int32_t outTopK;
+  bool shareLocation;
+  bool varianceEncodedInTarget;
+  uint32_t boxTypeNum;
+  float var[4];
+  uint32_t variance_num;
+} ccSsdDetectionOutput_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of RefinedetDetectionOutput operator
+ */
+typedef struct tagCcRefinedetDetectionOutput {
+  int32_t numClasses;
+  int32_t backgroundLabelId;
+  double preConfThreshold;
+  int32_t preTopK;
+  double nmsThreshold;
+  double nmsEta;
+  ccBoxCodeType_t codeType;
+  int32_t outTopK;
+  bool shareLocation;
+  bool varianceEncodedInTarget;
+  uint32_t boxTypeNum;
+  float var[4];
+  uint32_t variance_num;
+  double objectness_score;
+} ccRefinedetDetectionOutput_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of MsrGenerateRpnProposals operator
+ */
+typedef struct tagCcMsrGenerateRpnProposals {
+  int32_t preNmsTopK;
+  int32_t postNmsTopK;
+  float nmsThreshold;
+  float rpnMiniSize;
+  int32_t imgH;
+  int32_t imgW;
+  uint32_t boxTypeNum;
+  float scoreThreshold;
+} ccMsrGenerateRpnProposals_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of RetinaPostprocessor operator
+ */
+typedef struct tagCcRetinaPostprocessor {
+  int32_t numClasses;
+  int32_t maxDetections;
+  float nmsThreshold;
+  float scoreThreshold;
+  int32_t imgH;
+  int32_t imgW;
+  uint32_t boxTypeNum;
+  float mean[4];
+  int32_t meanNum;
+  float std[4];
+  int32_t stdNum;
+  int32_t outputNum;
+  bool ocrFlag;
+} ccRetinaPostprocessor_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of GenerateSsdAnchors operator
+ */
+typedef struct tagCcGenerateSsdAnchors {
+  int32_t featureMapShapeList[20];
+  uint32_t featureMapShapeListSize;
+  int32_t boxSpecsNum[10];
+  uint32_t boxSpecsNumSize;
+  float scales[10];
+  uint32_t scalesNum;
+  float aspectRatios[10];
+  uint32_t aspectRatiosNum;
+  int32_t baseAnchorSize[2];
+  uint32_t baseAnchorSizeNum;
+  int32_t anchorStride[2];
+  uint32_t anchorStrideNum;
+  int32_t anchorOffset[2];
+  uint32_t anchorOffsetNum;
+  bool reduceBoxesInLowestLayer;
+  float minScale;
+  float maxScale;
+  int32_t imgH;
+/**
+ * @ingroup dnn
+ * @brief struct define of GenerateSsdAnchors operator
+ */
+typedef struct tagCcGenerateSsdAnchors {
+  int32_t featureMapShapeList[20];
+  uint32_t featureMapShapeListSize;
+  int32_t boxSpecsNum[10];
+  uint32_t boxSpecsNumSize;
+  float scales[10];
+  uint32_t scalesNum;
+  float aspectRatios[10];
+  uint32_t aspectRatiosNum;
+  int32_t baseAnchorSize[2];
+  uint32_t baseAnchorSizeNum;
+  int32_t anchorStride[2];
+  uint32_t anchorStrideNum;
+  int32_t anchorOffset[2];
+  uint32_t anchorOffsetNum;
+  bool reduceBoxesInLowestLayer;
+  float minScale;
+  float maxScale;
+  int32_t imgH;
+  int32_t imgW;
+} ccGenerateSsdAnchors_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of MscnnBoxOutput operator
+ */
+typedef struct tagCcMscnnBoxOutput {
+  double fgThreshold;
+  double nmsThreshold;
+  ccNmsType_t nmsType;
+  int32_t fieldH[CC_MAX_INPUT_CNT];
+  int32_t fieldW[CC_MAX_INPUT_CNT];
+  int32_t downsampleRate[CC_MAX_INPUT_CNT];
+  int32_t defaultBoxCnt;
+  double fieldWhr;
+  double fieldXyr;
+  int32_t maxNmsNum;
+  int32_t maxPostNmsNum;
+  double minSize;
+} ccMscnnBoxOutput_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of NMS operator
+ */
+typedef struct tagCcNms {
+  int32_t numClasses;
+  int32_t backgroundLabelId;
+  double preConfThreshold;
+  int32_t preTopK;
+  double nmsThreshold;
+  double nmsEta;
+  int32_t postTopK;
+  int32_t outTopK;
+  double postConfThreshold;
+  bool shareLocation;
+} ccNms_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of NMS/MultiClassNMS operator
+ */
+typedef struct tagCcMultiClassNms {
+  uint64_t numClasses;
+  float objThreshold;
+  float nmsThreshold;
+  float clsThreshold;
+  bool normal;
+  uint64_t coorType;
+} ccCcMultiClassNms_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of YoloDetectionOutput operator
+ */
+typedef struct tagCcYoloDetectionOutput {
+  ccYoloVersion_t yoloVersion;
+  uint32_t netH;
+  uint32_t netW;
+  uint32_t postTopK;
+  uint32_t classes;
+  float nmsThreshold;
+  float iouThreDecay;
+  float coorScaleFactor;
+  bool relative;
+  float objThreshold;
+  float clsThreshold;
+  uint32_t biasNum;
+  float *bias;
+} ccYoloDetectionOutput_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of GetRegionBox operator
+ */
+#ifndef CC_MAX_YOLO_BIAS_NUM
+#define CC_MAX_YOLO_BIAS_NUM (16)
+#endif
+
+typedef struct tagCcGetRegionBox {
+  uint32_t biasNum;
+  uint32_t H;
+  uint32_t W;
+  float bias[CC_MAX_YOLO_BIAS_NUM];
+} ccGetRegionBox_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of CorrectBoxes operator
+ */
+typedef struct tagCorrectBoxes {
+  uint32_t netW;
+  uint32_t netH;
+  bool relative;
+} ccCorrectBoxes_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of ClsProb operator
+ */
+typedef struct tagClsProb {
+  float objThreshold;
+} ccClsProb_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of SsdPriorBox operator
+ */
+typedef struct tagCcSsdPriorBox {
+  ccBoxCodeType_t codeType;
+  double *minSize;
+  int32_t minSizeNum;
+  double *maxSize;
+  int32_t maxSizeNum;
+  double *aspectRatio;
+  int32_t aspectRatioNum;
+  double *variance;
+  int32_t varianceNum;
+  int32_t imgH;
+  int32_t imgW;
+  double stepH;
+  double stepW;
+  double offset;
+  bool flip;
+  bool clip;
+} ccSsdPriorBox_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of Yolo2Region operator
+ */
+typedef struct tagCcYolo2Region {
+  ccSoftmaxTree_t softmaxTree;
+  bool softmax;
+  bool background;
+  bool treeSoftmax;
+} ccYolo2Region_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of YoloRegion operator
+ */
+typedef struct tagCcYoloRegion {
+  ccSoftmaxTree_t softmaxTree;
+  bool softmax;
+  bool background;
+  bool treeSoftmax;
+  int32_t classes;
+  int32_t coords;
+  int32_t boxes;
+  ccYoloVersion_t yoloV;
+} ccYoloRegion_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of power operator
+ */
+typedef struct tagCcPower {
+  float scale;
+  float shift;
+  float power;
+} ccPower_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of exp operator
+ */
+typedef struct tagCcExp {
+  ccDataType_t dataType;
+  uint32_t paramCnt;
+} ccExp_t;
+
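The power descriptor parameterizes the Caffe-style elementwise transform y = (shift + scale * x)^power; leaving all three fields at their neutral values yields the identity. A minimal sketch:

    // Hedged sketch: y = (shift + scale * x) ^ power; these values give y = x.
    ccPower_t pw = {};
    pw.scale = 1.0f;
    pw.shift = 0.0f;
    pw.power = 1.0f;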
+/**
+ * @ingroup dnn
+ * @brief struct define of log operator
+ */
+typedef struct tagCcLog {
+  ccDataType_t dataType;
+  uint32_t paramCnt;
+} ccLog_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of pow operator
+ */
+typedef struct tagCcPow {
+  ccDataType_t dataType;
+  uint32_t paramCnt;
+} ccPow_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of padv2 operator
+ */
+typedef struct tagCcPadV2 {
+  ccPadMode_t padMode;
+  void *padValue;
+  ccDataType_t padValueType;
+  int32_t padDimCnt;
+  int32_t padShapeLow[CC_DIM_MAX];
+  int32_t padShapeHigh[CC_DIM_MAX];
+} ccPadV2_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of psROIPooling operator
+ */
+typedef struct tagCcPsRoiPooling {
+  ccPoolingMode_t poolingMode;
+  int32_t pooledH;
+  int32_t pooledW;
+  float spatialScale;
+  float padRatio;
+  int32_t groupSize;
+  int32_t outputDim;
+} ccPsRoiPooling_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of RoIAlign operator
+ */
+typedef struct tagCcRoiAlign {
+  int32_t pooledH;
+  int32_t pooledW;
+  float spatialScale;
+  int32_t samplingRatio;
+} ccRoiAlign_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of RoiInterpPooling operator
+ */
+typedef struct tagCcRoiInterpPooling {
+  int32_t pooledH;
+  int32_t pooledW;
+  int32_t poolKernelH;
+  int32_t poolKernelW;
+  int32_t pooledTailH;
+  int32_t pooledTailW;
+  float spatialScaleH;
+  float spatialScaleW;
+} ccRoiInterpPooling_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of DetectionFull3DOutput operator
+ */
+typedef struct tagCcDetectionFull3DOutput {
+  int32_t imageWidth;
+  int32_t imageHeight;
+  int32_t numAngleBins;
+  float trcMarginRatioX;
+  float trcMarginRatioY;
+  int32_t pitchRangeD;
+  int32_t pitchPresetD;
+  float mountHeight;
+  int32_t visiblenessBins;
+  float meanVisibleness;
+  bool discreteVisibleness;
+} ccDetectionFull3DOutput_t;
+
+/**
+ * @ingroup dnn
+ * @brief struct define of MsrFastRcnnPredictions operator
+ */
+typedef struct tagMsrFastRcnnPredictions {
+  int32_t numClasses;    // num of classes
+  float scoreThreshold;  // the threshold of the score
+  double nmsThreshold;   // the threshold of nms
+  int32_t postTopK;
+  int32_t outTopK;
+  int32_t imgH;  // the height of image
+  int32_t imgW;  // the width of image
+} ccMsrFastRcnnPredictions_t;
+
+typedef struct tagCcResizeBilinear {
+  ccResizeOutputDimMode_t resizeOutputDimMode;
+  bool alignCorners;
+  int32_t zoom_factor;
+  int32_t shrink_factor;
+  int32_t height;
+  int32_t width;
+  int32_t pad_begin;
+  int32_t pad_end;
+} ccResizeBilinear_t;
+
+typedef struct tagCcResizeNearestNeighbor {
+  bool alignCorners;
+  int32_t height;
+  int32_t width;
+} ccResizeNearestNeighbor_t;
+
+typedef struct tagCcEltwise {
+  ccQuantize_t *quantInfo;
+  bool reluFlag;
+} ccEltwise_t;
+
+typedef struct tagCcBatchNorm {
+  bool reluFlag;
+} ccBatchNorm_t;
+
+typedef struct tagCcPad {
+  ccPadMode_t padMode;
+  float padValue;
+  int32_t htoppad;     // padLow[0]
+  int32_t hbottompad;  // padHigh[0]
+  int32_t wleftpad;    // padLow[1]
+  int32_t wrightpad;   // padHigh[1]
+} ccPad_t;
+
+typedef struct tagCcSubCondition {
+  uint32_t BaseCondValue[4];
+  ccCMPType_t condType[4];
+  ccResultType_t resultType;
+} ccSubCondition;
+
+typedef struct tagCcShapeClassifyCond {
+  uint32_t subConditionNum;
+  ccResultType_t resultType;
+  uint32_t true_value;
+  ccSubCondition subCond[2];
+} ccShapeClassifyCond;
+
+#ifndef CC_SHAPE_CLASSIFY_CONDITION_NUM
+#define CC_SHAPE_CLASSIFY_CONDITION_NUM (8)
+#endif
+
+typedef struct tagCcShapeClassify {
+  uint32_t shapeClassifyConditionNum;
+  uint32_t defaultValue;
+  ccShapeClassifyCond shapeClassifyCond[CC_SHAPE_CLASSIFY_CONDITION_NUM];
+} ccShapeClassify_t;
+
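The pad descriptor stores the four edge paddings individually; the inline comments tie each field back to the padLow/padHigh convention used by padv2. A minimal sketch for a constant-value one-pixel border (the mode enumerator name is assumed):

    // Hedged sketch: constant-value padding of one pixel on every edge.
    // CC_PAD_CONSTANT is an assumed enumerator name.
    ccPad_t pad = {};
    pad.padMode = CC_PAD_CONSTANT;
    pad.padValue = 0.0f;
    pad.htoppad = 1;     // padLow[0]
    pad.hbottompad = 1;  // padHigh[0]
    pad.wleftpad = 1;    // padLow[1]
    pad.wrightpad = 1;   // padHigh[1]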
+/**
+ * @ingroup dnn
+ * @brief struct define of square operator
+ */
+typedef struct tagCcSquare {
+  ccSquareMode_t mode;
+} ccSquare_t;
+
+/*
+ * @ingroup dnn
+ * @brief operation of segment reduction
+ */
+typedef enum {
+  CC_SEGMENT_REDUCTION_OP_SUM = 0, /**< sum */
+  CC_SEGMENT_REDUCTION_OP_INVALID
+} ccSegmentReductionOpType_t;
+
+typedef struct tagCcFillParam {
+  // The filler type.
+  ccFillOpType_t fillType;
+  ccDataType_t valueDatatype;
+  const void *value;  // the value in constant fill
+  const void *min;    // the min value in uniform fill
+  const void *max;    // the max value in uniform fill
+  const void *mean;   // the mean value in Gaussian fill
+  const void *std;    // the std value in Gaussian fill
+  // the seeds used to generate data in Gaussian and uniform fill
+  int64_t seed1;
+  int64_t seed2;
+} ccFillParam_t;
+
+typedef struct tagNonMaxSuppression {
+  ccDataType_t dataType;
+  uint32_t paraCount;
+} ccNonMaxSuppression_t;
+
+typedef struct tagCcArgmaxmin {
+  int32_t axisType;
+  bool outMaxVal;
+  int64_t topK;
+  int64_t reduceSize;
+  int64_t reduceStride;
+  int64_t axis;
+  bool keepDims;
+} ccArgmaxmin_t;
+
+typedef struct tagUpsamplePara {
+  int32_t scale;
+  int32_t scaleHeight;
+  int32_t scaleWidth;
+  int32_t upsampleHeight;
+  int32_t upsampleWidth;
+  bool padOutHeight;
+  bool padOutWidth;
+} ccUpsamplePara_t;
+
+typedef struct tagCcConcatFive2Four_t {
+  ccTransForLossMode_t mode;
+  uint32_t classNum;
+} ccConcatFive2Four_t;
+
+};  // namespace cce
+#endif  // DNN_STRUCT_BASE_HPP__
diff --git a/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h b/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h
new file mode 100644
index 00000000..5733d68f
--- /dev/null
+++ b/third_party/fwkacllib/inc/cce/fwk_adpt_struct.h
@@ -0,0 +1,155 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FWK_ADPT_STRUCT_H__
+#define FWK_ADPT_STRUCT_H__
+
+#include <cstdint>
+
+namespace aicpu {
+namespace FWKAdapter {
+
+// API RETURN CODE
+enum FWKAdptAPIRetCode {
+  FWK_ADPT_SUCCESS = 0,                  // success
+  FWK_ADPT_NOT_INIT = 1,                 // not init
+  FWK_ADPT_ALLOC_FAILED = 2,             // allocate memory failed
+  FWK_ADPT_PARAM_INVALID = 3,            // invalid input param
+  FWK_ADPT_PARAM_PARSE_FAILED = 4,       // parse input param failed
+  FWK_ADPT_NATIVE_ERROR = 5,             // native error code
+  FWK_ADPT_NOT_SUPPORT_OPTYPE = 6,       // unsupported operate type
+  FWK_ADPT_INTERNAL_ERROR = 7,           // adapter internal error
+  FWK_ADPT_NOT_SUPPORT_DATATYPE = 8,     // unsupported input/output data type
+  FWK_ADPT_KERNEL_ALREADY_RUNING = 9,    // kernel already running, parallel run not supported
+  FWK_ADPT_SESSION_NOT_EXIST = 10,       // session id does not exist
+  FWK_ADPT_SESSION_ALREADY_EXIST = 11,   // session id already exists for create session
+  FWK_ADPT_NATIVE_END_OF_SEQUENCE = 12,  // end of sequence
+  FWK_ADPT_EXTEND_TYPE_NOT_EXIST = 13,   // extend info type does not exist
+  FWK_ADPT_UNKNOWN_ERROR = 99            // unknown error code
+};
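Callers typically collapse these adapter codes to a boolean at the API boundary; a minimal sketch of such a helper (not part of the header itself):

    // Hedged sketch: treat anything but FWK_ADPT_SUCCESS as a hard failure.
    inline bool FwkAdptOk(int32_t retCode) {
      return retCode == aicpu::FWKAdapter::FWK_ADPT_SUCCESS;
    }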
+
+// FWKAdapter operate type
+// Notice: new operate types must be checked with OMM, and must be appended at the end of this list.
+enum FWKOperateType {
+  FWK_ADPT_SESSION_CREATE = 0,
+  FWK_ADPT_KERNEL_RUN,
+  FWK_ADPT_KERNEL_DESTROY,
+  FWK_ADPT_SESSION_DESTROY,
+  FWK_ADPT_SINGLE_OP_RUN,
+  FWK_ADPT_KERNEL_RUN_NO_SESS,
+};
+
+// Extend Info type for task
+enum FWKTaskExtInfoType {
+  FWK_ADPT_EXT_SHAPE_TYPE = 0,
+  FWK_ADPT_EXT_INPUT_SHAPE,
+  FWK_ADPT_EXT_OUTPUT_SHAPE,
+  FWK_ADPT_EXT_UPDATE_ADDR,
+  FWK_ADPT_EXT_OP_NAME,
+  FWK_ADPT_EXT_SESSION_INFO,
+  FWK_ADPT_EXT_BITMAP,
+  FWK_ADPT_EXT_TOPIC_TYPE,
+  FWK_ADPT_EXT_ASYNCWAIT,
+  FWK_ADPT_EXT_INVALID
+};
+
+enum FWKExtTopicType {
+  FWK_ADPT_TOPIC_DEVICE_ONLY = 0,
+  FWK_ADPT_TOPIC_DEVICE_FIRST,
+  FWK_ADPT_TOPIC_HOST_ONLY,
+  FWK_ADPT_TOPIC_HOST_FIRST,
+  FWK_ADPT_TOPIC_INVALID
+};
+
+enum FWKExtUpdateAddrType {
+  FWK_ADPT_UPDATE_NULL = 0,
+  FWK_ADPT_UPDATE_INPUT,
+  FWK_ADPT_UPDATE_OUTPUT,
+  FWK_ADPT_UPDATE_INPUT_OUTPUT
+};
+
+enum FWKExtWaitType {
+  FWK_ADPT_WAIT_TYPE_NULL = 0,
+  FWK_ADPT_WAIT_TYPE_EVENT,
+  FWK_ADPT_WAIT_TYPE_INVALID
+};
+
+#pragma pack(push, 1)
+// API Parameter Structure
+struct StrFWKKernel {
+  FWKOperateType opType;
+  uint64_t sessionID;  // unique
+
+  uint64_t stepIDAddr;    // step id addr
+  uint64_t kernelID;      // run kernel id, unique in session
+  uint64_t nodeDefLen;    // nodeDef protobuf len
+  uint64_t nodeDefBuf;    // NodeDef protobuf offset addr, need convert to void*
+  uint64_t funDefLibLen;  // FunctionDefLibrary protobuf len
+  uint64_t funDefLibBuf;  // FunctionDefLibrary protobuf addr which is used in NodeDef, need convert to void*
+
+  uint64_t inputOutputLen;     // InputOutput shape protobuf len
+  uint64_t inputOutputBuf;     // InputOutput shape protobuf addr, need convert to void*
+  uint64_t workspaceBaseAddr;  // Workspace base addr, need convert to void*
+  uint64_t inputOutputAddr;    // InputOutput addr, need convert to void*
+
+  uint64_t extInfoLen;   // extend info total length
+  uint64_t extInfoAddr;  // extend info addr, ExtInfo structure
+};
+#pragma pack(pop)
+
+typedef StrFWKKernel FWKOperateParam;
+
+// Extend info ShapeAndType
+const uint32_t kMaxShapeDims = 8;
+#pragma pack(push, 1)
+struct ShapeAndType {
+  int32_t type;
+  int64_t dims[kMaxShapeDims];
+};
+#pragma pack(pop)
+
+// Extend info structure for extInfoAddr
+const uint32_t kExtInfoHeadSize = 8;
+
+#pragma pack(push, 1)
+struct ExtInfo {
+  int32_t infoType;  // extend type
+  uint32_t infoLen;  // length of infoMsg
+  char infoMsg[0];   // extend value
+};
+#pragma pack(pop)
+
+#pragma pack(push, 1)
+struct ResultSummary {
+  uint64_t shape_data_ptr;   // shape data addr, need convert to void*
+  uint64_t shape_data_size;  // num of dims
+  uint64_t raw_data_ptr;     // raw data addr, need convert to void*
+  uint64_t raw_data_size;    // size of raw data
+};
+#pragma pack(pop)
+
+#pragma pack(push, 1)
+struct AsyncWait {
+  uint8_t waitType;  // wait type, FWK_ADPT_WAIT_TYPE_EVENT: event wait
+  uint32_t waitId;   // wait id, GE refresh
+  uint32_t timeOut;  // reserved
+  uint64_t reserved;
+};
+#pragma pack(pop)
+}  // end namespace FWKAdapter
+}  // namespace aicpu
+
+#endif  // FWK_ADPT_STRUCT_H__
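The extend-info buffer referenced by extInfoAddr is a packed type-length-value stream: each record is an 8-byte header (kExtInfoHeadSize, matching the packed infoType plus infoLen) followed by infoLen bytes of payload. A hedged host-side sketch of walking it; the assumption that an FWK_ADPT_EXT_INPUT_SHAPE payload is an array of ShapeAndType records follows from the comments above but is not stated explicitly:

    // Hedged sketch: iterate the packed ExtInfo records in an extend-info buffer.
    #include <cstdint>

    void WalkExtInfo(const char *buf, uint64_t len) {
      uint64_t offset = 0;
      while (offset + aicpu::FWKAdapter::kExtInfoHeadSize <= len) {
        const auto *info = reinterpret_cast<const aicpu::FWKAdapter::ExtInfo *>(buf + offset);
        if (info->infoType == aicpu::FWKAdapter::FWK_ADPT_EXT_INPUT_SHAPE) {
          // payload presumably holds infoLen / sizeof(ShapeAndType) shape records
        }
        offset += aicpu::FWKAdapter::kExtInfoHeadSize + info->infoLen;  // advance to next record
      }
    }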
diff --git a/third_party/fwkacllib/inc/cce/l2fusion_struct.hpp b/third_party/fwkacllib/inc/cce/l2fusion_struct.hpp
new file mode 100644
index 00000000..fa5a95c9
--- /dev/null
+++ b/third_party/fwkacllib/inc/cce/l2fusion_struct.hpp
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef L2FUSION_STRUCT_HPP_
+#define L2FUSION_STRUCT_HPP_
+
+#include <map>
+#include <string>
+#include "runtime/kernel.h"
+
+#define L2_DYNAMIC_SPLIT_NUM
+
+using namespace std;
+
+namespace fusion {
+
+typedef struct tagL2Data {
+  uint32_t l2Index;
+  uint64_t l2Addr;
+  uint64_t l2PageNum;
+} L2Data_t;
+
+typedef std::map<uint64_t, L2Data_t> L2DataMap_t;    // the key is ddr addr
+typedef std::pair<uint64_t, L2Data_t> L2DataPair_t;  // the key is ddr addr
+
+typedef struct TagTaskL2Info {
+  string nodeName;
+  rtL2Ctrl_t l2ctrl;
+
+  L2DataMap_t input;
+  L2DataMap_t output;
+  uint32_t isUsed;
+} TaskL2Info_t;
+
+typedef std::map<uint32_t, TaskL2Info_t> TaskL2InfoMap_t;    // the key is nodeId
+typedef std::pair<uint32_t, TaskL2Info_t> TaskL2InfoPair_t;  // the key is nodeId
+
+typedef std::map<std::string, TaskL2Info_t> TaskL2InfoFEMap_t;    // the key is nodeName
+typedef std::pair<std::string, TaskL2Info_t> TaskL2InfoFEPair_t;  // the key is nodeName
+
+}  // namespace fusion
+
+#endif  // L2FUSION_STRUCT_HPP_
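Each L2DataMap_t entry keys an L2 buffer descriptor by the DDR address it shadows. A hedged usage sketch, assuming the key/value reconstruction above (the template arguments were inferred from the "key is ddr addr" comments; all addresses and sizes are illustrative):

    // Hedged sketch: record that a DDR buffer is mirrored by 16 L2 pages.
    fusion::L2Data_t data = {};
    data.l2Index = 3;          // illustrative L2 index
    data.l2Addr = 0x200000;    // illustrative L2-side address
    data.l2PageNum = 16;       // illustrative page count
    fusion::L2DataMap_t inputs;
    inputs.insert(fusion::L2DataPair_t(0x10000000ULL, data));  // keyed by ddr addr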
diff --git a/third_party/fwkacllib/inc/cce/optimizer/fusion_engine.h b/third_party/fwkacllib/inc/cce/optimizer/fusion_engine.h
new file mode 100644
index 00000000..299998e3
--- /dev/null
+++ b/third_party/fwkacllib/inc/cce/optimizer/fusion_engine.h
@@ -0,0 +1,65 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FUSION_ENGINE_HPP_
+#define FUSION_ENGINE_HPP_
+
+#include "cce/cce.h"
+#include "graph/compute_graph.h"
+#include "proto/task.pb.h"
+
+#include <map>
+#include <vector>
+
+using namespace domi;
+using namespace std;
+
+namespace fusion {
+enum {
+  FUSION_STATUS_SUCCESS = 0,
+  FUSION_STATUS_FAIL = 1,
+};
+
+typedef struct {
+  uint64_t weightSize;
+  uint64_t memorySize;
+  uint8_t *dataMemBase;
+  uint8_t *weightMemBase;
+  uint32_t l2Enable;      // 1 - enable l2 buffer allocation, 0 - disable l2 buffer allocation
+  uint32_t fusionEnable;  // 1 - enable buffer fusion, 0 - disable buffer fusion
+} ModelRes;
+
+static const std::string SCOPE_ID_ATTR = "fusion_scope";
+static const std::string L2FUSION_DYNAMIC_CONVERGE_OP = "l2fusion_dynamic_converge_op";
+static const std::string L2FUSION_DYNAMIC_SPLIT_NUM = "l2fusion_dynamic_split_num";
+static const std::string FUSION_VIRTUAL_OP = "fusion_virtual_op";
+static const std::string FUSION_MULTI_BATCH_STRIDE = "fusion_multi_bathc_stride";
+
+#define TVM_TYPE 1
+
+typedef std::map<int64_t, std::vector<ge::NodePtr>> kScopeNodeMap_t;
+typedef std::pair<int64_t, std::vector<ge::NodePtr>> kScopeNodePair_t;
+
+uint32_t BufferFusion(ge::ComputeGraphPtr origGraph, ge::ComputeGraphPtr fusionGraph, bool enable_l2dynamic = true);
+uint32_t BufferFusionTrain(ge::ComputeGraphPtr origGraph, ge::ComputeGraphPtr fusionGraph);
+uint32_t GraphFusion(ge::ComputeGraphPtr origGraph, ge::ComputeGraphPtr fusionGraph);
+uint32_t FusionTaskBuild(cce::ccHandle_t ccHandle, ge::ComputeGraphPtr fusionGraph, ge::Buffer &buffer,
+                         ModelRes &modelRes, std::vector<TaskDef> &task_def_list_);
+void FusionTaskBuildComplete(std::vector<cce::ccHandle_t> cchandleList);
+uint32_t GraphFusionTrain(ge::ComputeGraphPtr origGraph, ge::ComputeGraphPtr fusionGraph);
+}  // namespace fusion
+
+#endif  // FUSION_ENGINE_HPP_
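Taken together, the entry points above suggest the inference-side flow: fuse the graph, then lower the fused graph to task definitions against a cce handle. A heavily hedged sketch of that call order (the actual sequencing inside GE may differ; graph, handle, and buffer setup are elided, and the vector element types rely on the reconstructions above):

    // Hedged sketch of the call order implied by the declarations above.
    uint32_t RunFusion(cce::ccHandle_t handle, ge::ComputeGraphPtr orig, ge::ComputeGraphPtr fused,
                       ge::Buffer &buffer, fusion::ModelRes &res, std::vector<domi::TaskDef> &tasks) {
      if (fusion::GraphFusion(orig, fused) != fusion::FUSION_STATUS_SUCCESS) {
        return fusion::FUSION_STATUS_FAIL;  // graph-level fusion failed
      }
      if (fusion::BufferFusion(orig, fused) != fusion::FUSION_STATUS_SUCCESS) {
        return fusion::FUSION_STATUS_FAIL;  // buffer/L2 fusion failed
      }
      return fusion::FusionTaskBuild(handle, fused, buffer, res, tasks);
    }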
diff --git a/third_party/fwkacllib/inc/cce/taskdown_api.h b/third_party/fwkacllib/inc/cce/taskdown_api.h
new file mode 100644
index 00000000..2323aaa7
--- /dev/null
+++ b/third_party/fwkacllib/inc/cce/taskdown_api.h
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TASKDOWN_API_H_
+#define TASKDOWN_API_H_
+
+#include <map>
+#include <vector>
+#include "cce/cce.h"
+#include "l2fusion_struct.hpp"
+#include "taskdown_common.hpp"
+
+namespace cce {
+
+#define CC_FUSION_OP_MAX 32
+
+typedef struct tagOpAddrsInfo {
+  void *addrPos;
+  uintptr_t addrData;
+} ccOpAddrsInfo;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ccStatus_t ccUpdateKernelArgs(ccOpContext &opContext, uint64_t dataBaseAddr, uint64_t weightBaseAddr,
+                              uint64_t variableBaseAddr, void *argsAddr, uint64_t argsSize, void *l2ctrlAddr);
+
+#ifdef __cplusplus
+}
+#endif
+
+ccStatus_t ccGetKernelArgsAddrs(ccOpContext &opContext, void *argsAddr, uint64_t argsSize, void *l2ctrlAddr,
+                                std::vector<ccOpAddrsInfo> &opAddrsInfo);
+
+ccStatus_t ccSetKernelArgs(std::vector<ccOpAddrsInfo> &dateInfo);
+
+ccStatus_t ccGetKernelTypeByOpId(uint32_t opId, ccKernelType &kernelType);
+
+}  // namespace cce
+#endif  // TASKDOWN_API_H_
diff --git a/third_party/fwkacllib/inc/cce/taskdown_common.hpp b/third_party/fwkacllib/inc/cce/taskdown_common.hpp
new file mode 100644
index 00000000..7954162e
--- /dev/null
+++ b/third_party/fwkacllib/inc/cce/taskdown_common.hpp
@@ -0,0 +1,108 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TASKDOWN_COMMON_H_
+#define TASKDOWN_COMMON_H_
+
+#include <map>
+#include "cce/cce_def.hpp"
+#include "common/attr_list.hpp"
+#include "l2fusion_struct.hpp"
+
+namespace cce {
+
+#define CC_FUSION_OP_MAX 32
+
+typedef enum tagccKernelType {
+  CCE_AI_CORE = 0, /* cce aicore */
+  CCE_AI_CPU = 1,  /* cce aicpu */
+  TE = 2,          /* te operator */
+  CUSTOMIZED = 3,  /* customized operator */
+  TE_AI_CORE = 4,  /* te aicore operator */
+  TE_AI_CPU = 5,   /* te aicpu operator */
+  AI_CPU = 6,      /* aicpu */
+  CUST_AI_CPU = 7, /* custom aicpu */
+  HOST_CPU = 8,    /* host cpu */
+  INVALID = 10000  /* unknown kernel type */
+} ccKernelType;
+
+typedef struct tagOpContext {
+  ccKernelType kernelType;
+  uint32_t opId;
+  uint32_t kernelFuncId;
+  uint32_t opIndex;
+  uint32_t opCount;
+  uint32_t opIndex2[CC_FUSION_OP_MAX];
+  bool isFlowtable;
+  uint16_t *argsOffset;
+  uint32_t argsCount;
+  uint64_t genDataBaseAddr;
+  uint64_t genDataBaseSize;
+  uint64_t genWeightBaseAddr;
+  uint64_t genWeightBaseSize;
+  uint64_t genVariableBaseAddr;
+  uint64_t genVariableBaseSize;
+  uint64_t l2ctrlSize;
+} ccOpContext;
+
+typedef struct tagOpReadCount {
+  bool isEnable;
+  std::map<uint64_t, uint32_t> tensorRc;
+} ccOpReadCount;
+
+typedef enum tagTaskDownKernelIdMode {
+  CC_TASKDOWN_RESERVED = 0,
+  CC_TASKDOWN_ROIPOOLING,
+  CC_TASKDOWN_ROIPOOLING_PERF,
+  CC_TASKDOWN_ROIALIGN,
+  CC_TASKDOWN_ROIALIGN_PERF,
+  CC_TASKDOWN_FC,
+  CC_TASKDOWN_FC_COMPRESS,
+  CC_TASKDOWN_SOFTMAX_LOWEST,
+  CC_TASKDOWN_ROIALIGN_FP16,
+  CC_TASKDOWN_RESIZE_NEAREST_NEIGHBOR,
+  CC_TASKDOWN_RESIZE_NEAREST_NEIGHBOR_COMMON,
+} ccTaskDownKernelIdMode_t;
+
+ccStatus_t GetStream(ccHandle_t handle, rtStream_t *streamId);
+
+ccStatus_t ccClearOpMap(ccHandle_t handle);
+
+ccStatus_t ccSetKernelOpMap(ccHandle_t handle);
+
+ccStatus_t ccSetKernelContext(ccHandle_t handle, uint32_t opId, AttrList &attrList, bool isFlowtable,
+                              ccKernelType kernelType, void *pgraph);
+
+ccStatus_t ccGetKernelContext(rtStream_t streamId, ccOpContext &opContext);
+
+ccStatus_t ccGetKernelTypeByOpId(uint32_t opId, ccKernelType &kernelType);
+
+ccStatus_t ccSetStreamL2Map(ccHandle_t handle, fusion::TaskL2InfoMap_t &l2AllocRes);
+
+ccStatus_t ccGetStreamL2Map(rtStream_t streamId, uint32_t opIndex, fusion::TaskL2Info_t *&l2Data);
+
+ccStatus_t ccSetOpIndex(ccHandle_t handle, uint32_t opIndex);
+
+ccStatus_t ccGetOpIndex(ccHandle_t handle, uint32_t &opIndex);
+
+ccStatus_t ccGetOpIndexByStream(rtStream_t streamId, uint32_t &opIndex);
+
+ccStatus_t ccClearStreamL2Map(ccHandle_t handle);
+
+ccStatus_t ccGetKernelReadCount(rtStream_t streamId, ccOpReadCount &rc);
+
+}  // namespace cce
+#endif  // TASKDOWN_COMMON_H_
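End to end, the taskdown API added above is what refreshes a fused kernel's argument buffer after model memory moves: gather the per-op argument slots, then commit them. A hedged sketch of that host-side sequence; CC_STATUS_SUCCESS is assumed to be the success code defined in cce_def.hpp, and real callers obtain the ccOpContext via ccGetKernelContext on the task's stream:

    // Hedged sketch: gather an op's argument slots, then commit them in one call.
    #include <vector>

    cce::ccStatus_t PatchArgs(cce::ccOpContext &ctx, void *argsAddr, uint64_t argsSize, void *l2ctrlAddr) {
      std::vector<cce::ccOpAddrsInfo> addrs;
      cce::ccStatus_t ret = cce::ccGetKernelArgsAddrs(ctx, argsAddr, argsSize, l2ctrlAddr, addrs);
      if (ret != cce::CC_STATUS_SUCCESS) {
        return ret;
      }
      return cce::ccSetKernelArgs(addrs);  // presumably writes each addrData into its addrPos slot
    }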