
feat(lite): open source for lite

GitOrigin-RevId: f442431381
tags/v1.6.0-rc1
Megvii Engine Team, 3 years ago · parent · commit 71230e9a00
100 changed files with 21153 additions and 0 deletions
  1. +10 -0 lite/.gitattributes
  2. +135 -0 lite/CMakeLists.txt
  3. +251 -0 lite/README.md
  4. +32 -0 lite/build_config/lite_build_config.h
  5. +47 -0 lite/example/CMakeLists.txt
  6. +101 -0 lite/example/example.h
  7. +172 -0 lite/example/main.cpp
  8. +166 -0 lite/example/mge/README.md
  9. +370 -0 lite/example/mge/basic.cpp
  10. +69 -0 lite/example/mge/cpu_affinity.cpp
  11. +189 -0 lite/example/mge/device_io.cpp
  12. +224 -0 lite/example/mge/lite_c_interface.cpp
  13. +78 -0 lite/example/mge/network_share_weights.cpp
  14. +95 -0 lite/example/mge/reset_io.cpp
  15. +89 -0 lite/example/mge/user_allocator.cpp
  16. +122 -0 lite/example/mge/user_cryption.cpp
  17. +638 -0 lite/example/npy.h
  18. +97 -0 lite/include/lite/common_enum_c.h
  19. +157 -0 lite/include/lite/global.h
  20. +20 -0 lite/include/lite/macro.h
  21. +368 -0 lite/include/lite/network.h
  22. +224 -0 lite/include/lite/tensor.h
  23. +169 -0 lite/lite-c/include/lite-c/global_c.h
  24. +525 -0 lite/lite-c/include/lite-c/network_c.h
  25. +251 -0 lite/lite-c/include/lite-c/tensor_c.h
  26. +73 -0 lite/lite-c/src/common.h
  27. +192 -0 lite/lite-c/src/global.cpp
  28. +580 -0 lite/lite-c/src/network.cpp
  29. +257 -0 lite/lite-c/src/tensor.cpp
  30. +12 -0 lite/pylite/megenginelite/__init__.py
  31. +152 -0 lite/pylite/megenginelite/base.py
  32. +120 -0 lite/pylite/megenginelite/global_setting.py
  33. +531 -0 lite/pylite/megenginelite/network.py
  34. +90 -0 lite/pylite/megenginelite/struct.py
  35. +471 -0 lite/pylite/megenginelite/tensor.py
  36. +122 -0 lite/pylite/megenginelite/utils.py
  37. +199 -0 lite/pylite/pylite.md
  38. +1 -0 lite/pylite/requires.txt
  39. +20 -0 lite/pylite/scripts/format.sh
  40. +127 -0 lite/pylite/setup.py
  41. +92 -0 lite/pylite/test/test_global.py
  42. +405 -0 lite/pylite/test/test_network.py
  43. +220 -0 lite/pylite/test/test_network_cuda.py
  44. +291 -0 lite/pylite/test/test_tensor.py
  45. +199 -0 lite/pylite/test/test_utils.py
  46. +53 -0 lite/src/decryption/aes_decrypt.h
  47. +49 -0 lite/src/decryption/decrypt_base.h
  48. +1363 -0 lite/src/decryption/mbedtls/aes.cc
  49. +349 -0 lite/src/decryption/mbedtls/aes.h
  50. +5 -0 lite/src/decryption/mbedtls/config.h
  51. +156 -0 lite/src/decryption/rc4/rc4_cryption_base.h
  52. +219 -0 lite/src/decryption/rc4/rc4_cryption_impl.cpp
  53. +79 -0 lite/src/decryption/rc4/rc4_cryption_impl.h
  54. +58 -0 lite/src/decryption/rc4_cryption.cpp
  55. +44 -0 lite/src/decryption/rc4_cryption.h
  56. +53 -0 lite/src/function_base.h
  57. +256 -0 lite/src/global.cpp
  58. +37 -0 lite/src/lite_build_config.h.in
  59. +254 -0 lite/src/mge/algo_cache/file_cache.cpp
  60. +85 -0 lite/src/mge/algo_cache/file_cache.h
  61. +241 -0 lite/src/mge/algo_cache/redis_cache.cpp
  62. +47 -0 lite/src/mge/algo_cache/redis_cache.h
  63. +191 -0 lite/src/mge/common.cpp
  64. +66 -0 lite/src/mge/common.h
  65. +212 -0 lite/src/mge/function_dft.h
  66. +69 -0 lite/src/mge/memory_allocator.h
  67. +781 -0 lite/src/mge/network_impl.cpp
  68. +242 -0 lite/src/mge/network_impl.h
  69. +435 -0 lite/src/mge/tensor_impl.cpp
  70. +128 -0 lite/src/mge/tensor_impl.h
  71. +154 -0 lite/src/misc.cpp
  72. +254 -0 lite/src/misc.h
  73. +501 -0 lite/src/network.cpp
  74. +161 -0 lite/src/network_impl_base.h
  75. +246 -0 lite/src/parse_info/default_parse.h
  76. +40 -0 lite/src/parse_info/parse_info_base.h
  77. +134 -0 lite/src/parse_model/model_parser.cpp
  78. +75 -0 lite/src/parse_model/model_parser.h
  79. +28 -0 lite/src/parse_model/pack_model.fbs
  80. +339 -0 lite/src/tensor.cpp
  81. +101 -0 lite/src/tensor_impl_base.h
  82. +97 -0 lite/src/type_info.h
  83. +10 -0 lite/src/version_lite.ld
  84. +23 -0 lite/test/CMakeLists.txt
  85. +33 -0 lite/test/main.cpp
  86. +638 -0 lite/test/npy.h
  87. +184 -0 lite/test/test_common.h
  88. +115 -0 lite/test/test_misc.cpp
  89. +1007 -0 lite/test/test_network.cpp
  90. +895 -0 lite/test/test_network_c.cpp
  91. +351 -0 lite/test/test_network_options.cpp
  92. +589 -0 lite/test/test_tensor.cpp
  93. +316 -0 lite/test/test_tensor_c.cpp
  94. +26 -0 lite/tools/aes_encrypt.sh
  95. +134 -0 lite/tools/dump_model_mgb.py
  96. +75 -0 lite/tools/pack_model/encrypt_info_and_model.sh
  97. +135 -0 lite/tools/pack_model/pack_model_and_info.py
  98. +211 -0 lite/tools/rc4_encrypt.cpp
  99. +29 -0 scripts/whl/macos/macos_build_whl.sh
  100. +27 -0 scripts/whl/manylinux2014/do_build_common.sh

+10 -0 lite/.gitattributes

@@ -0,0 +1,10 @@
test/resource/input_data.npy filter=lfs diff=lfs merge=lfs -text
test/resource/lite/shufflenet.mge filter=lfs diff=lfs merge=lfs -text
test/resource/lite/shufflenet_crypt_aes.mge filter=lfs diff=lfs merge=lfs -text
test/resource/lite/test_packed_model.lite filter=lfs diff=lfs merge=lfs -text
test/resource/lite/test_packed_model_rc4.lite filter=lfs diff=lfs merge=lfs -text
test/resource/lite/output_data.npy filter=lfs diff=lfs merge=lfs -text
test/resource/lite/model.mgb filter=lfs diff=lfs merge=lfs -text
test/resource/lite/liveness_rgb_nosub128.rknn filter=lfs diff=lfs merge=lfs -text
third_party/librknn_api filter=lfs diff=lfs merge=lfs -text
test/resource/lite/model_atlas.mgb filter=lfs diff=lfs merge=lfs -text

+135 -0 lite/CMakeLists.txt

@@ -0,0 +1,135 @@
option(LITE_BUILD_WITH_MGE "Build lite with MegEngine." ON)

# config lite_build_config.h.in
set(LITE_WITH_OPENCL ${MGE_WITH_OPENCL})
set(LITE_WITH_CUDA ${MGE_WITH_CUDA})
set(LITE_ENABLE_LOGGING ${MGE_ENABLE_LOGGING})
set(LITE_ENABLE_EXCEPTION ${MGE_ENABLE_EXCEPTIONS})
set(LITE_ASSERT_LOC ${MGB_ASSERT_LOC})

if(NOT MGB_WITH_FLATBUFFERS)
include(../cmake/flatbuffers.cmake)
endif()

file(GLOB_RECURSE SRC_FBS src/**/*.fbs)
build_flatbuffers(
"${SRC_FBS}"
""
lite_fbs_generate
""
"${CMAKE_CURRENT_BINARY_DIR}"
""
""
)

file(GLOB_RECURSE SOURCES_LITE src/*.cpp src/*.cc lite-c/*.cpp)

if(MGE_WITH_MINIMUM_SIZE)
set(LITE_ENABLE_LOGGING OFF)
set(LITE_ENABLE_EXCEPTION OFF)
endif()

# Write out lite_build_config.h
# It defines macros needed by lite
configure_file(src/lite_build_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

# begin config lite
if(LITE_BUILD_WITH_MGE AND LITE_WITH_CUDA AND NOT WIN32)
# FIXME: third_party cpp_redis does not support building with clang-cl
file(GLOB_RECURSE SOURCES_CPP_REDIS ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/sources/*.cpp)
list(APPEND SOURCES_LITE ${SOURCES_CPP_REDIS})
file(GLOB_RECURSE SOURCES_TACOPIE ${PROJECT_SOURCE_DIR}/third_party/tacopie/sources/*.cpp)
list(APPEND SOURCES_LITE ${SOURCES_TACOPIE})
endif()
add_library(lite_static STATIC ${SOURCES_LITE})
add_dependencies(lite_static lite_fbs_generate)
include_directories($<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/genfiles>)

if(LITE_BUILD_WITH_MGE)
target_link_libraries(lite_static PRIVATE megbrain megdnn ${MGE_CUDA_LIBS})
add_compile_definitions(LITE_BUILD_WITH_MGE=1)
message(STATUS "build lite with MegEngine.")
else()
target_link_libraries(lite_static PUBLIC flatbuffers)
endif()

include_directories(
PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_PREFIX}/lite/include>
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/include>
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/include/lite>
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/lite-c/include>
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/src>
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/Json/include>
)
# end config lite

# define a shared lib
add_library(lite_shared SHARED $<TARGET_OBJECTS:lite_static>)
if(LITE_BUILD_WITH_MGE)
target_link_libraries(lite_shared PRIVATE megbrain megdnn ${MGE_CUDA_LIBS})
endif()
if(ANDROID)
link_libraries(log)
target_link_libraries(lite_static PRIVATE log)
target_link_libraries(lite_shared PRIVATE log)
endif()

if(LITE_BUILD_WITH_MGE AND LITE_WITH_CUDA AND NOT WIN32)
# FIXME: third_party cpp_redis does not support building with clang-cl
target_include_directories(lite_static PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/includes)
target_include_directories(lite_static PRIVATE ${PROJECT_SOURCE_DIR}/third_party/tacopie/includes)
target_include_directories(lite_shared PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/includes)
target_include_directories(lite_shared PRIVATE ${PROJECT_SOURCE_DIR}/third_party/tacopie/includes)
endif()
set(LITE_VERSION_SCRIPT ${PROJECT_SOURCE_DIR}/lite/src/version_lite.ld CACHE INTERNAL "Path to linker version script")
add_custom_target(_lite_version_ld SOURCES ${LITE_VERSION_SCRIPT})
if(NOT MSVC AND NOT WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
endif()
# TODO: implement a version script for other OSes
if (UNIX AND NOT APPLE)
target_link_options(lite_shared PRIVATE -Wl,--version-script=${LITE_VERSION_SCRIPT})
set_target_properties(lite_shared PROPERTIES LINK_DEPENDS ${LITE_VERSION_SCRIPT})
endif()

# config install
install(TARGETS lite_static
LIBRARY DESTINATION lite/lib/${MGE_ARCH}
FRAMEWORK DESTINATION lite/lib/${MGE_ARCH}
ARCHIVE DESTINATION lite/lib/${MGE_ARCH})

install(TARGETS lite_shared
LIBRARY DESTINATION lite/lib/${MGE_ARCH}
FRAMEWORK DESTINATION lite/lib/${MGE_ARCH}
ARCHIVE DESTINATION lite/lib/${MGE_ARCH}
)

install(FILES ${PROJECT_SOURCE_DIR}/lite/include/lite/common_enum_c.h
DESTINATION ${CMAKE_INSTALL_PREFIX}/lite/include/lite-c)

install(DIRECTORY ${PROJECT_SOURCE_DIR}/lite/include
DESTINATION ${CMAKE_INSTALL_PREFIX}/lite FILES_MATCHING PATTERN "*.h")

install(DIRECTORY ${PROJECT_SOURCE_DIR}/lite/lite-c/include
DESTINATION ${CMAKE_INSTALL_PREFIX}/lite FILES_MATCHING PATTERN "*.h")

add_subdirectory(example)
if(MGE_WITH_TEST)
add_subdirectory(test)
endif()

# tools and example
add_executable(rc4_encryptor tools/rc4_encrypt.cpp)

target_link_libraries(rc4_encryptor lite_static)
if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM)
# FIXME: hip objects cannot find cpp objects when linking only through lite_static
target_link_libraries(rc4_encryptor megdnn)
endif()
target_include_directories(rc4_encryptor PRIVATE
${PROJECT_SOURCE_DIR}/lite/src/decryption)
install (TARGETS rc4_encryptor
EXPORT ${LITE_EXPORT_TARGETS}
RUNTIME DESTINATION lite/tools)

+251 -0 lite/README.md

@@ -0,0 +1,251 @@
# Lite

It is a lightweight wrapper of MegEngine that makes MegEngine easy to integrate into a
user's SDK.
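
Below is a minimal sketch of what integration through the C++ interface looks like, condensed
from the basic_load_from_path example added in lite/example/mge/basic.cpp in this commit; the
model path and the zero-filled input are placeholders only.

```C++
#include <cstring>
#include <memory>

#include "lite/network.h"
#include "lite/tensor.h"

int main() {
    //! create the network and load the model ("shufflenet.mge" is a placeholder)
    auto network = std::make_shared<lite::Network>();
    network->load_model("shufflenet.mge");

    //! fill the input tensor (the example copies data parsed from a .npy file here)
    std::shared_ptr<lite::Tensor> input = network->get_input_tensor(0);
    void* dst = input->get_memory_ptr();
    std::memset(dst, 0, input->get_tensor_total_size_in_byte());

    //! run inference and wait for it to finish
    network->forward();
    network->wait();

    //! read the result from the output tensor
    std::shared_ptr<lite::Tensor> output = network->get_output_tensor(0);
    const float* out = static_cast<float*>(output->get_memory_ptr());
    return out[0] >= 0.f ? 0 : 1;
}
```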

## bazel build

Both the internal bazel build and CMake are currently supported, with C++/C and Python
interfaces. The build of the lite_shared target with bazel is described below; it can serve as
a reference for building other targets, and it depends on the internal bazel setup and megvii3.

### Set up the build environment

The bazel build has to be done inside a megvii3 workspace.

#### Clone megvii3 and install bazel

```bash
git clone git@git-core.megvii-inc.com:brain-sdk/megvii3.git
./utils/bazel/get_bazel.sh
```

#### Clone megbrain
```
git submodule update brain/megbrain brain/midout
```

### Build the x86 CUDA version

```bash
./bazel build //brain/megbrain/lite:lite_shared --cpu="k8" \
--compiler="gcc7_cuda10" -c opt
```

### Build the x86 CPU version

```bash
./bazel build //brain/megbrain/lite:lite_shared --cpu="k8" \
--compiler="gcc9" -c opt
```

### Build the arm OpenCL version

```bash
./bazel build //brain/megbrain/lite:lite_shared_shared --cpu=android_aarch64 \
-c opt --define enable_opencl=1 --define enable_opencl_search=1
```
### Build the arm OpenCL lite_examples

```bash
bazel-3.0.0-megvii2 build //brain/megbrain/lite:lite_shared_examples \
--cpu=android_aarch64 --define enable_opencl=1 --define enable_opencl_search=1
```
#### For how to run the lite_examples with the snpe_loader, see the wiki below
https://wiki.megvii-inc.com/pages/viewpage.action?pageId=268786906

### Build the armv7 CPU version

```bash
./bazel build //brain/megbrain/lite:lite_shared --cpu=android_armv7 \
-c opt
```

### Build the arm64 CPU version

```bash
./bazel build //brain/megbrain/lite:lite_shared --cpu=android_aarch64 \
-c opt
```

### Build the arm64 CPU v8.2 version

```bash
./bazel build //brain/megbrain/lite:lite_shared --cpu=android_aarch64 \
--copt -march=armv8.2-a+fp16+dotprod -c opt
```

## CMake build is also supported

For the CMake build, refer to scripts/cmake-build/BUILD_README.md. The example below builds a
release version with both the MegEngine and RKNPU backends enabled and OpenCL turned on:
```bash
EXTRA_CMAKE_ARGS="-DANDROID_NATIVE_API_LEVEL=24 -DLITE_BUILD_WITH_RKNPU=ON -DMGE_WITH_OPENCL=ON \
-DMGE_OPENCL_SEARCH_ALGO=ON -DCUSTOM_C_OPR_INIT_FUNC=custom_loader_func" ./scripts/cmake-build/cross_build_android_arm_inference.sh
```
* To enable the profiling feature, add --copt -DMGB_ENABLE_JSON=1 at build time.
* To enable the fast-run feature, add --copt -DMGB_ENABLE_FASTRUN=1.
* When building for arm64, --copt -mcpu=cortex-a53 can be added for optimization.

### Trimming the build with midout

For the details of how midout trimming works, see the midout documentation in megbrain; the
trimming procedure is the same as described for MegBrain and MegEngine.

## Models

### Supported models

lite currently only supports models dumped by MegEngine. Loadable model files include the
original model file, the original encrypted model, and packed encrypted or unencrypted models.
The encryption algorithm and the encryption key can be defined by the user and registered into
lite; see the encryption/decryption part of the examples for details.

* Original unencrypted model: the model produced by dumping the trained model directly in the
MegEngine environment.
* Original encrypted model: the dumped model encrypted with an encryption algorithm. lite
provides two default algorithms in tools, aes and rc4, implemented by aes_encrypt.sh and
rc4_encrypt.cpp respectively (rc4_encrypt.cpp has to be compiled into an executable). When a
model encrypted this way is loaded, the encryption method has to be set in the Config.
* Packed model: the model structure is described below. An encrypted or unencrypted model and
the json config file defined below can be packed together into a packed model with the
pack_model_and_info.py tool under tools; see the help output of pack_model_and_info.py for
usage.

### Model structure

The different model files are mainly distinguished by the model_tag in the packed model file.

* Packed file: the packing is done with the pack_model_and_info.py script, which bundles a
model info file (any format is allowed, json is recommended, encrypted or not) together with an
encrypted or unencrypted model file and prepends a Header to help with parsing.
* The original file and the original encrypted file carry no Header and no model info part; the
information needed to load the model can be passed through Config and NetworkIO.

### Header

The Header part starts with a fixed plain-text model_tag, currently defined as the string
"packed_model". It is mainly followed by information about the individual parts of the model
file: the encryption method of each part, so that the matching decryption method can be called
for each part when the model is loaded, and the parsing method of the model information part.
See lite/src/parse_model/pack_model.fbs for the details.

### Info part

The Info part describes the model: things the user cares about, such as the format of the
model's input data and the platform the model runs on. It can also be used to check whether the
model is being run under the specified conditions. Because different users need different
things in this part and the information to be carried cannot be unified, lite provides a
customization mechanism: the user defines the content of the Info part, specifies the
**name of the Info parsing method** in the Header, and registers a parsing function under that
name into lite, so that a user-defined Info format can be supported. lite also ships a
predefined format named "LITE_default" together with its parsing function; this info is in json
format and is defined as follows:

```json
{
"name": "shufflenet_test",
"valid": true,
"version": "8.9999.0",
"has_compression": false,
"device": {
"type": "CPU",
"device_id": 0,
"number_threads": 1,
"use_tensor_rt": false,
"enable_inplace_model": false
},
"options":{
"weight_preprocess": false,
"var_sanity_check_first_run": true,
"const_shape": false,
"jit_level": 0,
"record_level": 0
},
"IO":{
"inputs":[
{
"name": "data",
"io_type": "value",
"is_host": true,
"dtype": "float32",
"shape": {
"dim0": 1,
"dim1": 3,
"dim2": 224,
"dim3": 224
}
}
],
"outputs":[
{
"name": "TRUE_DIV(EXP[12065],reduce0[12067])[12077]",
"io_type": "value",
"is_host": true,
"dtype": "float32",
"shape": {
"dim0": 1,
"dim1": 1000,
"dim2": 0,
"dim3": 0
}
}
]
}
}
```

* model_name: the name of the model; the user can check it against the name in the Header part
to verify that the correct model is being run.
* valid: whether the settings in this info file affect the model's Config.
* version: the megbrain version this model corresponds to; it is checked when the model is
loaded.
* has_compression: whether the tensor data in this model file is compressed.
* device: the currently supported values are "CPU", "CUDA", "OPENCL" and "ATLAS".
* number_threads and enable_inplace_model: only take effect when device is CPU.
* IO::inputs::io_type: one of value or shape; see include "network.h" for details.
* IO::inputs::is_host: whether the input data comes from the device side or the host side.
* IO::outputs::is_host: whether the output data is stored on the device side or the host side.
* IO::outputs::shape::dimx: a value of 0 means that this dim is invalid.

### Model part

This part can be either an encrypted or an unencrypted model file.

## Usage

See the documents in the example directory and the corresponding examples for the full range of
usage.

## Tools

lite currently keeps three tools in the tools directory (the other megbrain tools are not
included):

* pack_model_and_info.py is the model packing tool mentioned above. It is a python script that
packs an existing model and a model information file in the format described above; the user
can specify the model name, the model encryption method, the encryption method of the
information file, the parsing method, and so on, for example:

```bash
python3 pack_model_and_info.py --input-model xxx.mge \
--model-name="shufflenet_test" \
--model-cryption="RC4_default" \
--input-info xxx.json \
--info-cryption="RC4_default" \
--info-parser="LITE_default" \
-o xxx.lite
```
* aes_encrypt.sh is an encryption script for the aes method. It encrypts a file with a given
key into an aes-encrypted file, where the key is a 32-byte hexadecimal number.
```bash
aes_encrypt.sh xxx.mdl xxx_encrypted.mdl \
000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F
```

* rc4_encrypt.cpp can be compiled into an rc4 encryption tool. It encrypts a given file with a
specified key or the default key, supports both the rc4 and simple_fast_rc4 methods, and
supports custom keys.
* The bazel command to build it for x86 is:
```bash
bazel build //brain/megbrain/lite:rc4_encryptor \
--cpu='k8' --compiler='gcc9'
```
* Encrypt a file (see the help output for the details):
```bash
rc4_encryptor encrypt_predefined_rc4 \
to_be_encrypt.file encrypted.file
```

+32 -0 lite/build_config/lite_build_config.h

@@ -0,0 +1,32 @@
/**
* \file lite/build_config/lite_build_config.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#ifndef _HEADER_LITE_BUILD_CONFIG
#define _HEADER_LITE_BUILD_CONFIG

#ifndef LITE_ENABLE_LOGGING
#define LITE_ENABLE_LOGGING 1
#endif

#ifndef LITE_ENABLE_EXCEPTION
#if __cpp_exceptions || __EXCEPTIONS || \
(defined(_MSC_VER) && defined(_CPPUNWIND))
#define LITE_ENABLE_EXCEPTION 1
#else
#define LITE_ENABLE_EXCEPTION 0
#endif
#endif

#ifndef LITE_WITH_CUDA
#define LITE_WITH_CUDA 0
#endif

#ifndef LITE_ASSERT_LOC
#define LITE_ASSERT_LOC 1
#endif
#endif // _HEADER_LITE_BUILD_CONFIG

+47 -0 lite/example/CMakeLists.txt

@@ -0,0 +1,47 @@
file (GLOB_RECURSE SOURCES ./*.cpp)
add_executable(lite_examples ${SOURCES})

if(LITE_BUILD_WITH_RKNPU)
# rknn sdk 1.0.0 depends on libc++_shared; use gold to remove the NEEDED so symbol check
target_link_options(lite_examples PRIVATE "-fuse-ld=gold")
endif()

target_link_libraries(lite_examples lite_static)
if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM)
# FIXME: hip objects cannot find cpp objects when linking only through lite_static
target_link_libraries(lite_examples megdnn)
endif()

if(UNIX)
if(APPLE OR ANDROID)
target_link_libraries(lite_examples dl)
else()
target_link_libraries(lite_examples dl rt)
endif()
endif()

install (TARGETS lite_examples
EXPORT ${LITE_EXPORT_TARGETS}
RUNTIME DESTINATION lite/bin)

# add lite_examples_depends_shared for CI check symbol export valid
add_executable(lite_examples_depends_shared ${SOURCES})

if(LITE_BUILD_WITH_RKNPU)
# rknn sdk 1.0.0 depends on libc++_shared; use gold to remove the NEEDED so symbol check
target_link_options(lite_examples_depends_shared PRIVATE "-fuse-ld=gold")
endif()

target_link_libraries(lite_examples_depends_shared lite_shared)

if(UNIX)
if(APPLE OR ANDROID)
target_link_libraries(lite_examples_depends_shared dl)
else()
target_link_libraries(lite_examples_depends_shared dl rt)
endif()
endif()

install (TARGETS lite_examples_depends_shared
EXPORT ${LITE_EXPORT_TARGETS}
RUNTIME DESTINATION lite/bin)

+101 -0 lite/example/example.h

@@ -0,0 +1,101 @@
/**
* \file example/example.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once

#include <lite_build_config.h>

#include "lite/global.h"
#include "lite/network.h"
#include "lite/tensor.h"

#include "npy.h"

#include <string.h>
#include <memory>
#include <unordered_map>
#include <vector>

namespace lite {
namespace example {

void set_cpu_affinity(const std::vector<int>& cpuset);

struct Args {
int args_parse_ret = 0;
std::string example_name;
std::string model_path;
std::string input_path;
std::string output_path;
std::string loader_path;
static Args from_argv(int argc, char** argv);
};

std::shared_ptr<Tensor> parse_npy(
const std::string& path,
LiteBackend backend = LiteBackend::LITE_DEFAULT);

using ExampleFunc = std::function<bool(const Args&)>;
using ExampleFuncMap = std::unordered_map<std::string, ExampleFunc>;

ExampleFuncMap* get_example_function_map();

bool register_example(std::string example_name, const ExampleFunc& function);

template <int>
struct Register;

#if LITE_BUILD_WITH_MGE
#if LITE_WITH_CUDA
bool load_from_path_run_cuda(const Args& args);
#endif
bool basic_load_from_path(const Args& args);
bool basic_load_from_path_with_loader(const Args& args);
bool basic_load_from_memory(const Args& args);
bool cpu_affinity(const Args& args);
bool network_share_same_weights(const Args& args);
bool reset_input(const Args& args);
bool reset_input_output(const Args& args);
bool config_user_allocator(const Args& args);
bool register_cryption_method(const Args& args);
bool update_cryption_key(const Args& args);
bool async_forward(const Args& args);

#if LITE_WITH_CUDA
bool device_input(const Args& args);
bool device_input_output(const Args& args);
bool pinned_host_input(const Args& args);
#endif
#endif

} // namespace example
} // namespace lite

#if LITE_BUILD_WITH_MGE
bool basic_c_interface(const lite::example::Args& args);
bool device_io_c_interface(const lite::example::Args& args);
bool async_c_interface(const lite::example::Args& args);
#endif

#define CONCAT_IMPL(a, b) a##b
#define MACRO_CONCAT(a, b) CONCAT_IMPL(a, b)

#define REGIST_EXAMPLE(name_, func_) \
REGIST_EXAMPLE_WITH_NUM(__COUNTER__, name_, func_)

#define REGIST_EXAMPLE_WITH_NUM(number_, name_, func_) \
template <> \
struct Register<number_> { \
Register() { register_example(name_, func_); } \
}; \
namespace { \
Register<number_> MACRO_CONCAT(example_function_, number_); \
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+172 -0 lite/example/main.cpp

@@ -0,0 +1,172 @@
/**
* \file example/main.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "lite/global.h"
#include "lite/network.h"
#include "lite/tensor.h"

#include "example.h"
#include "npy.h"

#include <string.h>
#include <map>
#include <memory>
#include <vector>

using namespace lite;
using namespace example;

Args Args::from_argv(int argc, char** argv) {
Args ret;
if (argc < 4) {
printf("usage: lite_examples <example_name> <model file> <input "
"file> <output file>.\n");
printf("*********The output file is optional.*************\n");
printf("The registered examples include:\n");
size_t index = 0;
for (auto it : *get_example_function_map()) {
printf("%zu : %s\n", index, it.first.c_str());
index++;
}
ret.args_parse_ret = -1;
return ret;
}
ret.example_name = argv[1];
ret.model_path = argv[2];
ret.input_path = argv[3];
if (argc > 4) {
ret.output_path = argv[4];
}
if (argc > 5) {
ret.loader_path = argv[5];
}
return ret;
}

ExampleFuncMap* lite::example::get_example_function_map() {
static ExampleFuncMap static_map;
return &static_map;
}

bool lite::example::register_example(std::string example_name,
const ExampleFunc& function) {
auto map = get_example_function_map();
if (map->find(example_name) != map->end()) {
printf("Error!!! This example has already been registered\n");
return false;
}
(*map)[example_name] = function;
return true;
}

std::shared_ptr<Tensor> lite::example::parse_npy(const std::string& path,
LiteBackend backend) {
std::string type_str;
std::vector<npy::ndarray_len_t> stl_shape;
std::vector<int8_t> raw;
npy::LoadArrayFromNumpy(path, type_str, stl_shape, raw);

auto lite_tensor =
std::make_shared<Tensor>(backend, LiteDeviceType::LITE_CPU);
Layout layout;
layout.ndim = stl_shape.size();
const std::map<std::string, LiteDataType> type_map = {
{"f4", LiteDataType::LITE_FLOAT},
{"i4", LiteDataType::LITE_INT},
{"i1", LiteDataType::LITE_INT8},
{"u1", LiteDataType::LITE_UINT8}};
layout.shapes[0] = 1;
for (size_t i = 0; i < layout.ndim; i++) {
layout.shapes[i] = static_cast<size_t>(stl_shape[i]);
}

for (auto& item : type_map) {
if (type_str.find(item.first) != std::string::npos) {
layout.data_type = item.second;
break;
}
}
lite_tensor->set_layout(layout);
size_t length = lite_tensor->get_tensor_total_size_in_byte();
void* dest = lite_tensor->get_memory_ptr();
memcpy(dest, raw.data(), length);
//! rknn does not support reshape now
if (layout.ndim == 3) {
lite_tensor->reshape({1, static_cast<int>(layout.shapes[0]),
static_cast<int>(layout.shapes[1]),
static_cast<int>(layout.shapes[2])});
}
return lite_tensor;
}

void lite::example::set_cpu_affinity(const std::vector<int>& cpuset) {
#if defined(__APPLE__) || defined(WIN32)
#pragma message("set_cpu_affinity not enabled on apple and windows platform")
#else
cpu_set_t mask;
CPU_ZERO(&mask);
for (auto i : cpuset) {
CPU_SET(i, &mask);
}
auto err = sched_setaffinity(0, sizeof(mask), &mask);
if (err) {
printf("failed to sched_setaffinity: %s (error ignored)",
strerror(errno));
}
#endif
}

int main(int argc, char** argv) {
set_log_level(LiteLogLevel::WARN);
auto&& args = Args::from_argv(argc, argv);
if (args.args_parse_ret)
return -1;
auto map = get_example_function_map();
auto example = (*map)[args.example_name];
if (example) {
printf("Begin to run %s example.\n", args.example_name.c_str());
return example(args);
} else {
printf("The example %s is not registered.", args.example_name.c_str());
return -1;
}
}
namespace lite {
namespace example {

#if LITE_BUILD_WITH_MGE
#if LITE_WITH_CUDA
REGIST_EXAMPLE("load_from_path_run_cuda", load_from_path_run_cuda);
#endif
REGIST_EXAMPLE("basic_load_from_path", basic_load_from_path);
REGIST_EXAMPLE("basic_load_from_path_with_loader", basic_load_from_path_with_loader);
REGIST_EXAMPLE("basic_load_from_memory", basic_load_from_memory);
REGIST_EXAMPLE("cpu_affinity", cpu_affinity);
REGIST_EXAMPLE("register_cryption_method", register_cryption_method);
REGIST_EXAMPLE("update_cryption_key", update_cryption_key);
REGIST_EXAMPLE("network_share_same_weights", network_share_same_weights);
REGIST_EXAMPLE("reset_input", reset_input);
REGIST_EXAMPLE("reset_input_output", reset_input_output);
REGIST_EXAMPLE("config_user_allocator", config_user_allocator);
REGIST_EXAMPLE("async_forward", async_forward);

REGIST_EXAMPLE("basic_c_interface", basic_c_interface);
REGIST_EXAMPLE("device_io_c_interface", device_io_c_interface);
REGIST_EXAMPLE("async_c_interface", async_c_interface);

#if LITE_WITH_CUDA
REGIST_EXAMPLE("device_input", device_input);
REGIST_EXAMPLE("device_input_output", device_input_output);
REGIST_EXAMPLE("pinned_host_input", pinned_host_input);
#endif
#endif
} // namespace example
} // namespace lite

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+166 -0 lite/example/mge/README.md

@@ -0,0 +1,166 @@
# Example

This example directory implements a series of examples that call the lite interfaces to run
inference; they mainly demonstrate how the different lite interfaces are used to run inference
in different situations. All of the examples use shufflenet for the demonstration.

## Building and running the examples with bazel

* Refer to the README.md in the top-level directory to set up the megvii3 bazel build
environment, then build the CPU version:
```bash
./bazel build //brain/megbrain/lite:lite_examples --cpu="k8" \
--compiler="gcc9" -c opt
```
* At run time, the name of the example to run, the model, and the model's input data have to be
specified.
* List all of the example names:
```
bazel-bin/brain/megbrain/lite/lite_examples
```
* Run an example; the command below runs basic_load_from_memory:
```
bazel-bin/brain/megbrain/lite/lite_examples \
basic_load_from_memory \
path-to-megbrain/lite/test/resource/lite/shufflenet.mge \
path-to-megbrain/lite/test/resource/lite/input_data.npy
```

## Basic usage

* **Implemented in basic.cpp, including basic_load_from_path and
basic_load_from_memory.**

* These examples use lite to run basic inference: the model is loaded with the default
configuration, the input data is copied into the input tensor before forward, and after forward
the data is copied from the output tensor into the user's memory. Both the input and the output
tensor are obtained from the Network by name, and the layout of an input or output tensor can
be read directly from the tensor itself. **The layout of an output tensor is only correct after
forward has finished.**

## User-provided memory as input and output

* **Implemented in reset_io.cpp, with the two examples reset_input and
reset_input_output.**

* These examples show how the memory of the input tensor can be memory provided by the user
(which already holds the input data), and how the output tensor can also use user-provided
memory, so that after forward the Network stores the result directly in the specified output
memory. This removes unnecessary memory copies.

* This is done through the reset interface of the tensor, which re-assigns the tensor's memory
and the corresponding layout; if no layout is given, the tensor's original layout is kept by
default. A minimal sketch follows this section.

* **Because the memory is allocated by the user in this approach, the user needs to know the
layouts of the input and output tensors beforehand and allocate memory according to them. Also,
the lifetime of memory set into a tensor via reset is not managed by the tensor; it is managed
by the external user.**
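
* A minimal sketch of the reset-based input, condensed from reset_input in reset_io.cpp (the
model path and the user buffer are placeholders):

```C++
#include <memory>
#include <string>

#include "lite/network.h"
#include "lite/tensor.h"

//! user_ptr already holds the input data; its lifetime is managed by the
//! caller, not by the tensor.
void run_with_user_memory(const std::string& model_path, void* user_ptr) {
    auto network = std::make_shared<lite::Network>();
    network->load_model(model_path);

    std::shared_ptr<lite::Tensor> input = network->get_input_tensor(0);
    //! keep the tensor's original layout; a different Layout could be passed
    //! here if the user memory is laid out differently
    input->reset(user_ptr, input->get_layout());

    network->forward();
    network->wait();
}
```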

## Device memory as input and output

* **Implemented in device_io.cpp, with the two examples device_input and device_input_output.**

* These examples configure the model to run on the device (CUDA) and use device memory
allocated by the user in advance as the model's input and output. The inputs and outputs have
to be declared as residing on the device when the Network is constructed; if not set, they
default to the CPU. The rest is the same as the user-provided-memory usage above. A condensed
sketch follows this section.

* The is_host() interface of a tensor tells whether the tensor is on the device side or the
host side.
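
* A condensed sketch of device_input from device_io.cpp: the input is declared as residing on
the device, the host data is staged into a CUDA tensor, and that device memory is handed to the
network (the model path and the host source tensor are placeholders):

```C++
#include <memory>
#include <string>

#include "lite/network.h"
#include "lite/tensor.h"

void run_with_device_input(const std::string& model_path,
                           std::shared_ptr<lite::Tensor> host_src) {
    //! run the network on CUDA and mark the input "data" as a device tensor
    lite::Config config{LiteDeviceType::LITE_CUDA};
    lite::NetworkIO io;
    lite::IO device_input{"data", false};  //! is_host = false
    io.inputs.push_back(device_input);

    auto network = std::make_shared<lite::Network>(config, io);
    network->load_model(model_path);

    std::shared_ptr<lite::Tensor> input = network->get_input_tensor(0);
    //! copy the host data into device memory, then hand it to the network
    lite::Tensor device_tensor(LiteDeviceType::LITE_CUDA, input->get_layout());
    device_tensor.copy_from(*host_src);
    input->reset(device_tensor.get_memory_ptr(), input->get_layout());

    network->forward();
    network->wait();
}
```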

## Pinned host memory as input

* **Implemented in device_io.cpp, in the function pinned_host_input.**

* In this example the model runs on the device (CUDA) while the input and output stay on the
CPU. To speed up the host2device copy, the memory of the input tensor on the CPU is allocated
in advance as cuda pinned memory. Currently, if an output tensor is not on the device, it is
pinned host memory by default.

* Pinned host memory is allocated by constructing the tensor with the device, the layout and
the is_pinned_host parameter; memory allocated this way is pinned host memory:

```C
bool is_pinned_host = true;
auto tensor_pinned_input =
Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host);
```

## User-defined memory allocator

* **Implemented in user_allocator.cpp, in the function config_user_allocator.**

* This example uses a user-defined CPU memory allocator to show how a custom Allocator is set.
A user-defined allocator has to inherit from the Allocator base class in lite and implement the
allocate and free interfaces. It has currently only been verified on CPU; other devices remain
to be tested. See the sketch after the interface below.

* The interface on Network for setting a custom allocator is:
```C
Network& set_memory_allocator(std::shared_ptr<Allocator> user_allocator);
```
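
* A hypothetical sketch of such an allocator is shown below; the exact virtual signatures of
allocate and free are an assumption here and should be checked against the Allocator
declaration in lite/include/lite/tensor.h:

```C++
#include <cstdlib>
#include <memory>

#include "lite/network.h"
#include "lite/tensor.h"

//! NOTE: the allocate/free signatures below are assumed for illustration only.
class MyCpuAllocator : public lite::Allocator {
public:
    void* allocate(LiteDeviceType device, int device_id, size_t size,
                   size_t align) override {
        (void)device;
        (void)device_id;
        //! plain aligned CPU allocation
        void* ptr = nullptr;
        return posix_memalign(&ptr, align, size) == 0 ? ptr : nullptr;
    }
    void free(LiteDeviceType device, int device_id, void* ptr) override {
        (void)device;
        (void)device_id;
        ::free(ptr);
    }
};

//! install it on a network before the model is loaded:
//! network->set_memory_allocator(std::make_shared<MyCpuAllocator>());
```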

## Multiple Networks sharing the same weights

* **Implemented in network_share_weights.cpp, in the function network_share_same_weights.**

* In many cases the user wants several Networks to share one copy of the weights; because the
weights in a model are read-only, this reduces the runtime memory usage of the model. This
example shows how lite implements this: first create a new Network, for which the user can
specify a new Config and NetworkIO and other settings so that the newly created Network serves
a different purpose.

* The interface on Network for loading a new Network from an existing one is:
```C
static void shared_weight_with_network(
std::shared_ptr<Network> dst_network,
const std::shared_ptr<Network> src_network);
```
* dst_network: the newly loaded Network
* src_network: the existing Network that has already been loaded

## CPU core binding

* **Implemented in cpu_affinity.cpp, in the function cpu_affinity.**

* This example configures the model to run on multiple CPU threads and then uses
set_runtime_thread_affinity of the Network to set a core-binding callback. The callback is
passed the id of the current thread, and the user decides the concrete binding behaviour from
that id; with multiple threads, if the total number of threads is n, the thread with id n-1 is
the main thread. A condensed sketch follows this section.
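
* A condensed sketch, following cpu_affinity.cpp (the binding itself is left as a comment; the
example binds with sched_setaffinity on the listed cores):

```C++
#include <cstdio>
#include <memory>
#include <string>
#include <vector>

#include "lite/network.h"

void bind_cores(const std::string& model_path) {
    auto network = std::make_shared<lite::Network>();
    //! run with 4 threads; the thread count is set before the model is loaded
    lite::Runtime::set_cpu_threads_number(network, 4);
    network->load_model(model_path);

    std::vector<int> core_ids = {0, 1, 2, 3};
    lite::Runtime::set_runtime_thread_affinity(network, [core_ids](int id) {
        //! bind the calling worker thread to core_ids[id] here (e.g. with
        //! sched_setaffinity); id == n - 1 is the main thread for n threads
        std::printf("worker %d -> core %d\n", id, core_ids[id]);
    });
}
```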

## Registering a user-defined decryption algorithm and key

* **Implemented in user_cryption.cpp, in the functions register_cryption_method and
update_aes_key.**

* These two examples use the lite interfaces for registering a custom decryption algorithm and
for updating it, and load a model with a user-defined decryption algorithm. A custom decryption
method is defined (it does not really do anything: it XORs the model with the key twice and
returns it, which is the same as returning the original model directly) and registered into
lite; when a Network is created later, the name of the decryption algorithm is given in
bare_model_cryption_name of its config. The second example shows how to update the key of the
algorithm.
lite currently has several predefined decryption algorithms:
* AES_default: its key consists of 32 unsigned chars, by default 0 to 31.
* RC4_default: its key consists of 8 unsigned chars, a hash key followed by an enc_key.
* SIMPLE_FAST_RC4_default: its key has the same composition as RC4_default.
The rough naming rule is: the upper-case prefix is the name of the algorithm, and the
lower-case part after '_' denotes the decryption key.
The interfaces are:
```C
bool register_decryption_and_key(std::string decrypt_name,
const DecryptionFunc& func,
const std::vector<uint8_t>& key);
bool update_decryption_or_key(std::string decrypt_name,
const DecryptionFunc& func,
const std::vector<uint8_t>& key);
```
The register interface requires all three arguments to be valid; in update, decrypt_name must
be an existing decryption algorithm, and the non-empty parts of func and key are used to update
that algorithm.

## Asynchronous execution mode

* **Implemented in basic.cpp, in the function async_forward.**

* By registering an asynchronous callback through the interface, the user switches the
Network's forward mode to asynchronous execution. Asynchronous mode is currently only supported
on CPU and on CUDA 10.0 and above. With asynchronous inference the main thread can do other
work while the worker thread is computing, which avoids long waits, although there is no gain
on some single-core processors. A minimal sketch follows this section.
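
* A minimal sketch of the asynchronous mode, condensed from async_forward in basic.cpp (the
example additionally disables var_sanity_check_first_run in its Config, and uses a volatile
flag where an std::atomic is used here):

```C++
#include <atomic>
#include <memory>

#include "lite/network.h"

void async_run(std::shared_ptr<lite::Network> network) {
    std::atomic<bool> finished{false};
    //! the callback runs on the worker thread once forward has finished
    network->set_async_callback([&finished]() { finished = true; });

    network->forward();
    while (!finished) {
        //! the main thread is free to do other work here instead of waiting
    }
}
```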

## Pure C examples

* **Implemented in lite_c_interface.cpp, in the functions basic_c_interface,
device_io_c_interface and async_c_interface.**

* Lite wraps the C++ interface and exposes a pure C interface; users who do not depend on Lite
at the source level should integrate through the pure C interface.
* Every pure C interface returns an int; if this int is not 0, an error has occurred and
LITE_get_last_error has to be called to get the error message. A sketch of this check follows
this section.
* Every pure C get function requires the caller to define the corresponding object first and
pass a pointer to that object into the interface; Lite writes the result to the address the
pointer points to.
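
* A sketch of this return-code check, following the LITE_CAPI_CHECK macro used in
lite_c_interface.cpp (the model path is a placeholder):

```C++
#include <cstdio>

#include "lite-c/global_c.h"
#include "lite-c/network_c.h"

#define EXAMPLE_CHECK(_expr)                                \
    do {                                                    \
        int _ret = (_expr);                                 \
        if (_ret) {                                         \
            std::printf("lite error: %s\n",                 \
                        LITE_get_last_error());             \
            return false;                                   \
        }                                                   \
    } while (0)

bool load_with_c_api(const char* model_path) {
    LiteNetwork c_network;
    EXAMPLE_CHECK(LITE_make_network(&c_network, *default_config(),
                                    *default_network_io()));
    EXAMPLE_CHECK(LITE_load_model_from_path(c_network, model_path));
    return true;
}
```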

+370 -0 lite/example/mge/basic.cpp

@@ -0,0 +1,370 @@
/**
* \file example/basic.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include <thread>
#include "../example.h"
#if LITE_BUILD_WITH_MGE
#include <cstdio>

#include "misc.h"

using namespace lite;
using namespace example;

namespace {
void output_info(std::shared_ptr<Network> network, size_t output_size) {
for (size_t index = 0; index < output_size; index++) {
printf("output[%zu] names %s \n", index,
network->get_all_output_name()[index].c_str());
std::shared_ptr<Tensor> output_tensor =
network->get_output_tensor(index);
size_t ndim = output_tensor->get_layout().ndim;
for (size_t i = 0; i < ndim; i++) {
printf("output[%zu] tensor.shape[%zu] %zu \n", index, i,
output_tensor->get_layout().shapes[i]);
}
}
}

void output_data_info(std::shared_ptr<Network> network, size_t output_size) {
for (size_t index = 0; index < output_size; index++) {
auto output_tensor = network->get_output_tensor(index);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
LiteDataType dtype = output_tensor->get_layout().data_type;
float max = -1000.0f;
float min = 1000.0f;
int max_idx = 0;
int min_idx = 0;
float sum = 0.0f;
#define cb(_dtype, _real_dtype) \
case LiteDataType::_dtype: { \
for (size_t i = 0; i < out_length; i++) { \
_real_dtype data = static_cast<_real_dtype*>(out_data)[i]; \
sum += data; \
if (max < data) { \
max = data; \
max_idx = i; \
} \
if (min > data) { \
min = data; \
min_idx = i; \
} \
} \
} break;

switch (dtype) {
cb(LITE_FLOAT, float);
cb(LITE_INT, int);
cb(LITE_INT8, int8_t);
cb(LITE_UINT8, uint8_t);
default:
printf("unknown datatype");
}
printf("output_length %zu index %zu max=%e , max idx=%d, min=%e , min_idx=%d, sum=%e\n",
out_length, index, max, max_idx, min, min_idx, sum);
}
#undef cb
}
} // namespace

#if LITE_WITH_CUDA
bool lite::example::load_from_path_run_cuda(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
set_log_level(LiteLogLevel::DEBUG);
//! config the network running in CUDA device
lite::Config config{false, -1, LiteDeviceType::LITE_CUDA};
//! set NetworkIO
NetworkIO network_io;
std::string input_name = "img0_comp_fullface";
bool is_host = false;
IO device_input{input_name, is_host};
network_io.inputs.push_back(device_input);
//! create and load the network
std::shared_ptr<Network> network =
std::make_shared<Network>(config, network_io);
network->load_model(network_path);

std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
Layout input_layout = input_tensor->get_layout();

//! read data from numpy data file
auto src_tensor = parse_npy(input_path);

//! malloc the device memory
auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

//! copy to the device memory
tensor_device.copy_from(*src_tensor);

//! Now the device memory is filled with user input data, set it to the
//! input tensor
input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);

//! forward
{
lite::Timer ltimer("warmup");
network->forward();
network->wait();
ltimer.print_used_time(0);
}
lite::Timer ltimer("forward_iter");
for (int i = 0; i < 10; i++) {
ltimer.reset_start();
network->forward();
network->wait();
ltimer.print_used_time(i);
}
//! get the output data or read tensor set in network_in
size_t output_size = network->get_all_output_name().size();
output_info(network, output_size);
output_data_info(network, output_size);
return true;
}
#endif
bool lite::example::basic_load_from_path(const Args& args) {
set_log_level(LiteLogLevel::DEBUG);
std::string network_path = args.model_path;
std::string input_path = args.input_path;

//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>();
network->load_model(network_path);
//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);

auto layout = input_tensor->get_layout();
for (size_t i = 0; i < layout.ndim; i++) {
printf("model input shape[%zu]=%zu \n", i, layout.shapes[i]);
}

//! copy or forward data to network
size_t length = input_tensor->get_tensor_total_size_in_byte();
void* dst_ptr = input_tensor->get_memory_ptr();
auto src_tensor = parse_npy(input_path);
auto layout0 = src_tensor->get_layout();
for (size_t i = 0; i < layout0.ndim; i++) {
printf("src shape[%zu]=%zu \n", i, layout0.shapes[i]);
}
void* src = src_tensor->get_memory_ptr();
memcpy(dst_ptr, src, length);

//! forward
{
lite::Timer ltimer("warmup");
network->forward();
network->wait();
ltimer.print_used_time(0);
}
lite::Timer ltimer("forward_iter");
for (int i = 0; i < 10; i++) {
network->forward();
network->wait();
ltimer.print_used_time(i);
}

//! forward
{
lite::Timer ltimer("warmup");
network->forward();
network->wait();
ltimer.print_used_time(0);
}
for (int i = 0; i < 10; i++) {
ltimer.reset_start();
network->forward();
network->wait();
ltimer.print_used_time(i);
}

//! get the output data or read tensor set in network_in
size_t output_size = network->get_all_output_name().size();
output_info(network, output_size);
output_data_info(network, output_size);
return true;
}

bool lite::example::basic_load_from_path_with_loader(const Args& args) {
set_log_level(LiteLogLevel::DEBUG);
lite::set_loader_lib_path(args.loader_path);
std::string network_path = args.model_path;
std::string input_path = args.input_path;

//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>();
network->load_model(network_path);

//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);

auto input_layout = input_tensor->get_layout();

//! copy or forward data to network
auto src_tensor = parse_npy(input_path);
auto src_layout = src_tensor->get_layout();
if (src_layout.ndim != input_layout.ndim) {
printf("src ndim is not equal to the model input ndim\n");
}
//! note that the input shape can change
for (size_t i = 0; i < input_layout.ndim; i++) {
if (input_layout.shapes[i] != src_layout.shapes[i]) {
printf("src shape is not equal to the input shape\n");
}
}
input_tensor->set_layout(src_tensor->get_layout());

//! reset or forward data to network
input_tensor->reset(src_tensor->get_memory_ptr(), src_tensor->get_layout());

//! forward
network->forward();
network->wait();

//! forward
{
lite::Timer ltimer("warmup");
network->forward();
network->wait();
ltimer.print_used_time(0);
}
lite::Timer ltimer("forward_iter");
for (int i = 0; i < 10; i++) {
ltimer.reset_start();
network->forward();
network->wait();
ltimer.print_used_time(i);
}

//! get the output data or read tensor set in network_in
size_t output_size = network->get_all_output_name().size();
output_info(network, output_size);
output_data_info(network, output_size);
return true;
}

bool lite::example::basic_load_from_memory(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;

//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>();

FILE* fin = fopen(network_path.c_str(), "rb");
if (!fin) {
printf("failed to open %s.", network_path.c_str());
}

fseek(fin, 0, SEEK_END);
size_t size = ftell(fin);
fseek(fin, 0, SEEK_SET);
void* ptr = malloc(size);
std::shared_ptr<void> buf{ptr, ::free};
auto len = fread(buf.get(), 1, size, fin);
if (len < 1) {
printf("read file failed.\n");
}
fclose(fin);

network->load_model(buf.get(), size);

//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
//! copy or forward data to network
size_t length = input_tensor->get_tensor_total_size_in_byte();
void* dst_ptr = input_tensor->get_memory_ptr();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
memcpy(dst_ptr, src, length);

//! forward
network->forward();
network->wait();

//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
printf("length=%zu\n", length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}

bool lite::example::async_forward(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
Config config;
config.options.var_sanity_check_first_run = false;

//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>(config);

network->load_model(network_path);

//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
//! copy or forward data to network
size_t length = input_tensor->get_tensor_total_size_in_byte();
void* dst_ptr = input_tensor->get_memory_ptr();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
memcpy(dst_ptr, src, length);

//! set async mode and callback
volatile bool finished = false;
network->set_async_callback([&finished]() {
#if !__DEPLOY_ON_XP_SP2__
std::cout << "worker thread_id:" << std::this_thread::get_id()
<< std::endl;
#endif
finished = true;
});

#if !__DEPLOY_ON_XP_SP2__
std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl;
#endif

//! forward
network->forward();
size_t count = 0;
while (finished == false) {
count++;
}
printf("Forward finish, count is %zu\n", count);

//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
printf("length=%zu\n", length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+69 -0 lite/example/mge/cpu_affinity.cpp

@@ -0,0 +1,69 @@
/**
* \file example/cpu_affinity.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "../example.h"
#if LITE_BUILD_WITH_MGE

using namespace lite;
using namespace example;

bool lite::example::cpu_affinity(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;

//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>();

//! run with multi theads
Runtime::set_cpu_threads_number(network, 4);

network->load_model(network_path);

std::vector<int> core_ids = {0, 1, 2, 3};
auto affinity = [core_ids](int id) {
//! add user define affinity function
set_cpu_affinity({core_ids[id]});
printf("set thread id = %d with the affinity of core %d.\n", id,
core_ids[id]);
};
Runtime::set_runtime_thread_affinity(network, affinity);

//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
//! copy or forward data to network
size_t length = input_tensor->get_tensor_total_size_in_byte();
void* dst_ptr = input_tensor->get_memory_ptr();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
memcpy(dst_ptr, src, length);

//! forward
network->forward();
network->wait();

//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
printf("length=%zu\n", length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+189 -0 lite/example/mge/device_io.cpp

@@ -0,0 +1,189 @@
/**
* \file example/device_io.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include <thread>
#include "../example.h"
#if LITE_BUILD_WITH_MGE

using namespace lite;
using namespace example;

#if LITE_WITH_CUDA

bool lite::example::device_input(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;

//! config the network running in CUDA device
lite::Config config{LiteDeviceType::LITE_CUDA};

//! set NetworkIO
NetworkIO network_io;
std::string input_name = "data";
bool is_host = false;
IO device_input{input_name, is_host};
network_io.inputs.push_back(device_input);

//! create and load the network
std::shared_ptr<Network> network =
std::make_shared<Network>(config, network_io);
network->load_model(network_path);

std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
Layout input_layout = input_tensor->get_layout();

//! read data from numpy data file
auto src_tensor = parse_npy(input_path);

//! malloc the device memory
auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

//! copy to the device memory
tensor_device.copy_from(*src_tensor);

//! Now the device memory if filled with user input data, set it to the
//! input tensor
input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);

//! forward
network->forward();
network->wait();

//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}

bool lite::example::device_input_output(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;

//! config the network running in CUDA device
lite::Config config{LiteDeviceType::LITE_CUDA};

//! set NetworkIO include input and output
NetworkIO network_io;
std::string input_name = "data";
std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]";
bool is_host = false;
IO device_input{input_name, is_host};
IO device_output{output_name, is_host};
network_io.inputs.push_back(device_input);
network_io.outputs.push_back(device_output);

//! create and load the network
std::shared_ptr<Network> network =
std::make_shared<Network>(config, network_io);
network->load_model(network_path);

std::shared_ptr<Tensor> input_tensor_device = network->get_input_tensor(0);
Layout input_layout = input_tensor_device->get_layout();

//! read data from numpy data file
auto src_tensor = parse_npy(input_path);

//! malloc the device memory
auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

//! copy to the device memory
tensor_device.copy_from(*src_tensor);

//! Now the device memory is filled with user input data, set it to the
//! input tensor
input_tensor_device->reset(tensor_device.get_memory_ptr(), input_layout);

//! forward
network->forward();
network->wait();

//! output is in device, should copy it to host
std::shared_ptr<Tensor> output_tensor_device =
network->get_io_tensor(output_name);

auto output_tensor = std::make_shared<Tensor>();
output_tensor->copy_from(*output_tensor_device);

//! get the output data or read tensor set in network_in
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}

bool lite::example::pinned_host_input(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;

//! config the network running in CUDA device
lite::Config config{LiteDeviceType::LITE_CUDA};

//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>(config);
network->load_model(network_path);

std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
Layout input_layout = input_tensor->get_layout();

//! read data from numpy data file
auto src_tensor = parse_npy(input_path);
//! malloc the pinned host memory
bool is_pinned_host = true;
auto tensor_pinned_input =
Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host);
//! copy to the pinned memory
tensor_pinned_input.copy_from(*src_tensor);
//! set the pinned host memory to the network as input
input_tensor->reset(tensor_pinned_input.get_memory_ptr(), input_layout);

//! forward
network->forward();
network->wait();

//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}

#endif
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+224 -0 lite/example/mge/lite_c_interface.cpp

@@ -0,0 +1,224 @@
/**
* \file example/lite_c_interface.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "../example.h"
#include "misc.h"
#if LITE_BUILD_WITH_MGE
#include "lite-c/global_c.h"
#include "lite-c/network_c.h"
#include "lite-c/tensor_c.h"

#include <thread>

#define LITE_CAPI_CHECK(_expr) \
do { \
int _ret = (_expr); \
if (_ret) { \
LITE_THROW(LITE_get_last_error()); \
} \
} while (0)

bool basic_c_interface(const lite::example::Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;

//! read input data to lite::tensor
auto src_tensor = lite::example::parse_npy(input_path);
void* src_ptr = src_tensor->get_memory_ptr();

//! create and load the network
LiteNetwork c_network;
LITE_CAPI_CHECK(
LITE_make_network(&c_network, *default_config(), *default_network_io()));

LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str()));

//! set input data to input tensor
LiteTensor c_input_tensor;
LITE_CAPI_CHECK(
LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor));
void* dst_ptr;
size_t length_in_byte;
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor,
&length_in_byte));
LITE_CAPI_CHECK(LITE_get_tensor_memory(c_input_tensor, &dst_ptr));
//! copy or forward data to network
memcpy(dst_ptr, src_ptr, length_in_byte);

//! forward
LITE_CAPI_CHECK(LITE_forward(c_network));
LITE_CAPI_CHECK(LITE_wait(c_network));

//! get the output data or read tensor data
const char* output_name;
LiteTensor c_output_tensor;
//! get the first output tensor name
LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name));
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO,
&c_output_tensor));
void* output_ptr;
size_t length_output_in_byte;
LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr));
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_output_tensor,
&length_output_in_byte));

size_t out_length = length_output_in_byte / sizeof(float);
printf("length=%zu\n", out_length);

float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(output_ptr)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}

bool device_io_c_interface(const lite::example::Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;

//! read input data to lite::tensor
auto src_tensor = lite::example::parse_npy(input_path);
void* src_ptr = src_tensor->get_memory_ptr();
size_t length_read_in = src_tensor->get_tensor_total_size_in_byte();

//! create and load the network
LiteNetwork c_network;
LITE_CAPI_CHECK(
LITE_make_network(&c_network, *default_config(), *default_network_io()));
LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str()));

//! set input data to input tensor
LiteTensor c_input_tensor;
size_t length_tensor_in;
LITE_CAPI_CHECK(
LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor));
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor,
&length_tensor_in));
if (length_read_in != length_tensor_in) {
LITE_THROW("The input data size does not match the network input tensor "
"size.\n");
}
LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, src_ptr,
length_tensor_in));

//! reset the output tensor memory with user allocated memory
size_t out_length = 1000;
LiteLayout output_layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT};
std::shared_ptr<float> ptr(new float[out_length],
[](float* ptr) { delete[] ptr; });
const char* output_name;
LiteTensor c_output_tensor;
LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name));
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO,
&c_output_tensor));
LITE_CAPI_CHECK(
LITE_reset_tensor(c_output_tensor, output_layout, ptr.get()));

//! forward
LITE_CAPI_CHECK(LITE_forward(c_network));
LITE_CAPI_CHECK(LITE_wait(c_network));

printf("length=%zu\n", out_length);

float max = -1.0f;
float sum = 0.0f;
void* out_data = ptr.get();
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}

namespace {
volatile bool finished = false;
int async_callback(void) {
#if !__DEPLOY_ON_XP_SP2__
std::cout << "worker thread_id:" << std::this_thread::get_id() << std::endl;
#endif
finished = true;
return 0;
}
} // namespace

bool async_c_interface(const lite::example::Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;

//! read input data to lite::tensor
auto src_tensor = lite::example::parse_npy(input_path);
void* src_ptr = src_tensor->get_memory_ptr();

LiteNetwork c_network;
LiteConfig config = *default_config();
config.options.var_sanity_check_first_run = false;
LITE_CAPI_CHECK(LITE_make_network(&c_network, config, *default_network_io()));
LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str()));

//! set input data to input tensor
LiteTensor c_input_tensor;
size_t length_tensor_in;
LITE_CAPI_CHECK(
LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor));
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor,
&length_tensor_in));
LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, src_ptr,
length_tensor_in));

#if !__DEPLOY_ON_XP_SP2__
std::cout << "user thread_id:" << std::this_thread::get_id() << std::endl;
#endif

LITE_CAPI_CHECK(LITE_set_async_callback(c_network, async_callback));
//! forward
LITE_CAPI_CHECK(LITE_forward(c_network));
size_t count = 0;
while (finished == false) {
count++;
}
printf("The count is %zu\n", count);
finished = false;

//! get the output data or read tensor data
const char* output_name;
LiteTensor c_output_tensor;
//! get the first output tensor name
LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name));
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO,
&c_output_tensor));
void* output_ptr;
size_t length_output_in_byte;
LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr));
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_output_tensor,
&length_output_in_byte));

size_t out_length = length_output_in_byte / sizeof(float);
printf("length=%zu\n", out_length);

float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(output_ptr)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+78 -0 lite/example/mge/network_share_weights.cpp

@@ -0,0 +1,78 @@
/**
* \file example/network_share_weights.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "../example.h"
#if LITE_BUILD_WITH_MGE

using namespace lite;
using namespace example;

bool lite::example::network_share_same_weights(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;

//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>();
network->load_model(network_path);

    //! create a new network that shares the same weights with the loaded network
Config config_new;
config_new.options.const_shape = true;
NetworkIO network_io_new;
std::shared_ptr<Network> weight_shared_network =
std::make_shared<Network>(config_new, network_io_new);
Runtime::shared_weight_with_network(weight_shared_network, network);

//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
void* dst_ptr = input_tensor->get_memory_ptr();
std::shared_ptr<Tensor> input_tensor2 =
weight_shared_network->get_input_tensor(0);
void* dst_ptr2 = input_tensor2->get_memory_ptr();
//! copy or forward data to network
size_t length = input_tensor->get_tensor_total_size_in_byte();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
memcpy(dst_ptr, src, length);
memcpy(dst_ptr2, src, length);

//! forward
network->forward();
network->wait();

weight_shared_network->forward();
weight_shared_network->wait();

//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
std::shared_ptr<Tensor> output_tensor2 =
weight_shared_network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
void* out_data2 = output_tensor2->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
printf("length=%zu\n", length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
float data2 = static_cast<float*>(out_data2)[i];
if (data != data2) {
printf("the result between the origin network and weight share "
"netwrok is different.\n");
}
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 95
- 0
lite/example/mge/reset_io.cpp View File

@@ -0,0 +1,95 @@
/**
* \file example/reset_io.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "../example.h"
#if LITE_BUILD_WITH_MGE

using namespace lite;
using namespace example;

bool lite::example::reset_input(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
lite::Config config;

//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>(config);
network->load_model(network_path);

//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
auto layout = input_tensor->get_layout();

auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
input_tensor->reset(src, layout);

//! forward
network->forward();
network->wait();

    //! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}

bool lite::example::reset_input_output(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;
lite::Config config;

//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>(config);
network->load_model(network_path);

//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
auto layout = input_tensor->get_layout();

auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
input_tensor->reset(src, layout);

//! set output ptr to store the network output
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
auto result_tensor = std::make_shared<Tensor>(
LiteDeviceType::LITE_CPU,
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});

void* out_data = result_tensor->get_memory_ptr();
output_tensor->reset(out_data, result_tensor->get_layout());

network->forward();
network->wait();

float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < 1000; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 89
- 0
lite/example/mge/user_allocator.cpp View File

@@ -0,0 +1,89 @@
/**
* \file example/user_allocator.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "../example.h"
#if LITE_BUILD_WITH_MGE
using namespace lite;
using namespace example;

namespace {
class CheckAllocator : public lite::Allocator {
public:
//! allocate memory of size in the given device with the given align
void* allocate(LiteDeviceType, int, size_t size, size_t align) override {
#ifdef WIN32
return _aligned_malloc(size, align);
#elif defined(__ANDROID__) || defined(ANDROID)
return memalign(align, size);
#else
void* ptr = nullptr;
auto err = posix_memalign(&ptr, align, size);
        if (err) {
            printf("failed to alloc %zu bytes with align %zu\n", size, align);
}
return ptr;
#endif
};

//! free the memory pointed by ptr in the given device
void free(LiteDeviceType, int, void* ptr) override {
#ifdef WIN32
_aligned_free(ptr);
#else
::free(ptr);
#endif
};
};
} // namespace

bool lite::example::config_user_allocator(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;

auto allocator = std::make_shared<CheckAllocator>();

//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>();

Runtime::set_memory_allocator(network, allocator);

network->load_model(network_path);

//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
//! copy or forward data to network
size_t length = input_tensor->get_tensor_total_size_in_byte();
void* dst_ptr = input_tensor->get_memory_ptr();
auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
memcpy(dst_ptr, src, length);

//! forward
network->forward();
network->wait();

//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
printf("length=%zu\n", length);
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 122
- 0
lite/example/mge/user_cryption.cpp View File

@@ -0,0 +1,122 @@
/**
* \file example/user_cryption.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "../example.h"
#if LITE_BUILD_WITH_MGE

using namespace lite;
using namespace example;

namespace {
std::vector<uint8_t> decrypt_model(const void* model_mem, size_t size,
const std::vector<uint8_t>& key) {
if (key.size() == 1) {
std::vector<uint8_t> ret(size, 0);
const uint8_t* ptr = static_cast<const uint8_t*>(model_mem);
uint8_t key_data = key[0];
for (size_t i = 0; i < size; i++) {
ret[i] = ptr[i] ^ key_data ^ key_data;
}
return ret;
} else {
printf("the user define decrypt method key length is wrong.\n");
return {};
}
}
} // namespace

bool lite::example::register_cryption_method(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;

//! register the decryption method
register_decryption_and_key("just_for_test", decrypt_model, {15});

lite::Config config;
config.bare_model_cryption_name = "just_for_test";
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>(config);
network->load_model(network_path);

//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
auto layout = input_tensor->get_layout();

auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
input_tensor->reset(src, layout);

//! forward
network->forward();
network->wait();

//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}

bool lite::example::update_cryption_key(const Args& args) {
std::string network_path = args.model_path;
std::string input_path = args.input_path;

//! update the decryption method key
std::vector<uint8_t> key(32, 0);
for (size_t i = 0; i < 32; i++) {
key[i] = 31 - i;
}
update_decryption_or_key("AES_default", nullptr, key);

lite::Config config;
config.bare_model_cryption_name = "AES_default";
//! create and load the network
std::shared_ptr<Network> network = std::make_shared<Network>(config);
network->load_model(network_path);

//! set input data to input tensor
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
auto layout = input_tensor->get_layout();

auto src_tensor = parse_npy(input_path);
void* src = src_tensor->get_memory_ptr();
input_tensor->reset(src, layout);

//! forward
network->forward();
network->wait();

//! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
void* out_data = output_tensor->get_memory_ptr();
size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
output_tensor->get_layout().get_elem_size();
float max = -1.0f;
float sum = 0.0f;
for (size_t i = 0; i < out_length; i++) {
float data = static_cast<float*>(out_data)[i];
sum += data;
if (max < data)
max = data;
}
printf("max=%e, sum=%e\n", max, sum);
return true;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 638
- 0
lite/example/npy.h View File

@@ -0,0 +1,638 @@
/*
Copyright 2017 Leon Merten Lohse

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#ifndef NPY_H
#define NPY_H

#include <algorithm>
#include <complex>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

namespace npy {

/* Compile-time test for byte order.
If your compiler does not define these per default, you may want to define
one of these constants manually.
Defaults to little endian order. */
#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || \
defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \
defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || \
defined(__MIBSEB) || defined(__MIBSEB__)
const bool big_endian = true;
#else
const bool big_endian = false;
#endif

const char magic_string[] = "\x93NUMPY";
const size_t magic_string_length = 6;

const char little_endian_char = '<';
const char big_endian_char = '>';
const char no_endian_char = '|';

constexpr char host_endian_char =
(big_endian ? big_endian_char : little_endian_char);

/* npy array length */
typedef unsigned long int ndarray_len_t;

inline void write_magic(std::ostream& ostream, unsigned char v_major = 1,
unsigned char v_minor = 0) {
ostream.write(magic_string, magic_string_length);
ostream.put(v_major);
ostream.put(v_minor);
}

inline void read_magic(std::istream& istream, unsigned char& v_major,
unsigned char& v_minor) {
char buf[magic_string_length + 2];
istream.read(buf, magic_string_length + 2);

if (!istream) {
fprintf(stderr, "io error: failed reading file");
}

if (0 != std::memcmp(buf, magic_string, magic_string_length)) {
fprintf(stderr, "this file does not have a valid npy format.");
}

v_major = buf[magic_string_length];
v_minor = buf[magic_string_length + 1];
}

// typestring magic
struct Typestring {
private:
char c_endian;
char c_type;
int len;

public:
inline std::string str() {
const size_t max_buflen = 16;
char buf[max_buflen];
std::sprintf(buf, "%c%c%u", c_endian, c_type, len);
return std::string(buf);
}

Typestring(const std::vector<float>&)
: c_endian{host_endian_char}, c_type{'f'}, len{sizeof(float)} {}
Typestring(const std::vector<double>&)
: c_endian{host_endian_char}, c_type{'f'}, len{sizeof(double)} {}
Typestring(const std::vector<long double>&)
: c_endian{host_endian_char},
c_type{'f'},
len{sizeof(long double)} {}

Typestring(const std::vector<char>&)
: c_endian{no_endian_char}, c_type{'i'}, len{sizeof(char)} {}
Typestring(const std::vector<short>&)
: c_endian{host_endian_char}, c_type{'i'}, len{sizeof(short)} {}
Typestring(const std::vector<int>&)
: c_endian{host_endian_char}, c_type{'i'}, len{sizeof(int)} {}
Typestring(const std::vector<long>&)
: c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long)} {}
Typestring(const std::vector<long long>&)
: c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long long)} {}

Typestring(const std::vector<unsigned char>&)
: c_endian{no_endian_char},
c_type{'u'},
len{sizeof(unsigned char)} {}
Typestring(const std::vector<unsigned short>&)
: c_endian{host_endian_char},
c_type{'u'},
len{sizeof(unsigned short)} {}
Typestring(const std::vector<unsigned int>&)
: c_endian{host_endian_char},
c_type{'u'},
len{sizeof(unsigned int)} {}
Typestring(const std::vector<unsigned long>&)
: c_endian{host_endian_char},
c_type{'u'},
len{sizeof(unsigned long)} {}
Typestring(const std::vector<unsigned long long>&)
: c_endian{host_endian_char},
c_type{'u'},
len{sizeof(unsigned long long)} {}

Typestring(const std::vector<std::complex<float>>&)
: c_endian{host_endian_char},
c_type{'c'},
len{sizeof(std::complex<float>)} {}
Typestring(const std::vector<std::complex<double>>&)
: c_endian{host_endian_char},
c_type{'c'},
len{sizeof(std::complex<double>)} {}
Typestring(const std::vector<std::complex<long double>>&)
: c_endian{host_endian_char},
c_type{'c'},
len{sizeof(std::complex<long double>)} {}
};

inline void parse_typestring(std::string typestring) {
std::regex re("'([<>|])([ifuc])(\\d+)'");
std::smatch sm;

std::regex_match(typestring, sm, re);

if (sm.size() != 4) {
fprintf(stderr, "invalid typestring");
}
}

namespace pyparse {

/**
Removes leading and trailing whitespaces
*/
inline std::string trim(const std::string& str) {
const std::string whitespace = " \t";
auto begin = str.find_first_not_of(whitespace);

if (begin == std::string::npos)
return "";

auto end = str.find_last_not_of(whitespace);

return str.substr(begin, end - begin + 1);
}

inline std::string get_value_from_map(const std::string& mapstr) {
size_t sep_pos = mapstr.find_first_of(":");
if (sep_pos == std::string::npos)
return "";

std::string tmp = mapstr.substr(sep_pos + 1);
return trim(tmp);
}

/**
Parses the string representation of a Python dict

The keys need to be known and may not appear anywhere else in the data.
*/
inline std::unordered_map<std::string, std::string> parse_dict(
std::string in, std::vector<std::string>& keys) {
std::unordered_map<std::string, std::string> map;

if (keys.size() == 0)
return map;

in = trim(in);

// unwrap dictionary
if ((in.front() == '{') && (in.back() == '}'))
in = in.substr(1, in.length() - 2);
else {
fprintf(stderr, "Not a Python dictionary.");
}

std::vector<std::pair<size_t, std::string>> positions;

for (auto const& value : keys) {
size_t pos = in.find("'" + value + "'");

if (pos == std::string::npos) {
fprintf(stderr, "Missing %s key.", value.c_str());
}

std::pair<size_t, std::string> position_pair{pos, value};
positions.push_back(position_pair);
}

// sort by position in dict
std::sort(positions.begin(), positions.end());

for (size_t i = 0; i < positions.size(); ++i) {
std::string raw_value;
size_t begin{positions[i].first};
size_t end{std::string::npos};

std::string key = positions[i].second;

if (i + 1 < positions.size())
end = positions[i + 1].first;

raw_value = in.substr(begin, end - begin);

raw_value = trim(raw_value);

if (raw_value.back() == ',')
raw_value.pop_back();

map[key] = get_value_from_map(raw_value);
}

return map;
}

/**
Parses the string representation of a Python boolean
*/
inline bool parse_bool(const std::string& in) {
if (in == "True")
return true;
if (in == "False")
return false;

fprintf(stderr, "Invalid python boolan.");
return false;
}

/**
Parses the string representation of a Python str
*/
inline std::string parse_str(const std::string& in) {
if ((in.front() == '\'') && (in.back() == '\''))
return in.substr(1, in.length() - 2);

fprintf(stderr, "Invalid python string.");
return "";
}

/**
Parses the string representation of a Python tuple into a vector of its items
*/
inline std::vector<std::string> parse_tuple(std::string in) {
std::vector<std::string> v;
const char seperator = ',';

in = trim(in);

if ((in.front() == '(') && (in.back() == ')'))
in = in.substr(1, in.length() - 2);
else {
fprintf(stderr, "Invalid Python tuple.");
}

std::istringstream iss(in);

for (std::string token; std::getline(iss, token, seperator);) {
v.push_back(token);
}

return v;
}

template <typename T>
inline std::string write_tuple(const std::vector<T>& v) {
if (v.size() == 0)
return "";

std::ostringstream ss;

if (v.size() == 1) {
ss << "(" << v.front() << ",)";
} else {
const std::string delimiter = ", ";
// v.size() > 1
ss << "(";
std::copy(v.begin(), v.end() - 1,
std::ostream_iterator<T>(ss, delimiter.c_str()));
ss << v.back();
ss << ")";
}

return ss.str();
}

inline std::string write_boolean(bool b) {
if (b)
return "True";
else
return "False";
}

} // namespace pyparse

inline void parse_header(std::string header, std::string& descr) {
/*
    The first 6 bytes are a magic string: exactly "\x93NUMPY".
The next 1 byte is an unsigned byte: the major version number of the file
format, e.g. x01. The next 1 byte is an unsigned byte: the minor version
number of the file format, e.g. x00. Note: the version of the file format
is not tied to the version of the numpy package. The next 2 bytes form a
little-endian unsigned short int: the length of the header data
HEADER_LEN. The next HEADER_LEN bytes form the header data describing the
array's format. It is an ASCII string which contains a Python literal
    expression of a dictionary. It is terminated by a newline ('\n') and
    padded with spaces
    ('\x20') to make the total length of the magic string + 4 + HEADER_LEN be
evenly divisible by 16 for alignment purposes. The dictionary contains
three keys:

"descr" : dtype.descr
An object that can be passed as an argument to the numpy.dtype()
constructor to create the array's dtype. For repeatability and
readability, this dictionary is formatted using pprint.pformat() so the
keys are in alphabetic order.
*/

// remove trailing newline
if (header.back() != '\n')
fprintf(stderr, "invalid header");
header.pop_back();

// parse the dictionary
std::vector<std::string> keys{"descr"};
auto dict_map = npy::pyparse::parse_dict(header, keys);

if (dict_map.size() == 0)
fprintf(stderr, "invalid dictionary in header");

std::string descr_s = dict_map["descr"];
parse_typestring(descr_s);
// remove
descr = npy::pyparse::parse_str(descr_s);
return;
}

inline void parse_header(std::string header, std::string& descr,
bool& fortran_order,
std::vector<ndarray_len_t>& shape) {
/*
    The first 6 bytes are a magic string: exactly "\x93NUMPY".
The next 1 byte is an unsigned byte: the major version number of the file
format, e.g. x01. The next 1 byte is an unsigned byte: the minor version
number of the file format, e.g. x00. Note: the version of the file format
is not tied to the version of the numpy package. The next 2 bytes form a
little-endian unsigned short int: the length of the header data
HEADER_LEN. The next HEADER_LEN bytes form the header data describing the
array's format. It is an ASCII string which contains a Python literal
    expression of a dictionary. It is terminated by a newline ('\n') and
    padded with spaces
    ('\x20') to make the total length of the magic string + 4 + HEADER_LEN be
evenly divisible by 16 for alignment purposes. The dictionary contains
three keys:

"descr" : dtype.descr
An object that can be passed as an argument to the numpy.dtype()
constructor to create the array's dtype. "fortran_order" : bool Whether
the array data is Fortran-contiguous or not. Since Fortran-contiguous
arrays are a common form of non-C-contiguity, we allow them to be written
directly to disk for efficiency. "shape" : tuple of int The shape of the
array. For repeatability and readability, this dictionary is formatted
using pprint.pformat() so the keys are in alphabetic order.
*/

// remove trailing newline
if (header.back() != '\n')
fprintf(stderr, "invalid header");
header.pop_back();

// parse the dictionary
std::vector<std::string> keys{"descr", "fortran_order", "shape"};
auto dict_map = npy::pyparse::parse_dict(header, keys);

if (dict_map.size() == 0)
fprintf(stderr, "invalid dictionary in header");

std::string descr_s = dict_map["descr"];
std::string fortran_s = dict_map["fortran_order"];
std::string shape_s = dict_map["shape"];

// TODO: extract info from typestring
parse_typestring(descr_s);
// remove
descr = npy::pyparse::parse_str(descr_s);

// convert literal Python bool to C++ bool
fortran_order = npy::pyparse::parse_bool(fortran_s);

// parse the shape tuple
auto shape_v = npy::pyparse::parse_tuple(shape_s);
if (shape_v.size() == 0)
fprintf(stderr, "invalid shape tuple in header");

for (auto item : shape_v) {
ndarray_len_t dim = static_cast<ndarray_len_t>(std::stoul(item));
shape.push_back(dim);
}
}

inline std::string write_header_dict(const std::string& descr,
bool fortran_order,
const std::vector<ndarray_len_t>& shape) {
std::string s_fortran_order = npy::pyparse::write_boolean(fortran_order);
std::string shape_s = npy::pyparse::write_tuple(shape);

return "{'descr': '" + descr + "', 'fortran_order': " + s_fortran_order +
", 'shape': " + shape_s + ", }";
}

inline void write_header(std::ostream& out, const std::string& descr,
bool fortran_order,
const std::vector<ndarray_len_t>& shape_v) {
std::string header_dict = write_header_dict(descr, fortran_order, shape_v);

size_t length = magic_string_length + 2 + 2 + header_dict.length() + 1;

unsigned char version[2] = {1, 0};
if (length >= 255 * 255) {
length = magic_string_length + 2 + 4 + header_dict.length() + 1;
version[0] = 2;
version[1] = 0;
}
size_t padding_len = 16 - length % 16;
std::string padding(padding_len, ' ');

// write magic
write_magic(out, version[0], version[1]);

// write header length
if (version[0] == 1 && version[1] == 0) {
char header_len_le16[2];
uint16_t header_len = static_cast<uint16_t>(header_dict.length() +
padding.length() + 1);

header_len_le16[0] = (header_len >> 0) & 0xff;
header_len_le16[1] = (header_len >> 8) & 0xff;
out.write(reinterpret_cast<char*>(header_len_le16), 2);
} else {
char header_len_le32[4];
uint32_t header_len = static_cast<uint32_t>(header_dict.length() +
padding.length() + 1);

header_len_le32[0] = (header_len >> 0) & 0xff;
header_len_le32[1] = (header_len >> 8) & 0xff;
header_len_le32[2] = (header_len >> 16) & 0xff;
header_len_le32[3] = (header_len >> 24) & 0xff;
out.write(reinterpret_cast<char*>(header_len_le32), 4);
}

out << header_dict << padding << '\n';
}

inline std::string read_header(std::istream& istream) {
    // check magic bytes and version number
unsigned char v_major, v_minor;
read_magic(istream, v_major, v_minor);

uint32_t header_length = 0;
if (v_major == 1 && v_minor == 0) {
char header_len_le16[2];
istream.read(header_len_le16, 2);
header_length = (header_len_le16[0] << 0) | (header_len_le16[1] << 8);

if ((magic_string_length + 2 + 2 + header_length) % 16 != 0) {
// TODO: display warning
}
} else if (v_major == 2 && v_minor == 0) {
char header_len_le32[4];
istream.read(header_len_le32, 4);

header_length = (header_len_le32[0] << 0) | (header_len_le32[1] << 8) |
(header_len_le32[2] << 16) | (header_len_le32[3] << 24);

if ((magic_string_length + 2 + 4 + header_length) % 16 != 0) {
// TODO: display warning
}
} else {
fprintf(stderr, "unsupported file format version");
}

auto buf_v = std::vector<char>();
    buf_v.resize(header_length);
istream.read(buf_v.data(), header_length);
std::string header(buf_v.data(), header_length);

return header;
}

inline ndarray_len_t comp_size(const std::vector<ndarray_len_t>& shape) {
ndarray_len_t size = 1;
for (ndarray_len_t i : shape)
size *= i;

return size;
}

template <typename Scalar>
inline void SaveArrayAsNumpy(const std::string& filename, bool fortran_order,
unsigned int n_dims, const unsigned long shape[],
const std::vector<Scalar>& data) {
Typestring typestring_o(data);
std::string typestring = typestring_o.str();

std::ofstream stream(filename, std::ofstream::binary);
if (!stream) {
fprintf(stderr, "io error: failed to open a file.");
}

std::vector<ndarray_len_t> shape_v(shape, shape + n_dims);
write_header(stream, typestring, fortran_order, shape_v);

auto size = static_cast<size_t>(comp_size(shape_v));

stream.write(reinterpret_cast<const char*>(data.data()),
sizeof(Scalar) * size);
}

template <typename Scalar>
inline void LoadArrayFromNumpy(const std::string& filename,
std::vector<unsigned long>& shape,
std::vector<Scalar>& data) {
bool fortran_order;
LoadArrayFromNumpy<Scalar>(filename, shape, fortran_order, data);
}

template <typename Scalar>
inline void LoadArrayFromNumpy(const std::string& filename,
std::vector<unsigned long>& shape,
bool& fortran_order, std::vector<Scalar>& data) {
std::ifstream stream(filename, std::ifstream::binary);
if (!stream) {
fprintf(stderr, "io error: failed to open a file.");
}

std::string header = read_header(stream);

// parse header
std::string typestr;

parse_header(header, typestr, fortran_order, shape);

// check if the typestring matches the given one
Typestring typestring_o{data};
std::string expect_typestr = typestring_o.str();
if (typestr != expect_typestr) {
fprintf(stderr, "formatting error: typestrings not matching");
}

// compute the data size based on the shape
auto size = static_cast<size_t>(comp_size(shape));
data.resize(size);

// read the data
stream.read(reinterpret_cast<char*>(data.data()), sizeof(Scalar) * size);
}

inline void LoadArrayFromNumpy(const std::string& filename,
std::string& type_str,
std::vector<ndarray_len_t>& shape,
std::vector<int8_t>& data) {
std::ifstream stream(filename, std::ifstream::binary);
if (!stream) {
fprintf(stderr, "io error: failed to open a file.");
}

std::string header = read_header(stream);
bool fortran_order;
// parse header
parse_header(header, type_str, fortran_order, shape);

// check if the typestring matches the given one
std::string size_str = type_str.substr(type_str.size() - 1);
size_t elem_size = atoi(size_str.c_str());

// compute the data size based on the shape
auto byte_size = elem_size * static_cast<size_t>(comp_size(shape));
data.resize(byte_size);

// read the data
stream.read(reinterpret_cast<char*>(data.data()), byte_size);
}

} // namespace npy

#endif // NPY_H
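
As a quick usage sketch of the loader above (the file name `input.npy` is illustrative), a float32 array can be read back together with its shape:

#include <cstdio>
#include <vector>

#include "npy.h"

int main() {
    std::vector<unsigned long> shape;
    std::vector<float> data;
    //! parses the magic and header, checks the typestring against float, fills data
    npy::LoadArrayFromNumpy("input.npy", shape, data);

    size_t elems = 1;
    for (auto dim : shape)
        elems *= dim;
    printf("ndim=%zu, elements=%zu\n", shape.size(), elems);
    return 0;
}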

+ 97
- 0
lite/include/lite/common_enum_c.h View File

@@ -0,0 +1,97 @@
/**
 * \file include/lite/common_enum_c.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#ifndef LITE_COMMON_ENUM_C_H_
#define LITE_COMMON_ENUM_C_H_

/*!
* \brief The log level.
*/
typedef enum LiteLogLevel {
DEBUG = 0, /*!< The lowest level and most verbose */
    INFO = 1,  /*!< Print information, warnings and errors */
WARN = 2, /*!< Print only warning and errors */
ERROR = 3, /*!< Print only errors */
} LiteLogLevel;

typedef enum LiteBackend {
LITE_DEFAULT = 0, //! default backend is mge
} LiteBackend;

typedef enum LiteDeviceType {
LITE_CPU = 0,
LITE_CUDA = 1,
LITE_ATLAS = 3,
LITE_NPU = 4,
    //! when the device information is already set in the model, set
    //! LITE_DEVICE_DEFAULT in lite
LITE_DEVICE_DEFAULT = 5,
} LiteDeviceType;

typedef enum LiteDataType {
LITE_FLOAT = 0,
LITE_HALF = 1,
LITE_INT = 2,
LITE_INT16 = 3,
LITE_INT8 = 4,
LITE_UINT8 = 5,
LITE_UINT = 6,
LITE_UINT16 = 7,
LITE_INT64 = 8,
} LiteCDataType;

typedef enum LiteTensorPhase {
//! Tensor maybe input or output
LITE_IO = 0,
//! Tensor is input
LITE_INPUT = 1,
//! Tensor is output
LITE_OUTPUT = 2,
} LiteTensorPhase;

/*!
 * \brief the input and output type, including SHAPE and VALUE;
 * sometimes the user only needs the shape of the output tensor
*/
typedef enum LiteIOType {
LITE_IO_VALUE = 0,
LITE_IO_SHAPE = 1,
} LiteIOType;

/*!
 * \brief operation algorithm selection strategy type. Some operations have
 * multiple algorithms, and different algorithms have different attributes;
 * according to the strategy, the best algorithm will be selected.
 *
 * Note: These strategies can be combined
 *
 * 1. LITE_ALGO_HEURISTIC | LITE_ALGO_PROFILE means: if the profile cache is not
 * valid, use heuristic instead
 *
 * 2. LITE_ALGO_HEURISTIC | LITE_ALGO_REPRODUCIBLE means: heuristically choose a
 * reproducible algorithm
 *
 * 3. LITE_ALGO_PROFILE | LITE_ALGO_REPRODUCIBLE means: profile the best
 * algorithm from the reproducible algorithm set
 *
 * 4. LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED means: profile the best
 * algorithm from the optimized algorithms, so profiling runs faster
 *
 * 5. LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED | LITE_ALGO_REPRODUCIBLE means:
 * profile the best algorithm from the optimized and reproducible algorithms
*/
typedef enum LiteAlgoSelectStrategy {
LITE_ALGO_HEURISTIC = 1 << 0,
LITE_ALGO_PROFILE = 1 << 1,
LITE_ALGO_REPRODUCIBLE = 1 << 2,
LITE_ALGO_OPTIMIZED = 1 << 3,
} LiteAlgoSelectStrategy;

#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
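
As a hedged sketch of how the combinable strategy bits above are used from the C++ API (`Runtime::set_network_algo_policy` is declared in lite/include/lite/network.h in this commit; the function name `enable_profile_reproducible` is illustrative):

#include <memory>

#include "lite/network.h"

using namespace lite;

void enable_profile_reproducible(std::shared_ptr<Network> network) {
    //! case 3 above: profile the best algorithm among the reproducible set
    LiteAlgoSelectStrategy strategy =
            LiteAlgoSelectStrategy::LITE_ALGO_PROFILE |
            LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE;
    Runtime::set_network_algo_policy(network, strategy);
}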

+ 157
- 0
lite/include/lite/global.h View File

@@ -0,0 +1,157 @@
/**
 * \file include/lite/global.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once

#include "macro.h"
#include "network.h"

#include <functional>
#include <memory>
#include <vector>

namespace lite {

/**
* \brief Model decryption function
*
 * \param[in] const void* is the memory pointer of the encrypted model
 * \param[in] size_t the size of the encrypted model memory in bytes
* \param[in] const std::vector<uint8_t>& the decryption key vector
*/
using DecryptionFunc = std::function<std::vector<uint8_t>(
const void*, size_t, const std::vector<uint8_t>&)>;

/**
* \brief register a custom decryption method and key to lite.
*
* \param[in] decrypt_name the name of the decryption, which will act as the
* hash key to find the decryption method.
*
* \param[in] func the decryption function, which will decrypt the model with
 * the registered key and return a vector that contains the decrypted model.
*
* \param[in] key the decryption key of the method
*/
LITE_API bool register_decryption_and_key(std::string decrypt_name,
const DecryptionFunc& func,
const std::vector<uint8_t>& key);

/**
* \brief update decryption function or key of a custom decryption method.
*
* \param[in] decrypt_name the name of the decryption, which will act as the
* hash key to find the decryption method.
*
* \param[in] func the decryption function, which will decrypt the model with
 * the registered key and return a vector that contains the decrypted model. If
 * the function is nullptr, it will not be updated.
*
* \param[in] key the decryption key of the method, if the size of key is zero,
* it will not be updated
*/
LITE_API bool update_decryption_or_key(std::string decrypt_name,
const DecryptionFunc& func,
const std::vector<uint8_t>& key);

/**
* \brief Model information parse function
*
* \param[in] const void* is the information memory
* \param[in] size_t the size the information memory
 * \param[in] const std::string the model name used to check whether the
 * information matches the model
* \param[in] Config the model config, ParseInfoFunc can fill it with the
* information in json, the config will influence Network loading later
* \param[in] NetworkIO the model IO, ParseInfoFunc can fill it with the
* information in json, the networkio will influence Network forwarding later
* \param[in] std::unordered_map<std::string, LiteAny>& isolated_config_map, the
 * other configs not included in config and networkIO, ParseInfoFunc can fill it
 * with the information in json, now supporting:
* "device_id" : int, default 0
* "number_threads" : size_t, default 1
* "is_inplace_model" : bool, default false
* "use_tensorrt" : bool, default false
*/
using ParseInfoFunc = std::function<bool(
const void*, size_t, const std::string model_name, Config& config,
NetworkIO& network_io,
std::unordered_map<std::string, LiteAny>& isolated_config_map,
std::string& extra_info)>;

/**
* \brief register a custom parser function to lite.
*
* \param[in] info_type the name of the parser function, which will act as the
* hash key to find the parser method.
*
* \param[in] parse_func the parser function, which will parse the given
* information and modify the Network Config and IO.
*
*/
LITE_API bool register_parse_info_func(std::string info_type,
const ParseInfoFunc& parse_func);

/*! \brief Get version
*/
LITE_API void get_version(int& major, int& minor, int& patch);

/*! \brief Set the current log level.
* \param[in] level The new log level
*/
LITE_API void set_log_level(LiteLogLevel level);

/*! \brief Get the current log level.
* \return The current log level
*/
LITE_API LiteLogLevel get_log_level();

/*! \brief Get device count
* \param[in] device_type device type
* \return the device count
*/
LITE_API size_t get_device_count(LiteDeviceType device_type);

/*! \brief try to coalesce all free memory in megengine
*/
LITE_API void try_coalesce_all_free_memory();

/*!
 * \brief Set the loader library to lite
 * \param loader_path is the file path of the loader library
*/
LITE_API void set_loader_lib_path(const std::string& loader_path);

/*!
* \brief Set the algo policy cache file for CPU/CUDA ...
* \param cache_path is the file path which store the cache
* \param always_sync sync the cache when model run
*/
LITE_API void set_persistent_cache(const std::string& cache_path,
bool always_sync = false);

/*!
 * \brief dump the PersistentCache policy cache to a file; if the network is set
 * to profile during forward, the algo policy will be dumped to the file
*/
LITE_API void dump_persistent_cache(const std::string& cache_path);

/*!
* \brief Set the TensorRT engine cache path for serialized prebuilt ICudaEngine
*/
LITE_API void set_tensor_rt_cache(std::string tensorrt_cache_path);

/*!
* \brief dump the TensorRT cache to the file set in set_tensor_rt_cache
*/
LITE_API void dump_tensor_rt_cache();

} // namespace lite

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
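
A minimal sketch of the fast-run cache round trip described by set_persistent_cache and dump_persistent_cache above (cache path and model path are illustrative; input filling is elided as in the examples earlier in this commit):

#include <memory>
#include <string>

#include "lite/global.h"
#include "lite/network.h"

using namespace lite;

void run_with_algo_cache(const std::string& model_path) {
    //! reuse a previously dumped algo-policy cache if it exists
    set_persistent_cache("./algo_policy.cache");

    std::shared_ptr<Network> network = std::make_shared<Network>();
    network->load_model(model_path);
    //! ... fill the input tensor as in the examples above ...
    network->forward();
    network->wait();

    //! write the (possibly updated) policy back to disk
    dump_persistent_cache("./algo_policy.cache");
}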

+ 20
- 0
lite/include/lite/macro.h View File

@@ -0,0 +1,20 @@
/**
* \file include/lite/macro.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#ifndef LITE_MACRO_H_
#define LITE_MACRO_H_

#if defined(_WIN32)
#define LITE_API __declspec(dllexport)
#else
#define LITE_API __attribute__((visibility("default")))
#endif
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 368
- 0
lite/include/lite/network.h View File

@@ -0,0 +1,368 @@
/**
 * \file include/lite/network.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once

#include "macro.h"
#include "tensor.h"

#include <functional>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

namespace lite {

LITE_API inline LiteAlgoSelectStrategy operator|(LiteAlgoSelectStrategy x,
LiteAlgoSelectStrategy y) {
return static_cast<LiteAlgoSelectStrategy>(static_cast<uint32_t>(x) |
static_cast<uint32_t>(y));
}

/*!
 * \brief the inference options which will be translated to megengine
 *
 * \param weight_preprocess is the option which optimizes inference performance
 * by preprocessing the const weights
 *
 * \param fuse_preprocess fuse the preprocess pattern, like astype + pad_channel +
* dimshuffle
*
* \param fake_next_exec whether only to perform non-computing tasks (like
* memory allocation and queue initialization) for next exec. This would be
* reset to false when the graph is executed.
*
* \param var_sanity_check_first_run Disable var sanity check on the first run.
* Var sanity check is enabled on the first-time execution by default, and can
* be used to find some potential memory access errors in the operator
* implementation.
*
* \param const_shape This can be used to reduce memory usage since some
* static inference data structures can be omitted.
*
* \param force_dynamic_alloc force dynamic memory alloc for all vars
*
* \param force_output_dynamic_alloc force dynamic memory alloc for output vars
* which are used as CallbackCaller input when call compile() function
*
* \param no_profiling_on_shape_change do not re-profile to select best impl
* algo when input shape changes (use previous algo)
*
* \param jit_level Execute supported operators with JIT (support MLIR,
* NVRTC). Can only be used on Nvidia GPUs, this value indicates JIT level:
* 1 for basic elemwise opr;
* 2 for including reduce operator
*
 * \param record_level flag to optimize inference performance by recording the
 * kernel tasks in the first run; afterwards inference only needs to execute the
 * recorded tasks.
* level = 0 means the normal inference,
* level = 1 means use record inference,
* level = 2 means record inference with free the extra memory
*
* \param graph_opt_level optimization level:
* 0: disable
* 1: level-1: inplace arith transformations during graph
* construction
* 2: level-2: level-1, plus global optimization before graph
* compiling
* 3: also enable JIT
* <0: corresponding level, with result check for debug
*
* \param async_exec_level exec: dispatch on separate threads for different
* comp_node.
* 0: do not perform async dispatch
* 1: dispatch async if there are more than one comp node with limited queue
* mask 0b10: async if there are multiple comp nodes with
* mask 0b100: always async
*/
struct LITE_API Options {
bool weight_preprocess = false;
bool fuse_preprocess = false;
bool fake_next_exec = false;
bool var_sanity_check_first_run = true;
bool const_shape = false;
bool force_dynamic_alloc = false;
bool force_output_dynamic_alloc = false;
bool no_profiling_on_shape_change = false;
uint8_t jit_level = 0;
uint8_t comp_node_seq_record_level = 0;
uint8_t graph_opt_level = 2;
uint16_t async_exec_level = 1;

//! layout transform options
bool enable_nchw44 = false;
bool enable_nchw44_dot = false;
bool enable_nchw88 = false;
bool enable_nhwcd4 = false;
bool enable_nchw4 = false;
bool enable_nchw32 = false;
bool enable_nchw64 = false;
};

/*!
* \brief Configuration when load and compile the graph
*
 * \param bare_model_cryption_name is the bare model cryption method name; a
 * bare model does not pack json info inside
 *
 * \param has_compression flag whether the model is compressed; the compression
 * method will be read from the model
*/
struct LITE_API Config {
bool has_compression = false;
int device_id = 0;
LiteDeviceType device_type = LiteDeviceType::LITE_CPU;
LiteBackend backend = LiteBackend::LITE_DEFAULT;
std::string bare_model_cryption_name = {};
Options options = {};
};

/*!
* \brief config the network input and output item
*
*/
struct LITE_API IO {
//! the tensor name in the graph corresponding to the IO
std::string name;

    //! Used to mark where the input tensor comes from and where the output is
    //! copied to. If is_host is true, the input is from host and the output is
    //! copied to host, otherwise device. Sometimes the input is from device and
    //! the output does not need to be copied to host. Default is true.
bool is_host = true;

    //! The IO type, it can be SHAPE or VALUE; when SHAPE is set, the input or
    //! output tensor value is invalid, only the shape will be set. Default is VALUE
LiteIOType io_type = LiteIOType::LITE_IO_VALUE;

    //! The layout configured by the user. If another layout is set before
    //! forward (or gotten after forward) by resetting the input tensor, this
    //! layout will be bypassed. If no other layout is set before forward, this
    //! layout will take effect. If this layout is not set, the model will
    //! forward with its original layout. For outputs, it is used for checking.
Layout config_layout = {};
};

/*!
* \brief the input and output information when load the network
* the NetworkIO will remain in the network until the network is destroyed
*/
struct LITE_API NetworkIO {
std::vector<IO> inputs = {};
std::vector<IO> outputs = {};
};

/*!
* \brief A user-implemented allocator interface
*/
class LITE_API Allocator {
public:
virtual ~Allocator() = default;

//! allocate memory of size in the given device with the given align
virtual void* allocate(LiteDeviceType device_type, int device_id,
size_t size, size_t align) = 0;

//! free the memory pointed by ptr in the given device
virtual void free(LiteDeviceType device_type, int device_id, void* ptr) = 0;
};

/*!
 * \brief the thread affinity callback type
 * \param thread_id thread_id is a number from 0 to (nr_threads - 1); the
* thread_id of (nr_threads - 1) is the main worker thread.
*/
using ThreadAffinityCallback = std::function<void(int thread_id)>;

using AsyncCallback = std::function<void(void)>;

/*!
* \brief the start/finish callback function
 * \param unordered_map map from the IO tensor name to the pair of the
 * corresponding user-configured IO and the real input or output tensor.
*/
using StartCallback = std::function<void(
const std::unordered_map<std::string,
std::pair<IO, std::shared_ptr<Tensor>>>&)>;
using FinishCallback = std::function<void(
const std::unordered_map<std::string,
std::pair<IO, std::shared_ptr<Tensor>>>&)>;

/*!
 * \brief The network is constructed from a model; it implements model load,
 * init, forward, and displays some model information
*/
class LITE_API Network {
public:
class NetworkImplBase;

~Network();

Network(const Config& config = {}, const NetworkIO& networkio = {});

Network(const NetworkIO& networkio, const Config& config = {});

    //! load the model from memory
void load_model(void* model_mem, size_t size);

//! load the model from a model path
void load_model(std::string model_path);

    //! only compute the output tensors configured by the user
void compute_only_configured_output();

    //! get the network input or output tensor, the layout of which is
    //! synced from the mge tensor; when an input and an output tensor have the
    //! same name, use LiteTensorPhase to distinguish them
std::shared_ptr<Tensor> get_io_tensor(
std::string io_name,
LiteTensorPhase phase = LiteTensorPhase::LITE_IO);

//! get the network input by index
std::shared_ptr<Tensor> get_input_tensor(size_t index);

//! get the network output tensor by index
std::shared_ptr<Tensor> get_output_tensor(size_t index);

//! set the network forward in async mode and set the async callback
//! function
Network& set_async_callback(const AsyncCallback& async_callback);

    //! set the start forward callback function, which will be executed before
//! forward. this can be used to check network input or dump model inputs
//! for debug
Network& set_start_callback(const StartCallback& start_callback);

    //! set the finish forward callback function, which will be executed after
//! forward. this can be used to dump model outputs for debug
Network& set_finish_callback(const FinishCallback& finish_callback);

//! forward the network with filled input data and fill the output data
//! to the output tensor
void forward();

    //! wait until forward finishes in sync mode
void wait();

//! get the input tensor name in the order in load return
std::string get_input_name(size_t index) const;

//! get the output tensor name in the order in load return
std::string get_output_name(size_t index) const;

//! get all the input tensor name in the order in load return
std::vector<std::string> get_all_input_name() const;

//! get all the output tensor name in the order in load return
std::vector<std::string> get_all_output_name() const;

//! set/get device id, default device id = 0
Network& set_device_id(int device_id);
int get_device_id() const;

//! set/get stream id, default stream id = 0
Network& set_stream_id(int stream_id);
int get_stream_id() const;

//! enable profile the network, a file will be generated
void enable_profile_performance(std::string profile_file_path);

//! get model extra info
const std::string& get_model_extra_info();

//! get device type
LiteDeviceType get_device_type() const;

public:
friend class NetworkHelper;

private:
//! update member from implement
void update_from_implement();

//! decrypt and parse the model file
void prase_model(std::shared_ptr<void> model_data, size_t size);

private:
bool m_loaded = false;
Config m_config;
NetworkIO m_network_io;
std::unique_ptr<NetworkImplBase> m_impl;
std::string m_extra_info;
};

/*********************** MGE special network function ***************/
class LITE_API Runtime {
public:
    //! When the device is CPU, this interface will set the to-be-loaded model
    //! to run in multi-thread mode with the given thread number.
static void set_cpu_threads_number(std::shared_ptr<Network> dst_network,
size_t nr_threads);
static size_t get_cpu_threads_number(std::shared_ptr<Network> dst_network);

//! set threads affinity callback;
static void set_runtime_thread_affinity(
std::shared_ptr<Network> network,
const ThreadAffinityCallback& thread_affinity_callback);

    //! Set cpu inplace mode when the device is CPU; on some low-computation
    //! or single-core devices, this mode will get good performance
static void set_cpu_inplace_mode(std::shared_ptr<Network> dst_network);
static bool is_cpu_inplace_mode(std::shared_ptr<Network> dst_network);

//! Set use tensorrt forward
static void use_tensorrt(std::shared_ptr<Network> dst_network);

//! set opr algorithm selection strategy in the network
//! shared_batch_size: the batch size used by fastrun,
//! Non-zero value means that fastrun use this batch size
//! regardless of the batch size of the model. Zero means
//! fastrun use batch size of the model
//! binary_equal_between_batch: if the content of each input batch is binary
    //! equal, whether the content of each output
//! batch is promised to be equal
static void set_network_algo_policy(
std::shared_ptr<Network> dst_network,
LiteAlgoSelectStrategy strategy, uint32_t shared_batch_size = 0,
bool binary_equal_between_batch = false);

    //! set workspace_limit for oprs with multiple algorithms; setting a
    //! workspace limitation can save memory but may influence the performance
static void set_network_algo_workspace_limit(
std::shared_ptr<Network> dst_network, size_t workspace_limit);

    //! set the network memory allocator, the allocator is defined by the user
static void set_memory_allocator(std::shared_ptr<Network> dst_network,
std::shared_ptr<Allocator> user_allocator);

    //! share the runtime memory with another network; the weights are not shared
static void share_runtime_memory_with(std::shared_ptr<Network> dst_network,
std::shared_ptr<Network> src_network);

//! Dump input/output values of all internal variables to output
//! file, in txt format
static void enable_io_txt_dump(std::shared_ptr<Network> dst_network,
std::string io_txt_out_file);

//! Dump input/output values of all internal variables to output
//! directory, in binary format
static void enable_io_bin_dump(std::shared_ptr<Network> dst_network,
std::string io_bin_out_dir);

//! load a new network which will share weights with src network
static void shared_weight_with_network(
std::shared_ptr<Network> dst_network,
const std::shared_ptr<Network> src_network);
};

} // namespace lite

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
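
To show how Config, Options and NetworkIO above compose, here is a hedged sketch (the output name "prob" and the chosen options are illustrative) that keeps one output on the device side and enables weight preprocessing:

#include <memory>

#include "lite/network.h"

using namespace lite;

std::shared_ptr<Network> build_configured_network() {
    Config config;
    config.device_type = LiteDeviceType::LITE_CPU;
    config.options.weight_preprocess = true;
    config.options.var_sanity_check_first_run = false;

    //! keep the output named "prob" on the device instead of copying to host
    IO out_io;
    out_io.name = "prob";
    out_io.is_host = false;

    NetworkIO io;
    io.outputs.push_back(out_io);

    return std::make_shared<Network>(config, io);
}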

+ 224
- 0
lite/include/lite/tensor.h View File

@@ -0,0 +1,224 @@
/**
 * \file include/lite/tensor.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once

#include "common_enum_c.h"
#include "macro.h"

#include <memory>
#include <unordered_map>
#include <vector>

namespace lite {

/*!
* \brief the simple layout description
*/
struct LITE_API Layout {
static constexpr uint32_t MAXDIM = 7;
size_t shapes[MAXDIM];
size_t ndim = 0;
LiteDataType data_type = LiteDataType::LITE_FLOAT;

    //! get the size in bytes of one element of the layout's data type
size_t get_elem_size() const;

    //! compare whether the two layouts are equal
bool operator==(const Layout& other) const;
};

/*!
 * \brief wrapper of the MegEngine Tensor
 *
 * The memory is not allocated directly; when get_memory_ptr() is called the
 * memory will be allocated in the tensor implementation and deleted automatically
 *
 * Note: if the tensor memory is set through the reset() interface, the memory is
 * managed by the user and will not be freed by the tensor
 *
 * If the device or layout is not set, when copying from another source tensor,
 * its device and layout will be copied from the source tensor
 *
 * if is_pinned_host is set, the storage memory of the tensor is pinned memory;
 * this is used to optimize the H2D or D2H memory copy. If the device or layout
 * is not set, when copying from another device (CUDA) tensor, this tensor
 * will automatically be set to a pinned tensor
*/
class LITE_API Tensor {
class TensorImpl;

public:
class TensorImplBase;

Tensor();
Tensor(LiteDeviceType device_type, bool is_pinned_host = false);
Tensor(LiteDeviceType device_type, const Layout& layout,
bool is_pinned_host = false);
Tensor(int device_id, LiteDeviceType device_type, const Layout& layout = {},
bool is_pinned_host = false);
Tensor(int device_id, int stream_id, LiteDeviceType device_type,
bool is_pinned_host = false);
Tensor(LiteBackend backend,
LiteDeviceType device_type = LiteDeviceType::LITE_CPU,
int device_id = 0, const Layout& layout = {},
bool is_pinned_host = false);
~Tensor();

LiteDeviceType get_device_type() const { return m_device_type; };

int get_device_id() const { return m_device_id; };

Layout get_layout() const { return m_layout; };

bool is_pinned_host() const { return m_is_pinned_host; };

//! set layout will change the layout and reallocate memory of the tensor
void set_layout(const Layout& layout);

    //! which will trigger memory allocation in the tensor implementation
void* get_memory_ptr() const;

    //! get the memory with the offset described by idx
void* get_memory_ptr(const std::vector<size_t>& idx) const;

//! get the tensor capacity in byte
size_t get_tensor_total_size_in_byte() const;

    //! use the user-allocated data to reset the memory of the tensor; the
    //! memory will not be managed by lite, so the user should delete it
    //! later.
void reset(void* prepared_data, size_t data_length_in_byte);

    //! use the user-allocated data and corresponding layout to reset the data
    //! and layout of the tensor; the memory will not be managed by lite, so the
    //! user should delete it later.
void reset(void* prepared_data, const Layout& layout);

//! reshape the tensor with new shape, keep the data_type the same
void reshape(const std::vector<int>& shape);

//! get a new tensor slice from the origin tensor
std::shared_ptr<Tensor> slice(const std::vector<size_t>& start,
const std::vector<size_t>& end,
const std::vector<size_t>& step = {});

//! set the tensor memory with zero
void fill_zero();

    //! copy tensor from another tensor
    //! Note: the best way to copy a tensor is to just set the dst device and
    //! leave the layout empty; when copying, the dst layout will be set the
    //! same as src
void copy_from(const Tensor& src);

//! share memory with other tensor
void share_memory_with(const Tensor& src_tensor);

    //! whether the memory of the tensor is contiguous
bool is_continue_memory() const;

    //! update the members from the implementation
void update_from_implement();

public:
friend class TensorHelper;

private:
std::shared_ptr<TensorImplBase> m_tensor_impl;

    //! flag whether the storage of the tensor is pinned; this is only used
    //! when the comp node is not CPU
bool m_is_pinned_host = false;
int m_device_id = 0;
Layout m_layout;
//! the device of the tensor should not be changed after the tensor has
    //! been constructed
LiteDeviceType m_device_type = LiteDeviceType::LITE_CPU;
};

/**
 * \brief a class that can hold data of any type, but does not check whether the
 * visited type is valid
*/
class LITE_API LiteAny {
public:
LiteAny() = default;
template <class T>
LiteAny(T value) : m_holder(new AnyHolder<T>(value)) {
m_is_string = std::is_same<std::string, T>();
}

LiteAny(const LiteAny& any) {
m_holder = any.m_holder->clone();
m_is_string = any.is_string();
}
LiteAny& operator=(const LiteAny& any) {
m_holder = any.m_holder->clone();
m_is_string = any.is_string();
return *this;
}
bool is_string() const { return m_is_string; }

class HolderBase {
public:
virtual ~HolderBase() = default;
virtual std::shared_ptr<HolderBase> clone() = 0;
virtual size_t type_length() const = 0;
};

template<class T>
class AnyHolder : public HolderBase {
public:
AnyHolder(const T value) :
m_value(value) {
}
virtual std::shared_ptr<HolderBase> clone() override {
return std::make_shared<AnyHolder>(m_value);
}
virtual size_t type_length() const override { return sizeof(T); }

public:
T m_value;
};
    //! if the type mismatches, it will throw
void type_missmatch(size_t expect, size_t get) const;

//! only check the storage type and the visit type length, so it's not safe
template <class T>
T unsafe_cast() const {
if (sizeof(T) != m_holder->type_length()) {
type_missmatch(m_holder->type_length(), sizeof(T));
}
return static_cast<LiteAny::AnyHolder<T>*>(m_holder.get())->m_value;
}
//! only check the storage type and the visit type length, so it's not safe
void* cast_void_ptr() const {
return &static_cast<LiteAny::AnyHolder<char>*>(m_holder.get())->m_value;
}

private:
std::shared_ptr<HolderBase> m_holder;
bool m_is_string = false;
};

/*********************** special tensor function ***************/
class LITE_API TensorUtils {
public:
    //! concat all the input tensors into one along the specified dim; the result
    //! tensor resides in dst_device_id of dst_device. If dst_device is
    //! LITE_DEVICE_DEFAULT, the device will be taken from the first tensor
static std::shared_ptr<Tensor> concat(
const std::vector<Tensor>& tensors, int dim,
LiteDeviceType dst_device = LiteDeviceType::LITE_DEVICE_DEFAULT,
int dst_device_id = -1);
};
} // namespace lite

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 169
- 0
lite/lite-c/include/lite-c/global_c.h View File

@@ -0,0 +1,169 @@
/**
* \file lite-c/include/lite-c/global_c.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#ifndef LITE_C_GLOBAL_H_
#define LITE_C_GLOBAL_H_

#include "macro.h"
#include "network_c.h"

#ifdef __cplusplus
extern "C" {
#endif

/*! \brief Get version
*/
LITE_API int LITE_get_version(int* major, int* minor, int* patch);

/*! \brief Get the last error message.
* \return the message pointer
*/
LITE_API const char* LITE_get_last_error();

/*! \brief Get the device count of the given device type
* \param[in] device_type device type
* \param[out] count the device count
*/
LITE_API int LITE_get_device_count(LiteDeviceType device_type, size_t* count);

/*! \brief try to coalesce all free memory in megengine
*/
LITE_API int LITE_try_coalesce_all_free_memory();

/**
* \brief Model decryption function
*
* \param[in] input_data the pointer to the model memory to be decrypted
* \param[in] input_size the size of the model memory in bytes
* \param[in] key_data decryption key data
* \param[in] key_size the size of the decryption key data
* \param[out] output_data the buffer for the decrypted data; if output_data is
* nullptr, only the output memory length is queried, otherwise the decrypted
* data is written to output_data
* \return the size of the decrypted data
*/
typedef size_t (*LiteDecryptionFunc)(const void* input_data, size_t input_size,
const uint8_t* key_data, size_t key_size,
const void* output_data);
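
/*!
 * A minimal sketch of a LiteDecryptionFunc: a hypothetical XOR "cipher" used
 * only for illustration (a real method would wrap an actual algorithm such as
 * AES). It follows the two-phase convention above: when output_data is
 * nullptr only the required output size is returned, otherwise the decrypted
 * bytes are written to output_data.
 *
 * \code
 * static size_t example_xor_decrypt(const void* input_data, size_t input_size,
 *                                   const uint8_t* key_data, size_t key_size,
 *                                   const void* output_data) {
 *     if (output_data) {
 *         uint8_t* dst = (uint8_t*)output_data;
 *         const uint8_t* src = (const uint8_t*)input_data;
 *         //! assumes key_size > 0
 *         for (size_t i = 0; i < input_size; ++i)
 *             dst[i] = src[i] ^ key_data[i % key_size];
 *     }
 *     return input_size;
 * }
 * \endcode
 */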

/**
* \brief Model information parse function
*
* \param[in] info_data is the information memory
* \param[in] info_size the size of the information memory
* \param[in] model_name the model name, used to check whether the
* information matches the model
* \param[in] config the model config; ParseInfoFunc can fill it with the
* information in json, the config will influence Network loading later
* \param[in] network_io the model IO; ParseInfoFunc can fill it with the
* information in json, the network_io will influence Network forwarding later
* \param[in] device_id the address to store device_id, default 0
* \param[in] nr_threads the address to store nr_threads, default 1
* \param[in] is_cpu_inplace_mode the address to store is_cpu_inplace_mode,
* default false
* \param[in] use_tensorrt the address to store use_tensorrt, default false
*/
typedef int (*LiteParseInfoFunc)(const void* info_data, size_t info_size,
const char* model_name, LiteConfig* config,
LiteNetworkIO* network_io, int* device_id,
size_t* nr_threads, int* is_cpu_inplace_mode,
int* use_tensorrt);

/**
* \brief register a custom decryption method and key to lite.
*
* \param[in] decrypt_name the name of the decryption, which will act as the
* hash key to find the decryption method.
*
* \param[in] func the decryption function, which will decrypt the model with
* the registered key; see LiteDecryptionFunc for the calling convention.
* \param[in] key_data the decryption key of the method
* \param[in] key_size the size of decryption key
*/
LITE_API int LITE_register_decryption_and_key(const char* decrypt_name,
const LiteDecryptionFunc func,
const uint8_t* key_data,
size_t key_size);
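
/*!
 * Usage sketch (the method name, key and decryption function are illustrative):
 * register the method once at startup, then reference it by name through
 * LiteConfig::bare_model_cryption_name when loading an encrypted bare model.
 *
 * \code
 * const uint8_t example_key[] = {0x12, 0x34, 0x56, 0x78};
 * LITE_register_decryption_and_key("example_xor", example_xor_decrypt,
 *                                  example_key, sizeof(example_key));
 * \endcode
 */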

/**
* \brief update decryption function or key of a custom decryption method.
*
* \param[in] decrypt_name the name of the decryption, which will act as the
* hash key to find the decryption method.
*
* \param[in] func the decryption function, which will decrypt the model with
* the registered key; see LiteDecryptionFunc for the calling convention. If
* func is nullptr, the function will not be updated.
*
* \param[in] key_data the decryption key of the method; if key_size is zero,
* the key will not be updated
* \param[in] key_size the size of the decryption key
*/
LITE_API int LITE_update_decryption_or_key(const char* decrypt_name,
const LiteDecryptionFunc func,
const uint8_t* key_data,
size_t key_size);

/**
* \brief register a custom parser function to lite.
*
* \param[in] info_type the name of the parser function, which will act as the
* hash key to find the parser method.
*
* \param[in] parse_func the parser function, which will parse the given
* information and modify the Network Config and IO.
*
*/
LITE_API int LITE_register_parse_info_func(const char* info_type,
const LiteParseInfoFunc parse_func);

/*!
* \brief Set the loader library to lite
* \param[in] loader_path the file path of the loader library
*/
LITE_API int LITE_set_loader_lib_path(const char* loader_path);

/*!
* \brief Set the algo policy cache file for CPU/CUDA ...
* \param[in] cache_path the file path where the cache is stored
* \param[in] always_sync whether to sync the cache to file whenever the cache
* is updated
*/
LITE_API int LITE_set_persistent_cache(const char* cache_path, int always_sync);

/*!
* \brief Set the TensorRT cache file
* \param[in] cache_path the file path where the cache is stored
*/
LITE_API int LITE_set_tensor_rt_cache(const char* cache_path);

/*! \brief Set the current log level.
* \param[in] level The new log level
*/
LITE_API int LITE_set_log_level(LiteLogLevel level);

/*! \brief Get the current log level.
* \param[in] level The pointer to log level
*/
LITE_API int LITE_get_log_level(LiteLogLevel* level);
/*!
* \brief dump the algo policy cache to a file; if the network is set to
* profile when forwarding, the profiled algo policy can be dumped to the file
* through this interface
* \param[in] cache_path the file path where the cache is stored
*/
LITE_API int LITE_dump_persistent_cache(const char* cache_path);
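
/*!
 * A typical fastrun-cache workflow sketch (the cache file name is
 * illustrative, error handling omitted): point lite at a cache file before
 * loading, run the profiled forwards, then dump the updated cache so that
 * later runs can reuse it.
 *
 * \code
 * LITE_set_persistent_cache("algo_policy.cache", 0);
 * //! ... load the network, forward and wait as usual ...
 * LITE_dump_persistent_cache("algo_policy.cache");
 * \endcode
 */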

/*!
* \brief dump the tensorrt policy cache to file
*/
LITE_API int LITE_dump_tensor_rt_cache();
#ifdef __cplusplus
}
#endif
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 525
- 0
lite/lite-c/include/lite-c/network_c.h View File

@@ -0,0 +1,525 @@
/**
* \file lite-c/include/lite-c/network_c.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#ifndef LITE_C_NETWORK_H_
#define LITE_C_NETWORK_H_

#include "tensor_c.h"

#ifdef __cplusplus
extern "C" {
#endif

/*!
* \brief the inference options which will be translated to megengine
*
* \param weight_preprocess the option which optimizes the inference performance
* by preprocessing the const weights
*
* \param fuse_preprocess fuse preprocess pattern, like astype + pad_channel +
* dimshuffle
*
* \param fake_next_exec whether only to perform non-computing tasks (like
* memory allocation and queue initialization) for next exec. This would be
* reset to false when the graph is executed.
*
* \param var_sanity_check_first_run Disable var sanity check on the first run.
* Var sanity check is enabled on the first-time execution by default, and can
* be used to find some potential memory access errors in the operator
* implementation.
*
* \param const_shape This can be used to reduce memory usage since some
* static inference data structures can be omitted.
*
* \param force_dynamic_alloc force dynamic memory alloc for all vars
*
* \param force_output_dynamic_alloc force dynamic memory alloc for output vars
* which are used as CallbackCaller input when call compile() function
*
* \param no_profiling_on_shape_change do not re-profile to select best impl
* algo when input shape changes (use previous algo)
*
* \param jit_level Execute supported operators with JIT (support MLIR,
* NVRTC). Can only be used on Nvidia GPUs, this value indicates JIT level:
* 1 for basic elemwise opr;
* 2 for including reduce operator
*
* \param record_level optimize the inference performance by recording the
* kernel tasks in the first run; afterwards the inference only needs to execute
* the recorded tasks.
* level = 0 means the normal inference,
* level = 1 means use record inference,
* level = 2 means record inference with free the extra memory
*
* \param graph_opt_level optimization level:
* 0: disable
* 1: level-1: inplace arith transformations during graph
* construction
* 2: level-2: level-1, plus global optimization before graph
* compiling
* 3: also enable JIT
* <0: corresponding level, with result check for debug
*
* \param async_exec_level exec: dispatch on separate threads for different
* comp_node.
* 0: do not perform async dispatch
* 1: dispatch async if there are more than one comp node with limited queue
* mask 0b10: async if there are multiple comp nodes with
* mask 0b100: always async
*/
typedef struct Options {
int weight_preprocess;
int fuse_preprocess;
int fake_next_exec;
int var_sanity_check_first_run;
int const_shape;
int force_dynamic_alloc;
int force_output_dynamic_alloc;
int no_profiling_on_shape_change;
int jit_level;
int comp_node_seq_record_level;
int graph_opt_level;
int async_exec_level;

//! layout transform options
int enable_nchw44;
int enable_nchw44_dot;
int enable_nchw88;
int enable_nhwcd4;
int enable_nchw4;
int enable_nchw32;
int enable_nchw64;
} LiteOptions;

//! define a default Options
extern LITE_API const LiteOptions default_option;

/*!
* \brief Configuration when load and compile the graph
*
* \param bare_model_cryption_name the bare model cryption method name; a bare
* model is a model without json info packed inside
*
* \param has_compression flag whether the model is compressed; the compression
* method will be read from the model
*/
typedef struct LiteConfig {
int has_compression;
int device_id;
LiteDeviceType device_type;
LiteBackend backend;
const char* bare_model_cryption_name;
LiteOptions options;
} LiteConfig;

//! get default config
LITE_API LiteConfig* default_config();

/*!
* \brief config the network input and output item
*
*/
typedef struct LiteIO {
//! the tensor name in the graph corresponding to the IO
const char* name;

//! Used to mark where the input tensor comes from and where the output will
//! be copied to; if is_host is true, the input comes from the host and the
//! output is copied to the host, otherwise the device. Sometimes the input is
//! from the device and the output need not be copied to the host. Default is
//! true.
int is_host;

//! The IO type, which can be SHAPE or VALUE; when SHAPE is set, the input or
//! output tensor value is invalid and only the shape will be set, default is
//! VALUE
LiteIOType io_type;

//! The layout of the config from the user; if another layout is set before
//! forward or gotten after forward, this layout will be bypassed. If no other
//! layout is set before forward, this layout will take effect. If this layout
//! is not set, the model will forward with its origin layout. For outputs, it
//! will be used for checking.
LiteLayout config_layout;
} LiteIO;

//! define a default IO
extern LITE_API const LiteIO default_io;

/*!
* \brief the input and output information when loading the network;
* the NetworkIO will remain in the network until the network is destroyed
*/
typedef struct LiteNetworkIO {
LiteIO* inputs;
LiteIO* outputs;
size_t input_size; //! the number of LiteIO in inputs
size_t output_size; //! the number of LiteIO in outputs
} LiteNetworkIO;

//! get default NetworkIO
LITE_API LiteNetworkIO* default_network_io();

/*!
* \brief A user-implemented allocator function
*/
//! allocate memory of size in the given device with the given align
typedef void* (*LiteAllocate)(LiteDeviceType device_type, int device_id,
size_t size, size_t align);
//! free the memory pointed by ptr in the given device
typedef void (*LiteFree)(LiteDeviceType device_type, int device_id, void* ptr);
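
/*!
 * A minimal sketch of a CPU-only allocator pair that can later be handed to
 * LITE_set_memory_allocator (assumes a C11 libc providing aligned_alloc;
 * device types other than CPU are not handled here).
 *
 * \code
 * #include <stdlib.h>
 * static void* example_allocate(LiteDeviceType device_type, int device_id,
 *                               size_t size, size_t align) {
 *     (void)device_type; (void)device_id;
 *     //! aligned_alloc requires size to be a multiple of align
 *     size_t rounded = (size + align - 1) / align * align;
 *     return aligned_alloc(align, rounded);
 * }
 * static void example_free(LiteDeviceType device_type, int device_id, void* ptr) {
 *     (void)device_type; (void)device_id;
 *     free(ptr);
 * }
 * \endcode
 */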

/*!
* \brief the thread affinity callback type
* \param thread_id a number from 0 to (nr_threads - 1); the thread with
* thread_id of (nr_threads - 1) is the main worker thread.
*/
typedef int (*LiteThreadAffinityCallback)(int thread_id);

typedef int (*LiteAsyncCallback)();

/*!
* \brief the start/finish callback function
* the callback receives the user-configured IOs, the corresponding real input
* or output tensors, and the number of IOs.
*/

typedef int (*LiteStartCallback)(const LiteIO* inputs,
const LiteTensor* input_tensors, size_t size);

typedef int (*LiteFinishCallback)(const LiteIO* outputs,
const LiteTensor* output_tensors,
size_t size);
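
/*!
 * A sketch of a start callback that just logs the configured input names; a
 * finish callback over the outputs has the same shape.
 *
 * \code
 * #include <stdio.h>
 * static int example_start_callback(const LiteIO* inputs,
 *                                   const LiteTensor* input_tensors,
 *                                   size_t size) {
 *     (void)input_tensors;
 *     for (size_t i = 0; i < size; ++i)
 *         printf("input[%zu]: %s\n", i, inputs[i].name);
 *     return 0;
 * }
 * \endcode
 */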

/*!
* \brief The network is constructed from a model; it implements model load,
* init, forward, and displays some model information
*/
typedef void* LiteNetwork;

/**
* \brief Create a lite Network object with default config and networkIO.
* \param[out] network The network pointer
* \return int if the return is not zero, an error happened; the error message
* can be retrieved by LITE_get_last_error
*/
LITE_API int LITE_make_default_network(LiteNetwork* network);

/**
* \brief Create a lite Network object from the given config and networkIO.
* \param[in] config The configuration to create the network
* \param[in] network_io The IO configuration to create the network
* \param[out] network The network pointer
*/
LITE_API int LITE_make_network(LiteNetwork* network, const LiteConfig config,
const LiteNetworkIO network_io);

/**
* \brief Create a lite Network object from the given config.
* \param[in] config The configuration to create the network
* \param[out] network The network pointer
*/
LITE_API int LITE_make_network_config(LiteNetwork* network, const LiteConfig config);


/**
* \brief load the model to the network from memory
* \param[in] model_mem The model in memory
* \param[in] size The size of the model memory
* \param[out] network The network to load the model into
*/
LITE_API int LITE_load_model_from_mem(LiteNetwork network, void* model_mem,
size_t size);

/**
* \brief load the model to the network from the given path
* \param[in] model_path The model path
* \param[out] network The network to load the model into
*/
LITE_API int LITE_load_model_from_path(LiteNetwork network,
const char* model_path);

/**
* \brief load a new network which will share weights with src network
* \param[in] src_network The source network pointer
* \param[out] dst_network The network pointer that will share weights with
* src_network
*/
LITE_API int LITE_shared_weight_with_network(LiteNetwork dst_network,
const LiteNetwork src_network);

/**
* \brief Destroy a lite network object.
* \param[in] network The network pointer
* \return int if the return is not zero, an error happened; the error message
* can be retrieved by LITE_get_last_error
*/
LITE_API int LITE_destroy_network(LiteNetwork network);

/**
* \brief forward the network with filled input data and fill the output data
* to the output tensor
* \param[in] network The loaded model
*/
LITE_API int LITE_forward(const LiteNetwork network);

/**
* \brief wait until the network forward finishes (synchronize)
* \param[in] network The loaded model
*/
LITE_API int LITE_wait(const LiteNetwork network);

/**
* \brief get the network input or output tensor, the layout of which is
* obtained from the model
* \param[in] network The loaded model
* \param[in] io_name The input or output name
* \param[in] phase The tensor phase
* \param[out] tensor The IO tensor get from the network
*/
LITE_API int LITE_get_io_tensor(LiteNetwork network, const char* io_name,
LiteTensorPhase phase, LiteTensor* tensor);
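
/*!
 * An end-to-end usage sketch of the C API. Error handling is omitted; the
 * model path, the tensor names "data" / "prob", the user buffer
 * (user_input_ptr / user_input_size_in_byte) and the LiteTensorPhase values
 * LITE_INPUT / LITE_OUTPUT from common_enum_c.h are assumptions for
 * illustration only.
 *
 * \code
 * LiteNetwork network;
 * LITE_make_network(&network, *default_config(), *default_network_io());
 * LITE_load_model_from_path(network, "./model.lite");
 *
 * LiteTensor input, output;
 * LITE_get_io_tensor(network, "data", LITE_INPUT, &input);
 * LITE_reset_tensor_memory(input, user_input_ptr, user_input_size_in_byte);
 *
 * LITE_forward(network);
 * LITE_wait(network);
 *
 * LITE_get_io_tensor(network, "prob", LITE_OUTPUT, &output);
 * void* out_ptr;
 * LITE_get_tensor_memory(output, &out_ptr);
 * //! ... consume out_ptr ...
 * LITE_destroy_network(network);
 * \endcode
 */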

/**
* \brief get the input tensor name in the order in loaded model
* \param[in] network The loaded model
* \param[in] index The index of input tensor
* \param[out] name The input tensor name
*/
LITE_API int LITE_get_input_name(const LiteNetwork network, size_t index,
const char** name);

/**
* \brief get the output tensor name in the order in loaded model
* \param[in] network The loaded model
* \param[in] index The index of output tensor
* \param[out] name The output tensor name
*/
LITE_API int LITE_get_output_name(const LiteNetwork network, size_t index,
const char** name);

/**
* \brief get all the input tensor name in the order in loaded model
* \param[in] network The loaded model
* \param[in] size The number of the input tensor
* \param[out] name The input tensor names
*/
LITE_API int LITE_get_all_input_name(const LiteNetwork network, size_t* size,
const char** name);

/**
* \brief get all the output tensor name in the order in loaded model
* \param[in] network The loaded model
* \param[in] size The number of output tensor
* \param[out] name The output tensor name
*/
LITE_API int LITE_get_all_output_name(const LiteNetwork network, size_t* size,
const char** name);

/**
* \brief get whether the model is running in cpu inplace mode
* \param[in] network The loaded model
* \param[out] is_cpu_inplace_mode whether is in cpu inplace mode
*/
LITE_API int LITE_is_cpu_inplace_mode(const LiteNetwork network,
int* is_cpu_inplace_mode);

/**
* \brief get the number of thread the network will run with
* \param[in] network The loaded model
* \param[out] nr_threads the thread number when the network running
*/
LITE_API int LITE_get_cpu_threads_number(const LiteNetwork network,
size_t* nr_threads);

/**
* \brief get the device id the network will run with
* \param[in] network The loaded model
* \param[out] device_id the device id of the network will run
*/
LITE_API int LITE_get_device_id(const LiteNetwork network, int* device_id);

/**
* \brief get the stream id the network will run with
* \param[in] network The loaded model
* \param[out] stream_id the stream id of the network will run
*/
LITE_API int LITE_get_stream_id(const LiteNetwork network, int* stream_id);

/**
* \brief get the device type the network will run with
* \param[in] network The loaded model
* \param[out] device_type the device type of the network will run
*/
LITE_API int LITE_get_device_type(const LiteNetwork network,
LiteDeviceType* device_type);

/**
* \brief get the model extra info in json format
* \param[in] network The loaded model
* \param[out] info the json format memory
* \param[out] info_size the json format memory size
*/
LITE_API int LITE_get_model_extra_info(const LiteNetwork network,
const char** info, int* info_size);

/**
* \brief Set cpu inplace mode when the device is CPU; on some low-computation
* or single-core devices, this mode gives good performance
* \param[in] network The loaded model
*/
LITE_API int LITE_set_cpu_inplace_mode(LiteNetwork network);

/**
* \brief When the device is CPU, this interface will set the to-be-loaded
* model to run in multi-thread mode with the given thread number.
* \param[in] network The loaded model
* \param[in] nr_threads The threads number
*/
LITE_API int LITE_set_cpu_threads_number(LiteNetwork network,
size_t nr_threads);

/**
* \brief set device id, default device id = 0
* \param[in] network The loaded model
* \param[in] device_id The device id to be set
*/
LITE_API int LITE_set_device_id(LiteNetwork network, int device_id);

/**
* \brief set stream id, default stream id = 0
* \param[in] network The loaded model
* \param[in] stream_id The stream id to be set
*/
LITE_API int LITE_set_stream_id(LiteNetwork network, int stream_id);

/**
* \brief enable tensorrt
* \param[in] network The loaded model
*/
LITE_API int LITE_use_tensorrt(LiteNetwork network);

/**
* \brief set opr algorithm selection strategy in the network
* \param[in] network The loaded model
* \param[in] select_strategy The operator algorithm selection strategy
*/
LITE_API int LITE_set_network_algo_policy(LiteNetwork network,
LiteAlgoSelectStrategy strategy);

/**
* \brief set the fastrun config of opr algorithm selection in the network
* \param[in] network The loaded model
* \param[in] shared_batch_size: the batch size used by fastrun; a non-zero
* value means that fastrun uses this batch size regardless of the batch size
* of the model, while zero means fastrun uses the batch size of the model
* \param[in] binary_equal_between_batch: if the content of each input batch is
* binary equal, whether the content of each output batch is promised to be
* equal
*/
LITE_API int LITE_set_network_algo_fastrun_config(
LiteNetwork network, unsigned int shared_batch_size,
int binary_equal_between_batch);

/**
* \brief set workspace_limit for oprs with multiple algorithms; setting a
* workspace limit can save memory but may influence the performance
* \param[in] network The loaded model
* \param[in] workspace_limit The operator algorithm workspace limit
*/
LITE_API int LITE_set_network_algo_workspace_limit(LiteNetwork network,
size_t workspace_limit);

/**
* \brief set the network forward in async mode and set the async callback
* function
* \param[in] network The loaded model
* \param[in] async_callback the callback called when the network finishes
* forwarding
*/
LITE_API int LITE_set_async_callback(LiteNetwork network,
const LiteAsyncCallback async_callback);

/**
* \brief set the start forward callback function, which will be executed
* before forward; this can be used to check network inputs or dump model
* inputs for debugging
* \param[in] network The loaded model
* \param[in] start_callback the callback called when the network starts
* forwarding
*/
LITE_API int LITE_set_start_callback(LiteNetwork network,
const LiteStartCallback start_callback);

/**
* \brief set the finish forward callback function, which will be executed
* after forward; this can be used to dump model outputs for debugging
* \param[in] network The loaded model
* \param[in] finish_callback the callback called when the network finishes
* forwarding
*/
LITE_API int LITE_set_finish_callback(LiteNetwork network,
const LiteFinishCallback finish_callback);

/**
* \brief set threads affinity callback
* \param[in] network The loaded model
* \param[in] thread_affinity_callback
*/
LITE_API int LITE_set_runtime_thread_affinity(
LiteNetwork network,
const LiteThreadAffinityCallback thread_affinity_callback);

/**
* \brief set the network memory allocator, the allocator is defined by the user
* \param[in] network The loaded model
* \param[in] allocate_fun The allocate function of the user defined allocator
* \param[in] free_fun The free function of the user defined allocator
*/
LITE_API int LITE_set_memory_allocator(LiteNetwork network,
const LiteAllocate allocate_fun,
const LiteFree free_fun);

/**
* \brief make dst_network share the runtime memory with src_network
* \param[in] src_network The source network
* \param[in] dst_network The dst network to share memory with src_network
*/
LITE_API int LITE_share_runtime_memroy(LiteNetwork src_network,
LiteNetwork dst_network);

/**
* \brief enable profiling of the network, a JSON format file will be generated
* \param[in] network The loaded model
* \param[in] profile_json_file_path The profile result file path
*/
LITE_API int LITE_enable_profile_performance(
LiteNetwork network, const char* profile_json_file_path);

/**
* \brief Dump input/output values of all internal variables to output file,
* in text format
* \param[in] network The loaded model
* \param[in] io_txt_out_file The dumped txt file name
*/
LITE_API int LITE_enable_io_txt_dump(LiteNetwork network,
const char* io_txt_out_file);

/**
* \brief Dump input/output values of all internal variables to output
* directory, in binary format
* \param[in] network The loaded model
* \param[in] io_bin_out_dir The dumped bin file directory
*/
LITE_API int LITE_enable_io_bin_dump(LiteNetwork network,
const char* io_bin_out_dir);

#ifdef __cplusplus
}
#endif
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 251
- 0
lite/lite-c/include/lite-c/tensor_c.h View File

@@ -0,0 +1,251 @@
/**
* \file lite-c/include/lite-c/tensor_c.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#ifndef LITE_TENSOR_C_H_
#define LITE_TENSOR_C_H_

#include "common_enum_c.h"
#include "macro.h"

#ifdef __cplusplus
extern "C" {
#endif

#include "stddef.h"
#include "stdint.h"

#define LAYOUT_MAX_DIM (7)

/*!
* \brief the simple layout description
*/
typedef struct LiteLayout {
size_t shapes[LAYOUT_MAX_DIM];
size_t ndim;
LiteDataType data_type;
} LiteLayout;

//! define a default LiteLayout
extern LITE_API const LiteLayout default_layout;

/*!
* \brief wrapper of the MegEngine Tensor
*
* if is_pinned_host is set, the storage memory of the tensor is pinned memory,
* which is used to optimize the H2D or D2H memory copy; if the device or layout
* is not set, when copying from a tensor on another device (CUDA, OpenCL), this
* tensor will be automatically set to a pinned tensor
*/
typedef struct LiteTensorDesc {
//! flag whether the storage of the tensor is pinned; this is only used when
//! the comp node is not on CPU
int is_pinned_host;

//! the layout of the tensor
LiteLayout layout;

//! the device of the tensor should not be changed after the tensor has
//! been constructed
LiteDeviceType device_type;

//! device id of the tensor
int device_id;
} LiteTensorDesc;

//! define a default TensorDesc
extern LITE_API const LiteTensorDesc default_desc;

/*!
* \brief The pointer to a Lite Tensor object
*/
typedef void* LiteTensor;

/**
* \brief Create a lite tensor object from the given description.
* \param[in] tensor_describe The description to create the Tensor
* \param[out] tensor The Tensor pointer
* \return int if the return is not zero, an error happened; the error message
* can be retrieved by LITE_get_last_error
*/
LITE_API int LITE_make_tensor(const LiteTensorDesc tensor_describe,
LiteTensor* tensor);

/**
* \brief Destroy a lite tensor object.
* \param[in] tensor The Tensor pointer
* \return int if the return is not zero, an error happened; the error message
* can be retrieved by LITE_get_last_error
*/
LITE_API int LITE_destroy_tensor(LiteTensor tensor);

/**
* \brief change the layout of a Tensor object.
* \param[in] tensor The Tensor
* \param[in] layout The Layout to be set to the tensor
*/
LITE_API int LITE_set_tensor_layout(LiteTensor tensor, const LiteLayout layout);

/**
* \brief use the user-allocated data to reset the memory of the tensor, the
* memory will not be managed by lite, so the user should free it later.
* \param[in] tensor The Tensor
* \param[in] prepared_data The allocated memory which satisfies the Tensor
* layout
* \param[in] data_length_in_byte The length of the allocated memory in bytes
*/
LITE_API int LITE_reset_tensor_memory(LiteTensor tensor, void* prepared_data,
size_t data_length_in_byte);

/**
* \brief use the user-allocated data and the corresponding layout to reset the
* data and layout of the tensor, the memory will not be managed by lite, so
* the user should free it later.
* \param[in] tensor The Tensor
* \param[in] layout The Layout to be set to the tensor
* \param[in] prepared_data The allocated memory which satisfies the layout to
* be set
*/
LITE_API int LITE_reset_tensor(LiteTensor tensor, const LiteLayout layout,
void* prepared_data);
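
/*!
 * A usage sketch of creating a host tensor and pointing it at a user buffer
 * (the shapes and the float buffer are illustrative; the LITE_FLOAT enum value
 * from common_enum_c.h is assumed; error handling omitted):
 *
 * \code
 * LiteTensor tensor;
 * LiteTensorDesc desc = default_desc;        //! CPU, empty layout
 * LITE_make_tensor(desc, &tensor);
 *
 * float buffer[1 * 3 * 16 * 16];
 * LiteLayout layout = {{1, 3, 16, 16}, 4, LITE_FLOAT};
 * LITE_reset_tensor(tensor, layout, buffer); //! lite will not free buffer
 * \endcode
 */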

/**
* \brief reshape a tensor without changing the memory; the total number of
* elements in the reshaped tensor must equal that of the origin tensor, and the
* input shape may contain at most one -1 to flag the dimension that can be
* deduced automatically.
* \param[in] tensor The Tensor to be reshaped
* \param[in] shape the user input shape
* \param[in] size the number of elements in shape
*/
LITE_API int LITE_tensor_reshape(LiteTensor tensor, const int* shape, int size);

/**
* \brief slice a tensor with the input parameters
* \param[in] tensor The Tensor to be sliced
* \param[in] start the start index of every axis to be sliced
* \param[in] end the end index of every axis to be sliced
* \param[in] step the step of every axis to be sliced; if nullptr, the step
* will be 1
* \param[in] size the number of axes to be sliced
* \param[out] slice_tensor the result tensor sliced from the origin tensor
*/
LITE_API int LITE_tensor_slice(const LiteTensor tensor, const size_t* start,
const size_t* end, const size_t* step,
size_t size, LiteTensor* slice_tensor);

/**
* \brief fill zero to the tensor
* \param[in] tensor The Tensor to be memset
*/
LITE_API int LITE_tensor_fill_zero(LiteTensor tensor);

/**
* \brief copy tensor from another tensor
* \param[out] dst_tensor The Tensor to copy into
* \param[in] src_tensor The Tensor to copy from
*/
LITE_API int LITE_tensor_copy(LiteTensor dst_tensor,
const LiteTensor src_tensor);

/**
* \brief share memory with another tensor
* \param[out] dst_tensor The Tensor to share into
* \param[in] src_tensor The Tensor to be shared
*/
LITE_API int LITE_tensor_share_memory_with(LiteTensor dst_tensor,
const LiteTensor src_tensor);

/**
* \brief get the memory pointer of a Tensor object.
* \param[in] tensor The input Tensor
* \param[out] data a pointer to void pointer
*/
LITE_API int LITE_get_tensor_memory(const LiteTensor tensor, void** data);

/**
* \brief get the memory pointer of a Tensor object.
* \param[in] tensor The input Tensor
* \param[in] index The coordinate in the tensor
* \param[in] size The length of the coordinate
* \param[out] data a pointer to void pointer
*/
LITE_API int LITE_get_tensor_memory_with_index(const LiteTensor tensor,
const size_t* index, size_t size,
void** data);

/**
* \brief get the tensor capacity in bytes of a Tensor object.
* \param[in] tensor The input Tensor
* \param[out] size a pointer to the returned size
*/
LITE_API int LITE_get_tensor_total_size_in_byte(const LiteTensor tensor,
size_t* size);

/**
* \brief get the tensor layout of a Tensor object.
* \param[in] tensor The input Tensor
* \param[out] layout a pointer that will be written with the layout of the tensor
*/
LITE_API int LITE_get_tensor_layout(const LiteTensor tensor,
LiteLayout* layout);

/**
* \brief get the tensor device of a Tensor object.
* \param[in] tensor The input Tensor
* \param[out] device_type a pointer that will be written with the device type of the tensor
*/
LITE_API int LITE_get_tensor_device_type(const LiteTensor tensor,
LiteDeviceType* device_type);

/**
* \brief get the tensor device id of a Tensor object.
* \param[in] tensor The input Tensor
* \param[out] device_id a pointer that will be written with the device id of
* the tensor
*/
LITE_API int LITE_get_tensor_device_id(const LiteTensor tensor, int* device_id);

/**
* \brief whether the tensor memory is pinned host memory.
* \param[in] tensor The input Tensor
* \param[out] is_pinned_host an int pointer that will be written with whether
* the tensor is pinned host
*/
LITE_API int LITE_is_pinned_host(const LiteTensor tensor, int* is_pinned_host);

/**
* \brief whether the tensor memory is contiguous.
* \param[in] tensor The input Tensor
* \param[out] is_continue an int pointer that will be written with whether the
* tensor memory is contiguous
*/
LITE_API int LITE_is_memory_continue(const LiteTensor tensor, int* is_continue);
/**
* \brief concat the input tensors into one big tensor
* \param[in] tensors the pointer to the input Tensors
* \param[in] nr_tensor the number of input Tensors
* \param[in] dim the dim the concat acts on
* \param[in] dst_device the device type of the result tensor; when
* LITE_DEVICE_DEFAULT, the result tensor device type is taken from the first
* tensor
* \param[in] device_id the device id of the result tensor; when -1, the result
* tensor device id is taken from the first tensor
* \param[out] result_tensor the result tensor after concat
*/
LITE_API int LITE_tensor_concat(LiteTensor* tensors, int nr_tensor, int dim,
LiteDeviceType dst_device, int device_id,
LiteTensor* result_tensor);
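
/*!
 * A concat sketch: join two previously created and filled tensors (tensor_a
 * and tensor_b are placeholders) along dim 0, keeping the device of the first
 * tensor; error handling omitted.
 *
 * \code
 * LiteTensor parts[2] = {tensor_a, tensor_b};
 * LiteTensor merged;
 * LITE_tensor_concat(parts, 2, 0, LITE_DEVICE_DEFAULT, -1, &merged);
 * \endcode
 */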

#ifdef __cplusplus
}
#endif
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 73
- 0
lite/lite-c/src/common.h View File

@@ -0,0 +1,73 @@
/**
* \file lite-c/src/common.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#ifndef LITE_C_COMMON_H_
#define LITE_C_COMMON_H_

#include "../src/misc.h"
#include "lite-c/network_c.h"
#include "lite-c/tensor_c.h"
#include "lite/network.h"

#include <exception>
#include <stdexcept>

//! convert c Layout to lite::Layout
lite::Layout convert_to_layout(const LiteLayout& layout);

//! convert lite::Layout to C Layout
LiteLayout convert_to_clayout(const lite::Layout& layout);

//! convert c config to lite::config
lite::Config convert_to_lite_config(const LiteConfig c_config);

//! convert C NetworkIO io to lite::NetworkIO
lite::NetworkIO convert_to_lite_io(const LiteNetworkIO c_network_io);

/*!
* \brief handle exception
* \param e the exception
* \return the return value of the error
*/
int LiteHandleException(const std::exception& e);
#if LITE_ENABLE_EXCEPTION
/*! \brief macro to guard a function */
#define LITE_CAPI_BEGIN() try {
/*! \brief every function starts with LITE_CAPI_BEGIN();
* ends with LITE_CAPI_END or LITE_CAPI_END_WITH_STMS
*/
#define LITE_CAPI_END() \
} \
catch (std::exception & _except_) { \
return LiteHandleException(_except_); \
} \
return 0;
#else
/*! \brief macro to guard a function */
#define LITE_CAPI_BEGIN() {
/*! \brief every function starts with LITE_CAPI_BEGIN();
* ends with LITE_CAPI_END or LITE_CAPI_END_WITH_STMS
*/
#define LITE_CAPI_END() \
} \
return 0;
#endif
/*!
* \brief catch the exception with stms
*/
#define LITE_CAPI_END_WITH_STMS(_stms) \
} \
catch (std::exception & _except_) { \
_stms; \
return LiteHandleException(_except_); \
} \
return 0;

#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 192
- 0
lite/lite-c/src/global.cpp View File

@@ -0,0 +1,192 @@
/**
* \file lite-c/src/global.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "lite/global.h"
#include "common.h"
#include "lite-c/global_c.h"

#include <exception>
#include <mutex>

namespace {

class ErrorMsg {
public:
std::string& get_error_msg() { return error_msg; }
void set_error_msg(const std::string& msg) { error_msg = msg; }

private:
std::string error_msg;
};
ErrorMsg& get_global_error() {
static thread_local ErrorMsg error_msg;
return error_msg;
}
} // namespace

int LiteHandleException(const std::exception& e) {
get_global_error().set_error_msg(e.what());
return -1;
}

const char* LITE_get_last_error() {
return get_global_error().get_error_msg().c_str();
}

int LITE_get_version(int* major, int* minor, int* patch) {
LITE_ASSERT(major && minor && patch, "The ptr pass to LITE api is null");
lite::get_version(*major, *minor, *patch);
return 0;
}

int LITE_get_device_count(LiteDeviceType device_type, size_t* count) {
LITE_CAPI_BEGIN();
LITE_ASSERT(count, "The ptr pass to LITE api is null");
*count = lite::get_device_count(device_type);
LITE_CAPI_END();
}

int LITE_try_coalesce_all_free_memory(){
LITE_CAPI_BEGIN();
lite::try_coalesce_all_free_memory();
LITE_CAPI_END();
}

int LITE_register_decryption_and_key(const char* decrypt_name,
const LiteDecryptionFunc func,
const uint8_t* key_data, size_t key_size) {
LITE_CAPI_BEGIN();
LITE_ASSERT(decrypt_name && key_data && func,
"The ptr pass to LITE api is null");
std::vector<uint8_t> key;
for (size_t i = 0; i < key_size; i++) {
key.push_back(key_data[i]);
}
auto decrypt_func = [func](const void* input_data, size_t input_size,
const std::vector<uint8_t>& key) {
auto size =
func(input_data, input_size, key.data(), key.size(), nullptr);
std::vector<uint8_t> output(size, 0);
func(input_data, input_size, key.data(), key.size(), output.data());
return output;
};
lite::register_decryption_and_key(decrypt_name, decrypt_func, key);
LITE_CAPI_END();
}

int LITE_update_decryption_or_key(const char* decrypt_name,
const LiteDecryptionFunc func,
const uint8_t* key_data, size_t key_size) {
LITE_CAPI_BEGIN();
std::vector<uint8_t> key;
for (size_t i = 0; i < key_size; i++) {
key.push_back(key_data[i]);
}
lite::DecryptionFunc decrypt_func = nullptr;
if (func) {
decrypt_func = [func](const void* input_data, size_t input_size,
const std::vector<uint8_t>& key) {
auto size = func(input_data, input_size, key.data(), key.size(),
nullptr);
std::vector<uint8_t> output(size, 0);
func(input_data, input_size, key.data(), key.size(), output.data());
return output;
};
}
lite::update_decryption_or_key(decrypt_name, decrypt_func, key);
LITE_CAPI_END();
}

int LITE_register_parse_info_func(const char* info_type,
const LiteParseInfoFunc parse_func) {
LITE_CAPI_BEGIN();
LITE_ASSERT(info_type && parse_func, "The ptr pass to LITE api is null");
auto lite_func = [parse_func](
const void* info_data, size_t info_size,
const std::string model_name, lite::Config& config,
lite::NetworkIO& network_io,
std::unordered_map<std::string, lite::LiteAny>&
separate_config_map,
std::string& extra_info) {
LITE_MARK_USED_VAR(extra_info);
size_t nr_threads = 1;
int device_id = 0, is_cpu_inplace_mode = false, use_tensorrt = false;
LiteNetworkIO c_io;
LiteConfig c_config;
auto ret = parse_func(info_data, info_size, model_name.c_str(),
&c_config, &c_io, &device_id, &nr_threads,
&is_cpu_inplace_mode, &use_tensorrt);
config = convert_to_lite_config(c_config);
network_io = convert_to_lite_io(c_io);
if (device_id != 0) {
separate_config_map["device_id"] = device_id;
}
if (nr_threads != 1) {
separate_config_map["nr_threads"] = nr_threads;
}
if (is_cpu_inplace_mode != false) {
separate_config_map["is_inplace_mode"] = is_cpu_inplace_mode;
}
if (use_tensorrt != false) {
separate_config_map["use_tensorrt"] = use_tensorrt;
}
return ret;
};
lite::register_parse_info_func(info_type, lite_func);
LITE_CAPI_END();
}

int LITE_set_loader_lib_path(const char* loader_path) {
LITE_CAPI_BEGIN();
LITE_ASSERT(loader_path, "The ptr pass to LITE api is null");
lite::set_loader_lib_path(loader_path);
LITE_CAPI_END();
}

int LITE_set_persistent_cache(const char* cache_path, int always_sync) {
LITE_CAPI_BEGIN();
LITE_ASSERT(cache_path, "The ptr pass to LITE api is null");
lite::set_persistent_cache(cache_path, always_sync);
LITE_CAPI_END();
}

int LITE_set_tensor_rt_cache(const char* cache_path) {
LITE_CAPI_BEGIN();
LITE_ASSERT(cache_path, "The ptr pass to LITE api is null");
lite::set_tensor_rt_cache(cache_path);
LITE_CAPI_END();
}

int LITE_set_log_level(LiteLogLevel level) {
LITE_CAPI_BEGIN();
lite::set_log_level(level);
LITE_CAPI_END();
}

int LITE_get_log_level(LiteLogLevel* level) {
LITE_CAPI_BEGIN();
LITE_ASSERT(level, "The ptr pass to LITE api is null");
*level = lite::get_log_level();
LITE_CAPI_END();
}

int LITE_dump_persistent_cache(const char* cache_path) {
LITE_CAPI_BEGIN();
LITE_ASSERT(cache_path, "The ptr pass to LITE api is null");
lite::dump_persistent_cache(cache_path);
LITE_CAPI_END();
}

int LITE_dump_tensor_rt_cache() {
LITE_CAPI_BEGIN();
lite::dump_tensor_rt_cache();
LITE_CAPI_END();
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 580
- 0
lite/lite-c/src/network.cpp View File

@@ -0,0 +1,580 @@
/**
* \file lite-c/src/network.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "lite/network.h"
#include "common.h"
#include "lite-c/network_c.h"

#include "../../src/network_impl_base.h"

#include <memory>
#include <mutex>
#include <unordered_map>
#include <string.h>

//! define a default Options
const LiteOptions default_option = {
.weight_preprocess = false,
.fuse_preprocess = false,
.fake_next_exec = false,
.var_sanity_check_first_run = true,
.const_shape = false,
.force_dynamic_alloc = false,
.force_output_dynamic_alloc = false,
.no_profiling_on_shape_change = false,
.jit_level = 0,
.comp_node_seq_record_level = 0,
.graph_opt_level = 2,
.async_exec_level = 1,
//! layout transform options
.enable_nchw44 = 0,
.enable_nchw44_dot = 0,
.enable_nchw88 = 0,
.enable_nhwcd4 = 0,
.enable_nchw4 = 0,
.enable_nchw32 = 0,
.enable_nchw64 = 0,

};

//! define a default config
LiteConfig default_config_t = {.has_compression = false,
.device_id = -1,
.device_type = LiteDeviceType::LITE_CPU,
.backend = LiteBackend::LITE_DEFAULT,
.bare_model_cryption_name = nullptr,
.options = default_option};
LiteConfig* default_config() {
return &default_config_t;
}

//! define a default IO
const LiteIO default_io = {.name = nullptr,
.is_host = true,
.io_type = LiteIOType::LITE_IO_VALUE,
.config_layout = default_layout};

//! define a default NetworkIO
LiteNetworkIO default_network_io_t = {.inputs = nullptr,
.outputs = nullptr,
.input_size = 0,
.output_size = 0};
LiteNetworkIO* default_network_io() {
return &default_network_io_t;
}

namespace {
std::unordered_map<void*, std::shared_ptr<lite::Network>>&
get_gloabl_network_holder() {
static thread_local std::unordered_map<void*,
std::shared_ptr<lite::Network>>
network_holder;
return network_holder;
}

/*!
* \brief A user-implemented allocator interface
*/
class UserAllocator : public lite::Allocator {
public:
UserAllocator(LiteAllocate allocate_func, LiteFree free_func)
: m_allocator(allocate_func), m_free(free_func) {
LITE_ASSERT(m_allocator && m_free);
}

//! allocate memory of size in the given device with the given align
void* allocate(LiteDeviceType device_type, int device_id, size_t size,
size_t align) override {
return m_allocator(device_type, device_id, size, align);
}

//! free the memory pointed by ptr in the given device
void free(LiteDeviceType device_type, int device_id, void* ptr) override {
m_free(device_type, device_id, ptr);
}

private:
LiteAllocate m_allocator;
LiteFree m_free;
};
} // namespace

//! convert c config to lite::config
lite::Config convert_to_lite_config(const LiteConfig c_config) {
lite::Config lite_config;
lite_config.device_type = c_config.device_type;
if (c_config.bare_model_cryption_name) {
lite_config.bare_model_cryption_name =
c_config.bare_model_cryption_name;
}
lite_config.backend = c_config.backend;
lite_config.has_compression = c_config.has_compression;
lite_config.device_id = c_config.device_id;

lite_config.options.weight_preprocess = c_config.options.weight_preprocess;
lite_config.options.fuse_preprocess = c_config.options.fuse_preprocess;
lite_config.options.fake_next_exec = c_config.options.fake_next_exec;
lite_config.options.var_sanity_check_first_run =
c_config.options.var_sanity_check_first_run;
lite_config.options.const_shape = c_config.options.const_shape;
lite_config.options.force_dynamic_alloc = c_config.options.force_dynamic_alloc;
lite_config.options.force_output_dynamic_alloc =
c_config.options.force_output_dynamic_alloc;
lite_config.options.no_profiling_on_shape_change =
c_config.options.no_profiling_on_shape_change;
lite_config.options.jit_level = c_config.options.jit_level;
lite_config.options.comp_node_seq_record_level =
c_config.options.comp_node_seq_record_level;
lite_config.options.graph_opt_level = c_config.options.graph_opt_level;
lite_config.options.async_exec_level = c_config.options.async_exec_level;

lite_config.options.enable_nchw44 = c_config.options.enable_nchw44;
lite_config.options.enable_nchw44_dot = c_config.options.enable_nchw44_dot;
lite_config.options.enable_nchw88 = c_config.options.enable_nchw88;
lite_config.options.enable_nchw4 = c_config.options.enable_nchw4;
lite_config.options.enable_nhwcd4 = c_config.options.enable_nhwcd4;
lite_config.options.enable_nchw32 = c_config.options.enable_nchw32;
lite_config.options.enable_nchw64 = c_config.options.enable_nchw64;

return lite_config;
}

//! convert C NetworkIO io to lite::NetworkIO
lite::NetworkIO convert_to_lite_io(const LiteNetworkIO c_network_io) {
lite::NetworkIO network_io;
for (size_t i = 0; i < c_network_io.input_size; i++) {
LiteIO* c_io = c_network_io.inputs + i;
LITE_ASSERT(c_io->name, "input name of io tensor must set.");
network_io.inputs.push_back(
{c_io->name, static_cast<bool>(c_io->is_host), c_io->io_type,
convert_to_layout(c_io->config_layout)});
}
for (size_t i = 0; i < c_network_io.output_size; i++) {
LiteIO* c_io = c_network_io.outputs + i;
LITE_ASSERT(c_io->name, "output name of io tensor must set.");
network_io.outputs.push_back(
{c_io->name, static_cast<bool>(c_io->is_host), c_io->io_type,
convert_to_layout(c_io->config_layout)});
}
return network_io;
}

int LITE_make_default_network(LiteNetwork* network) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
auto lite_network = std::make_shared<lite::Network>();
get_gloabl_network_holder()[lite_network.get()] = lite_network;
*network = lite_network.get();
LITE_CAPI_END();
}

int LITE_make_network(LiteNetwork* network, const LiteConfig config,
const LiteNetworkIO network_io) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
auto lite_network = std::make_shared<lite::Network>(
convert_to_lite_config(config), convert_to_lite_io(network_io));
get_gloabl_network_holder()[lite_network.get()] = lite_network;
*network = lite_network.get();
LITE_CAPI_END();
}

int LITE_make_network_config(LiteNetwork* network, const LiteConfig config) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
auto lite_network =
std::make_shared<lite::Network>(convert_to_lite_config(config));
get_gloabl_network_holder()[lite_network.get()] = lite_network;
*network = lite_network.get();
LITE_CAPI_END();
}

int LITE_load_model_from_mem(LiteNetwork network, void* model_mem,
size_t size) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
LITE_ASSERT(model_mem, "The model memory pass to LITE api is null");
static_cast<lite::Network*>(network)->load_model(model_mem, size);
LITE_CAPI_END();
}

int LITE_load_model_from_path(LiteNetwork network, const char* model_path) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
LITE_ASSERT(model_path, "The model path pass to LITE api is null");
static_cast<lite::Network*>(network)->load_model(model_path);
LITE_CAPI_END();
}

int LITE_destroy_network(LiteNetwork network) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
get_gloabl_network_holder().erase(network);
LITE_CAPI_END();
}

int LITE_forward(const LiteNetwork network) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
static_cast<lite::Network*>(network)->forward();
LITE_CAPI_END();
}

int LITE_wait(const LiteNetwork network) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
static_cast<lite::Network*>(network)->wait();
LITE_CAPI_END();
}

int LITE_get_io_tensor(LiteNetwork network, const char* io_name,
LiteTensorPhase phase, LiteTensor* tensor) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
auto io_tensor =
static_cast<lite::Network*>(network)->get_io_tensor(io_name, phase);
*tensor = io_tensor.get();
LITE_CAPI_END();
}

int LITE_get_input_name(const LiteNetwork network, size_t index,
const char** name) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network && name, "The network pass to LITE api is null");
*name = lite::NetworkHelper::implement(static_cast<lite::Network*>(network))
->get_input_name(index);
LITE_CAPI_END();
}

int LITE_get_output_name(const LiteNetwork network, size_t index,
const char** name) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
LITE_ASSERT(name, "The name ptr pass to LITE api is null");
*name = lite::NetworkHelper::implement(static_cast<lite::Network*>(network))
->get_output_name(index);
LITE_CAPI_END();
}

int LITE_get_all_input_name(const LiteNetwork network, size_t* size,
const char** name) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
auto&& names =
lite::NetworkHelper::implement(static_cast<lite::Network*>(network))
->get_all_input_name();
if (size)
*size = names.size();
if (name) {
for (auto in_name : names) {
*name = in_name;
name++;
}
}
LITE_CAPI_END();
}

int LITE_get_all_output_name(const LiteNetwork network, size_t* size,
const char** name) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
auto&& names =
lite::NetworkHelper::implement(static_cast<lite::Network*>(network))
->get_all_output_name();
if (size)
*size = names.size();
if (name) {
for (auto in_name : names) {
*name = in_name;
name++;
}
}
LITE_CAPI_END();
}

int LITE_set_device_id(LiteNetwork network, int device_id) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
static_cast<lite::Network*>(network)->set_device_id(device_id);
LITE_CAPI_END();
}

int LITE_get_device_id(const LiteNetwork network, int* device_id) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
LITE_ASSERT(device_id, "The device_id pass to LITE api is null");
*device_id = static_cast<lite::Network*>(network)->get_device_id();
LITE_CAPI_END();
}

int LITE_set_stream_id(LiteNetwork network, int stream_id) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
static_cast<lite::Network*>(network)->set_stream_id(stream_id);
LITE_CAPI_END();
}

int LITE_get_stream_id(const LiteNetwork network, int* stream_id) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
LITE_ASSERT(stream_id, "The stream_id pass to LITE api is null");
*stream_id = static_cast<lite::Network*>(network)->get_stream_id();
LITE_CAPI_END();
}

int LITE_get_model_extra_info(const LiteNetwork network, const char** info,
int* info_size) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
LITE_ASSERT(info && info_size, "The info and info_size are all null");
auto& extra_info =
static_cast<lite::Network*>(network)->get_model_extra_info();
*info_size = extra_info.size();
*info = extra_info.c_str();
LITE_MARK_USED_VAR(info);
LITE_CAPI_END();
}

int LITE_get_device_type(const LiteNetwork network,
LiteDeviceType* device_type) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
LITE_ASSERT(device_type, "The device_type pass to LITE api is null");
*device_type = static_cast<lite::Network*>(network)->get_device_type();
LITE_CAPI_END();
}

int LITE_set_async_callback(LiteNetwork network,
const LiteAsyncCallback async_callback) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
LITE_ASSERT(async_callback, "The ptr pass to LITE api is null");
static_cast<lite::Network*>(network)->set_async_callback(
std::move(async_callback));
LITE_CAPI_END();
}

int LITE_set_start_callback(LiteNetwork network,
const LiteStartCallback start_callback) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
auto lite_start_callback =
[start_callback](
const std::unordered_map<
std::string,
std::pair<lite::IO, std::shared_ptr<lite::Tensor>>>&
inputs_map) -> void {
std::vector<LiteIO> ios;
std::vector<LiteTensor> io_tensors;
size_t nr_io = 0;
for (const auto& io : inputs_map) {
nr_io++;
auto&& lite_io = io.second.first;
ios.push_back({lite_io.name.c_str(), lite_io.is_host,
lite_io.io_type,
convert_to_clayout(lite_io.config_layout)});
io_tensors.push_back(io.second.second.get());
}
start_callback(ios.data(), io_tensors.data(), nr_io);
};
static_cast<lite::Network*>(network)->set_start_callback(
lite_start_callback);
LITE_CAPI_END();
}

int LITE_set_finish_callback(LiteNetwork network,
const LiteFinishCallback finish_callback) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
auto lite_finish_callback =
[finish_callback](
const std::unordered_map<
std::string,
std::pair<lite::IO, std::shared_ptr<lite::Tensor>>>&
outputs_map) -> void {
std::vector<LiteIO> ios;
std::vector<LiteTensor> io_tensors;
size_t nr_io = 0;
for (const auto& io : outputs_map) {
nr_io++;
auto&& lite_io = io.second.first;
ios.push_back({lite_io.name.c_str(), lite_io.is_host,
lite_io.io_type,
convert_to_clayout(lite_io.config_layout)});
io_tensors.push_back(io.second.second.get());
}
finish_callback(ios.data(), io_tensors.data(), nr_io);
};
static_cast<lite::Network*>(network)->set_finish_callback(
lite_finish_callback);
LITE_CAPI_END();
}

int LITE_enable_profile_performance(LiteNetwork network,
const char* profile_json_file_path) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
static_cast<lite::Network*>(network)->enable_profile_performance(
profile_json_file_path);
LITE_CAPI_END();
}

int LITE_is_cpu_inplace_mode(const LiteNetwork network,
int* is_cpu_inplace_mode) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network && is_cpu_inplace_mode,
"The network pass to LITE api is null");
std::shared_ptr<lite::Network> network_shared{
static_cast<lite::Network*>(network), [](void*) {}};
*is_cpu_inplace_mode = lite::Runtime::is_cpu_inplace_mode(network_shared);
LITE_CAPI_END();
}

int LITE_get_cpu_threads_number(const LiteNetwork network, size_t* nr_threads) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
LITE_ASSERT(nr_threads, "The ptr pass to LITE api is null");
std::shared_ptr<lite::Network> network_shared{
static_cast<lite::Network*>(network), [](void*) {}};
*nr_threads = lite::Runtime::get_cpu_threads_number(network_shared);
LITE_CAPI_END();
}

int LITE_set_cpu_inplace_mode(LiteNetwork network) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
std::shared_ptr<lite::Network> network_shared{
static_cast<lite::Network*>(network), [](void*) {}};
lite::Runtime::set_cpu_inplace_mode(network_shared);
LITE_CAPI_END();
}

int LITE_use_tensorrt(LiteNetwork network){
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
std::shared_ptr<lite::Network> network_shared{
static_cast<lite::Network*>(network), [](void*) {}};
lite::Runtime::use_tensorrt(network_shared);
LITE_CAPI_END();
}

int LITE_set_cpu_threads_number(LiteNetwork network, size_t nr_threads) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
std::shared_ptr<lite::Network> network_shared{
static_cast<lite::Network*>(network), [](void*) {}};
lite::Runtime::set_cpu_threads_number(network_shared, nr_threads);
LITE_CAPI_END();
}

int LITE_set_network_algo_policy(LiteNetwork network,
LiteAlgoSelectStrategy strategy) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
std::shared_ptr<lite::Network> network_shared{
static_cast<lite::Network*>(network), [](void*) {}};
lite::Runtime::set_network_algo_policy(network_shared, strategy);
LITE_CAPI_END();
}

int LITE_set_network_algo_fastrun_config(LiteNetwork network,
unsigned int shared_batch_size,
int binary_equal_between_batch) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
std::shared_ptr<lite::Network> network_shared{
static_cast<lite::Network*>(network), [](void*) {}};
lite::Runtime::set_network_algo_policy(
network_shared, LiteAlgoSelectStrategy(0), shared_batch_size,
binary_equal_between_batch);
LITE_CAPI_END();
}

int LITE_set_network_algo_workspace_limit(LiteNetwork network,
size_t workspace_limit) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
std::shared_ptr<lite::Network> network_shared{
static_cast<lite::Network*>(network), [](void*) {}};
lite::Runtime::set_network_algo_workspace_limit(network_shared,
workspace_limit);
LITE_CAPI_END();
}

int LITE_set_runtime_thread_affinity(
LiteNetwork network,
const LiteThreadAffinityCallback thread_affinity_callback) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
std::shared_ptr<lite::Network> network_shared{
static_cast<lite::Network*>(network), [](void*) {}};
lite::Runtime::set_runtime_thread_affinity(
network_shared, std::move(thread_affinity_callback));
LITE_CAPI_END();
}

int LITE_set_memory_allocator(LiteNetwork network,
const LiteAllocate allocate_fun,
const LiteFree free_fun) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network && allocate_fun && free_fun,
"The ptr pass to LITE api is null");
std::shared_ptr<lite::Network> network_shared{
static_cast<lite::Network*>(network), [](void*) {}};
lite::Runtime::set_memory_allocator(
network_shared,
std::make_shared<UserAllocator>(allocate_fun, free_fun));
LITE_CAPI_END();
}

int LITE_enable_io_txt_dump(LiteNetwork network, const char* io_txt_out_file) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
std::shared_ptr<lite::Network> network_shared{
static_cast<lite::Network*>(network), [](void*) {}};
lite::Runtime::enable_io_txt_dump(network_shared, io_txt_out_file);
LITE_CAPI_END();
}

int LITE_enable_io_bin_dump(LiteNetwork network, const char* io_bin_out_dir) {
LITE_CAPI_BEGIN();
LITE_ASSERT(network, "The network pass to LITE api is null");
std::shared_ptr<lite::Network> network_shared{
static_cast<lite::Network*>(network), [](void*) {}};
lite::Runtime::enable_io_bin_dump(network_shared, io_bin_out_dir);
LITE_CAPI_END();
}

int LITE_shared_weight_with_network(LiteNetwork dst_network,
const LiteNetwork src_network) {
LITE_CAPI_BEGIN();
LITE_ASSERT(dst_network && src_network,
"The network pass to LITE api is null");
const std::shared_ptr<lite::Network> src_shared_net{
static_cast<lite::Network*>(src_network), [](void*) {}};
std::shared_ptr<lite::Network> dst_shared_net{
static_cast<lite::Network*>(dst_network), [](void*) {}};
lite::Runtime::shared_weight_with_network(dst_shared_net, src_shared_net);
LITE_CAPI_END();
}

int LITE_share_runtime_memroy(LiteNetwork dst_network,
LiteNetwork src_network) {
LITE_CAPI_BEGIN();
LITE_ASSERT(src_network && dst_network,
"The network pass to LITE api is null");
std::shared_ptr<lite::Network> src_shared{
static_cast<lite::Network*>(src_network), [](void*) {}};
std::shared_ptr<lite::Network> dst_shared{
static_cast<lite::Network*>(dst_network), [](void*) {}};
lite::Runtime::share_runtime_memory_with(dst_shared, src_shared);
LITE_CAPI_END();
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 257
- 0
lite/lite-c/src/tensor.cpp View File

@@ -0,0 +1,257 @@
/**
* \file lite-c/src/tensor.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "lite/tensor.h"
#include "../../src/tensor_impl_base.h"
#include "common.h"
#include "lite-c/tensor_c.h"
#include <set>
#include <string>
#include <unordered_map>

const LiteLayout default_layout = {.shapes = {0, 0, 0, 0, 0},
.ndim = 0,
.data_type = LiteDataType::LITE_FLOAT};

const LiteTensorDesc default_desc = {.is_pinned_host = false,
.layout = default_layout,
.device_type = LiteDeviceType::LITE_CPU,
.device_id = 0};
namespace {
std::unordered_map<void*, std::shared_ptr<lite::Tensor>>&
get_global_tensor_holder() {
static thread_local std::unordered_map<void*, std::shared_ptr<lite::Tensor>>
global_holder;
return global_holder;
}
std::unordered_map<std::string, lite::LiteAny>&
get_global_tensor_attr_holder() {
static thread_local std::unordered_map<std::string, lite::LiteAny>
global_holder;
return global_holder;
}
} // namespace

//! convert the lite::Layout to Layout
LiteLayout convert_to_clayout(const lite::Layout& layout) {
LiteLayout clayout;
clayout.ndim = layout.ndim;
LITE_ASSERT(layout.ndim < LAYOUT_MAX_DIM, "layout ndim is too large");
for (size_t i = 0; i < layout.ndim; i++) {
clayout.shapes[i] = layout.shapes[i];
}
clayout.data_type = layout.data_type;
return clayout;
}

//! convert the C Layout to lite::Layout
lite::Layout convert_to_layout(const LiteLayout& clayout) {
lite::Layout layout;
layout.ndim = clayout.ndim;
LITE_ASSERT(layout.ndim < LAYOUT_MAX_DIM, "clayout ndim is too large");
for (size_t i = 0; i < layout.ndim; i++) {
layout.shapes[i] = clayout.shapes[i];
}
layout.data_type = clayout.data_type;
return layout;
}

int LITE_make_tensor(const LiteTensorDesc tensor_describe, LiteTensor* tensor) {
LITE_CAPI_BEGIN();
LITE_ASSERT(tensor, "The tensor pass to LITE_make_tensor is null");
lite::Layout layout = convert_to_layout(tensor_describe.layout);
auto lite_tensor = std::make_shared<lite::Tensor>(
tensor_describe.device_id, tensor_describe.device_type, layout,
tensor_describe.is_pinned_host);
get_global_tensor_holder()[lite_tensor.get()] = lite_tensor;
*tensor = lite_tensor.get();
LITE_CAPI_END();
}

int LITE_destroy_tensor(LiteTensor tensor) {
LITE_CAPI_BEGIN();
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null");
get_global_tensor_holder().erase(tensor);
LITE_CAPI_END();
}

int LITE_set_tensor_layout(LiteTensor tensor, const LiteLayout layout) {
LITE_CAPI_BEGIN();
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null");
auto tensor_ptr = static_cast<lite::Tensor*>(tensor);
tensor_ptr->set_layout(convert_to_layout(layout));
LITE_CAPI_END();
}

int LITE_reset_tensor_memory(LiteTensor tensor, void* prepared_data,
size_t data_length_in_byte) {
LITE_CAPI_BEGIN();
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null");
LITE_ASSERT(prepared_data, "The prepared_data pass to LITE c_api is null");
static_cast<lite::Tensor*>(tensor)->reset(prepared_data,
data_length_in_byte);
LITE_CAPI_END();
}

int LITE_reset_tensor(LiteTensor tensor, const LiteLayout layout,
void* prepared_data) {
LITE_CAPI_BEGIN();
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null");
LITE_ASSERT(prepared_data, "The prepared_data pass to LITE c_api is null");
static_cast<lite::Tensor*>(tensor)->reset(prepared_data,
convert_to_layout(layout));
LITE_CAPI_END();
}

int LITE_tensor_reshape(LiteTensor tensor, const int* shape, int size) {
LITE_CAPI_BEGIN();
LITE_ASSERT(tensor && shape, "The tensor pass to LITE c_api is null");
std::vector<int> shapes;
for (int i = 0; i < size; i++) {
shapes.push_back(shape[i]);
}
static_cast<lite::Tensor*>(tensor)->reshape(shapes);
LITE_CAPI_END();
}

int LITE_tensor_slice(const LiteTensor tensor, const size_t* start,
const size_t* end, const size_t* step, size_t size,
LiteTensor* slice_tensor) {
LITE_CAPI_BEGIN();
LITE_ASSERT(tensor && start && end && slice_tensor,
"The tensor pass to LITE c_api is null");
std::vector<size_t> starts, ends, steps;
for (size_t i = 0; i < size; i++) {
starts.push_back(start[i]);
ends.push_back(end[i]);
if (step) {
steps.push_back(step[i]);
}
}
auto ret_tensor =
static_cast<lite::Tensor*>(tensor)->slice(starts, ends, steps);
get_global_tensor_holder()[ret_tensor.get()] = ret_tensor;
*slice_tensor = ret_tensor.get();
LITE_CAPI_END();
}

int LITE_tensor_fill_zero(LiteTensor tensor) {
LITE_CAPI_BEGIN();
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null");
static_cast<lite::Tensor*>(tensor)->fill_zero();
LITE_CAPI_END();
}

int LITE_tensor_copy(LiteTensor dst_tensor, const LiteTensor src_tensor) {
LITE_CAPI_BEGIN();
LITE_ASSERT(dst_tensor && src_tensor,
"The tensor pass to LITE c_api is null");
static_cast<lite::Tensor*>(dst_tensor)
->copy_from(*static_cast<lite::Tensor*>(src_tensor));
LITE_CAPI_END();
}

int LITE_tensor_share_memory_with(LiteTensor dst_tensor,
const LiteTensor src_tensor) {
LITE_CAPI_BEGIN();
LITE_ASSERT(dst_tensor && src_tensor,
"The tensor pass to LITE c_api is null");
static_cast<lite::Tensor*>(dst_tensor)
->share_memory_with(*static_cast<lite::Tensor*>(src_tensor));
LITE_CAPI_END();
}

int LITE_get_tensor_memory(const LiteTensor tensor, void** data) {
LITE_CAPI_BEGIN();
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null");
LITE_ASSERT(data, "The data ptr pass to LITE c_api is null");
*data = static_cast<lite::Tensor*>(tensor)->get_memory_ptr();
LITE_CAPI_END();
}

int LITE_get_tensor_memory_with_index(const LiteTensor tensor,
const size_t* index, size_t size,
void** data) {
LITE_CAPI_BEGIN();
LITE_ASSERT(tensor && index && data,
"The tensor pass to LITE c_api is null");
std::vector<size_t> index_v;
for (size_t i = 0; i < size; i++) {
index_v.push_back(index[i]);
}
*data = static_cast<lite::Tensor*>(tensor)->get_memory_ptr(index_v);
LITE_CAPI_END();
}

int LITE_get_tensor_total_size_in_byte(const LiteTensor tensor, size_t* size) {
LITE_CAPI_BEGIN();
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null");
LITE_ASSERT(size, "The size ptr pass to LITE c_api is null");
*size = static_cast<lite::Tensor*>(tensor)->get_tensor_total_size_in_byte();
LITE_CAPI_END();
}

int LITE_get_tensor_layout(const LiteTensor tensor, LiteLayout* layout) {
LITE_CAPI_BEGIN();
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null");
LITE_ASSERT(layout, "The layout ptr pass to LITE c_api is null");
*layout = convert_to_clayout(
static_cast<lite::Tensor*>(tensor)->get_layout());
LITE_CAPI_END();
}

int LITE_get_tensor_device_type(const LiteTensor tensor,
LiteDeviceType* device_type) {
LITE_CAPI_BEGIN();
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null");
LITE_ASSERT(device_type, "The device ptr pass to LITE c_api is null");
*device_type = static_cast<lite::Tensor*>(tensor)->get_device_type();
LITE_CAPI_END();
}

int LITE_get_tensor_device_id(const LiteTensor tensor, int* device_id) {
LITE_CAPI_BEGIN();
LITE_ASSERT(tensor && device_id, "The tensor pass to LITE c_api is null");
*device_id = static_cast<lite::Tensor*>(tensor)->get_device_id();
LITE_CAPI_END();
}

int LITE_is_pinned_host(const LiteTensor tensor, int* is_pinned_host) {
LITE_CAPI_BEGIN();
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null");
LITE_ASSERT(is_pinned_host,
"The is_pinned_host ptr pass to LITE c_api is null");
*is_pinned_host = static_cast<lite::Tensor*>(tensor)->is_pinned_host();
LITE_CAPI_END();
}

int LITE_is_memory_continue(const LiteTensor tensor, int* is_continue) {
LITE_CAPI_BEGIN();
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null");
LITE_ASSERT(is_continue, "The is_continue ptr pass to LITE c_api is null");
*is_continue = static_cast<lite::Tensor*>(tensor)->is_continue_memory();
LITE_CAPI_END();
}

int LITE_tensor_concat(LiteTensor* tensors, int nr_tensor, int dim,
LiteDeviceType dst_device, int device_id,
LiteTensor* result_tensor) {
LITE_CAPI_BEGIN();
std::vector<lite::Tensor> v_tensors;
for (int i = 0; i < nr_tensor; i++) {
v_tensors.push_back(*static_cast<lite::Tensor*>(tensors[i]));
}
auto tensor =
lite::TensorUtils::concat(v_tensors, dim, dst_device, device_id);
get_global_tensor_holder()[tensor.get()] = tensor;
*result_tensor = tensor.get();
LITE_CAPI_END()
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 12
- 0
lite/pylite/megenginelite/__init__.py View File

@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-
# This file is part of MegEngine, a deep learning framework developed by
# Megvii.
#
# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved.

from .base import *
from .global_setting import *
from .network import *
from .struct import *
from .tensor import *
from .utils import *

+ 152
- 0
lite/pylite/megenginelite/base.py View File

@@ -0,0 +1,152 @@
# -*- coding: utf-8 -*-
# This file is part of MegEngine, a deep learning framework developed by
# Megvii.
#
# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved.

import ctypes
import glob
import logging
import os
import sys
from ctypes import *

if sys.platform == "win32":
lib_path = os.path.join(os.path.dirname(__file__), "libs")
dll_paths = list(filter(os.path.exists, [lib_path,]))
assert len(dll_paths) > 0

kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True)
has_load_library_attr = hasattr(kernel32, "AddDllDirectory")
old_error_mode = kernel32.SetErrorMode(0x0001)

kernel32.LoadLibraryW.restype = ctypes.c_void_p
if has_load_library_attr:
kernel32.AddDllDirectory.restype = ctypes.c_void_p
kernel32.LoadLibraryExW.restype = ctypes.c_void_p

for dll_path in dll_paths:
if sys.version_info >= (3, 8):
os.add_dll_directory(dll_path)
elif has_load_library_attr:
res = kernel32.AddDllDirectory(dll_path)
if res is None:
err = ctypes.WinError(ctypes.get_last_error())
err.strerror += ' Error adding "{}" to the DLL search PATH.'.format(
dll_path
)
raise err
else:
print("WARN: python or OS env have some issue, may load DLL failed!!!")

import glob

dlls = glob.glob(os.path.join(lib_path, "*.dll"))
path_patched = False
for dll in dlls:
is_loaded = False
if has_load_library_attr:
res = kernel32.LoadLibraryExW(dll, None, 0x00001100)
last_error = ctypes.get_last_error()
if res is None and last_error != 126:
err = ctypes.WinError(last_error)
err.strerror += ' Error loading "{}" or one of its dependencies.'.format(
dll
)
raise err
elif res is not None:
is_loaded = True
if not is_loaded:
if not path_patched:
os.environ["PATH"] = ";".join(dll_paths + [os.environ["PATH"]])
path_patched = True
res = kernel32.LoadLibraryW(dll)
if res is None:
err = ctypes.WinError(ctypes.get_last_error())
err.strerror += ' Error loading "{}" or one of its dependencies.'.format(
dll
)
raise err

kernel32.SetErrorMode(old_error_mode)


class _LiteCLib:
def __init__(self):
cwd = os.getcwd()
package_dir = os.path.dirname(os.path.realpath(__file__))
debug_path = os.getenv("LITE_LIB_PATH")
os.chdir(package_dir)
lite_libs = glob.glob("libs/liblite*")
os.chdir(cwd)

if debug_path is None:
assert len(lite_libs) == 1
self._lib = CDLL(os.path.join(package_dir, lite_libs[0]))
else:
self._lib = CDLL(debug_path)
self._register_api(
"LITE_get_version", [POINTER(c_int), POINTER(c_int), POINTER(c_int)]
)
self.lib.LITE_get_version.restype = None
self._register_api("LITE_set_log_level", [c_int])
self._register_api("LITE_get_log_level", [])
self._register_api("LITE_get_last_error", [], False)
self.lib.LITE_get_last_error.restype = c_char_p

def _errcheck(self, result, func, args):
if result:
error = self.lib.LITE_get_last_error()
msg = error.decode("utf-8")
logging.error("{}".format(msg))
raise RuntimeError("{}".format(msg))
return result

def _register_api(self, api_name, arg_types, error_check=True):
func = getattr(self.lib, api_name)
func.argtypes = arg_types
func.restype = c_int
if error_check:
func.errcheck = self._errcheck

@property
def lib(self):
return self._lib

@property
def version(self):
major = c_int()
minor = c_int()
patch = c_int()
self.lib.LITE_get_version(byref(major), byref(minor), byref(patch))
return "{}.{}.{}".format(major.value, minor.value, patch.value)

def set_log_level(self, level):
self.lib.LITE_set_log_level(level)

def get_log_level(self):
return self.lib.LITE_get_log_level()


_lib = _LiteCLib()
version = _lib.version
set_log_level = _lib.set_log_level
get_log_level = _lib.get_log_level

_Cnetwork = c_void_p
_Ctensor = c_void_p


class _LiteCObjMetaClass(type):
"""metaclass for lite object"""

def __new__(cls, name, bases, attrs):
for api in attrs["_api_"]:
_lib._register_api(*api)
del attrs["_api_"]
attrs["_lib"] = _lib.lib
return super().__new__(cls, name, bases, attrs)


class _LiteCObjBase(metaclass=_LiteCObjMetaClass):
_api_ = []

+ 120
- 0
lite/pylite/megenginelite/global_setting.py View File

@@ -0,0 +1,120 @@
# -*- coding: utf-8 -*-
# This file is part of MegEngine, a deep learning framework developed by
# Megvii.
#
# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved.

from ctypes import *

import numpy as np

from .base import _Ctensor, _lib, _LiteCObjBase
from .network import *
from .struct import LiteDataType, LiteDeviceType, LiteIOType, Structure
from .tensor import *

LiteDecryptionFunc = CFUNCTYPE(
c_size_t, c_void_p, c_size_t, POINTER(c_uint8), c_size_t, c_void_p
)


class _GlobalAPI(_LiteCObjBase):
"""
get the api from the lib
"""

_api_ = [
("LITE_get_device_count", [c_int, POINTER(c_size_t)]),
("LITE_try_coalesce_all_free_memory", []),
(
"LITE_register_decryption_and_key",
[c_char_p, LiteDecryptionFunc, POINTER(c_uint8), c_size_t],
),
(
"LITE_update_decryption_or_key",
[c_char_p, c_void_p, POINTER(c_uint8), c_size_t],
),
("LITE_set_loader_lib_path", [c_char_p]),
("LITE_set_persistent_cache", [c_char_p, c_int]),
# ('LITE_set_tensor_rt_cache', [c_char_p]),
("LITE_dump_persistent_cache", [c_char_p]),
("LITE_dump_tensor_rt_cache", [c_char_p]),
]


def decryption_func(func):
"""the decryption function decorator
:type func: a function that accepts three arrays (in_arr, key_arr and out_arr); if out_arr is None, it only queries the output array length in bytes
"""

@CFUNCTYPE(c_size_t, c_void_p, c_size_t, POINTER(c_uint8), c_size_t, c_void_p)
def wrapper(c_in_data, in_length, c_key_data, key_length, c_out_data):
in_arr = np.frombuffer(c_in_data, dtype=np.uint8, count=in_length)
key_arr = np.frombuffer(c_key_data, dtype=np.uint8, count=key_length)
if c_out_data:
out_length = func(in_arr, None)
out_arr = np.frombuffer(c_out_data, dtype=np.uint8, count=out_length)
return func(in_arr, key_arr, out_arr)
# just query the output length
else:
return func(in_arr, key_arr, None)

return wrapper


class LiteGlobal(object):
"""
some global config in lite
"""

_api = _GlobalAPI()._lib

@staticmethod
def register_decryption_and_key(decryption_name, decryption_func, key):
c_name = c_char_p(decryption_name.encode("utf-8"))
key_length = len(key)
c_key = (c_uint8 * key_length)(*key)
LiteGlobal._api.LITE_register_decryption_and_key(
c_name, decryption_func, c_key, key_length
)

@staticmethod
def update_decryption_key(decryption_name, key):
c_name = c_char_p(decryption_name.encode("utf-8"))
key_length = len(key)
c_key = (c_uint8 * key_length)(*key)
LiteGlobal._api.LITE_update_decryption_or_key(c_name, None, c_key, key_length)

@staticmethod
def set_loader_lib_path(path):
c_path = c_char_p(path.encode("utf-8"))
LiteGlobal._api.LITE_set_loader_lib_path(c_path)

@staticmethod
def set_persistent_cache(path, always_sync=False):
c_path = c_char_p(path.encode("utf-8"))
LiteGlobal._api.LITE_set_persistent_cache(c_path, always_sync)

@staticmethod
def set_tensorrt_cache(path):
c_path = c_char_p(path.encode("utf-8"))
LiteGlobal._api.LITE_set_tensorrt_cache(c_path)

@staticmethod
def dump_persistent_cache(path):
c_path = c_char_p(path.encode("utf-8"))
LiteGlobal._api.LITE_dump_persistent_cache(c_path)

@staticmethod
def dump_tensorrt_cache():
LiteGlobal._api.LITE_dump_tensorrt_cache()

@staticmethod
def get_device_count(device_type):
count = c_size_t()
LiteGlobal._api.LITE_get_device_count(device_type, byref(count))
return count.value

@staticmethod
def try_coalesce_all_free_memory():
LiteGlobal._api.LITE_try_coalesce_all_free_memory()

+ 531
- 0
lite/pylite/megenginelite/network.py View File

@@ -0,0 +1,531 @@
# -*- coding: utf-8 -*-
# This file is part of MegEngine, a deep learning framework developed by
# Megvii.
#
# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved.

from ctypes import *

import numpy as np

from .base import _Cnetwork, _Ctensor, _lib, _LiteCObjBase
from .struct import *
from .tensor import *


class LiteOptions(Structure):
"""
the inference options used to configure a network
"""

_fields_ = [
("weight_preprocess", c_int),
("fuse_preprocess", c_int),
("fake_next_exec", c_int),
("var_sanity_check_first_run", c_int),
("const_shape", c_int),
("force_dynamic_alloc", c_int),
("force_output_dynamic_alloc", c_int),
("no_profiling_on_shape_change", c_int),
("jit_level", c_int),
("comp_node_seq_record_level", c_int),
("graph_opt_level", c_int),
("async_exec_level", c_int),
# layout transform options
("enable_nchw44", c_int),
("enable_nchw44_dot", c_int),
("enable_nchw88", c_int),
("enable_nhwcd4", c_int),
("enable_nchw4", c_int),
("enable_nchw32", c_int),
("enable_nchw64", c_int),
]

def __init__(self):
self.weight_preprocess = False
self.fuse_preprocess = False
self.fake_next_exec = False
self.var_sanity_check_first_run = True
self.const_shape = False
self.force_dynamic_alloc = False
self.force_output_dynamic_alloc = False
self.no_profiling_on_shape_change = False
self.jit_level = 0
self.comp_node_seq_record_level = 0
self.graph_opt_level = 2
self.async_exec_level = 1

def __repr__(self):
data = {
"weight_preprocess": bool(self.weight_preprocess),
"fuse_preprocess": bool(self.fuse_preprocess),
"fake_next_exec": bool(self.fake_next_exec),
"var_sanity_check_first_run": bool(self.var_sanity_check_first_run),
"const_shape": bool(self.const_shape),
"force_dynamic_alloc": bool(self.force_dynamic_alloc),
"force_output_dynamic_alloc": bool(self.force_output_dynamic_alloc),
"no_profiling_on_shape_change": bool(self.no_profiling_on_shape_change),
"jit_level": self.jit_level,
"comp_node_seq_record_level": self.comp_node_seq_record_level,
"graph_opt_level": self.graph_opt_level,
"async_exec_level": self.async_exec_level,
}
return data.__repr__()


class LiteConfig(Structure):
"""
Configuration used when loading and compiling the graph

bare_model_cryption_name: the name of the cryption method of the bare
model; a bare model has no model info packed inside

use_loader_dynamic_param: when the model forwards with the device loader
of an NPU, this flag marks whether the loader uses device input or output;
set it to non-zero if device input or output is used, otherwise zero

has_compression: flag marking whether the model is compressed; the
compression method will be used when reading the model
"""

_fields_ = [
("has_compression", c_int),
("device_id", c_int),
("device_type", c_int),
("backend", c_int),
("bare_model_cryption_name", c_char_p),
("options", LiteOptions),
]

def __init__(self, device_type=LiteDeviceType.LITE_CPU, option=None):
self.device_type = device_type
if option:
self.options = option
else:
self.options = LiteOptions()

self.bare_model_cryption_name = c_char_p(b"")
self.use_loader_dynamic_param = 0
self.has_compression = 0
self.backend = LiteBackend.LITE_DEFAULT

def __repr__(self):
data = {
"has_compression": bool(self.has_compression),
"device_id": LiteDeviceType(self.device_id),
"device_type": LiteDeviceType(self.device_type),
"backend": LiteBackend(self.backend),
"bare_model_cryption_name": self.bare_model_cryption_name.decode("utf-8"),
"options": self.options,
}
return data.__repr__()


class LiteIO(Structure):
"""
configure the network input and output items

name: the tensor name in the graph corresponding to the IO

is_host: used to mark where the input tensor comes from and where the
output is copied to; if is_host is true, the input comes from the host and
the output is copied to the host, otherwise the device. Sometimes the
input comes from the device and the output need not be copied to the
host. Default is true.

io_type: the IO type, which can be SHAPE or VALUE; when SHAPE is set, the
value of the input or output tensor is invalid and only the shape is set.
Default is VALUE.

config_layout: the layout configured by the user; if another layout is set
before forward or got after forward, this layout is bypassed. If no other
layout is set before forward, this layout takes effect. If this layout is
not set, the model forwards with its original layout. For an output, it is
used for checking.
"""

_fields_ = [
("name", c_char_p),
("is_host", c_int),
("io_type", c_int),
("config_layout", LiteLayout),
]

def __init__(
self, name, is_host=True, io_type=LiteIOType.LITE_IO_VALUE, layout=None
):
if type(name) == str:
self.name = c_char_p(name.encode("utf-8"))
else:
self.name = c_char_p(name)

if layout:
self.config_layout = layout
else:
self.config_layout = LiteLayout()

self.is_host = is_host
self.io_type = io_type

def __repr__(self):
data = {
"name": self.name,
"is_host": bool(self.is_host),
"io_type": LiteIOType(self.io_type),
"config_layout": self.config_layout,
}
return data.__repr__()

def __hash__(self):
return hash(self.name)


class _LiteNetworkIO(Structure):
"""
the input and output information when loading the network
"""

_fields_ = [
("inputs", POINTER(LiteIO)),
("outputs", POINTER(LiteIO)),
("input_size", c_size_t),
("output_size", c_size_t),
]

def __init__(self):
self.inputs = POINTER(LiteIO)()
self.outputs = POINTER(LiteIO)()
self.input_size = 0
self.output_size = 0


class LiteNetworkIO(object):
"""
the input and output information for the user, used to construct _LiteNetworkIO
"""

def __init__(self):
self.inputs = []
self.outputs = []

def add_input(self, input_io):
assert isinstance(input_io, LiteIO)
self.inputs.append(input_io)

def add_output(self, output_io):
assert isinstance(output_io, LiteIO)
self.outputs.append(output_io)

def _create_network_io(self):
network_io = _LiteNetworkIO()
length = 1 if len(self.inputs) == 0 else len(self.inputs)
self.c_inputs = (LiteIO * length)(*self.inputs)
length = 1 if len(self.outputs) == 0 else len(self.outputs)
self.c_outputs = (LiteIO * length)(*self.outputs)
network_io.inputs = pointer(self.c_inputs[0])
network_io.outputs = pointer(self.c_outputs[0])
network_io.input_size = len(self.inputs)
network_io.output_size = len(self.outputs)
return network_io

def __repr__(self):
data = {"inputs": list(self.inputs), "outputs": list(self.outputs)}
return data.__repr__()


LiteAsyncCallback = CFUNCTYPE(c_int)


def start_finish_callback(func):
@CFUNCTYPE(c_int, POINTER(LiteIO), POINTER(_Ctensor), c_size_t)
def wrapper(c_ios, c_tensors, size):
ios = {}
for i in range(size):
tensor = LiteTensor()
tensor._tensor = c_tensors[i]
tensor.update()
io = c_ios[i]
ios[io] = tensor
return func(ios)

return wrapper


class _NetworkAPI(_LiteCObjBase):
"""
get the network api from the lib
"""

_api_ = [
("LITE_make_default_network", [POINTER(_Cnetwork)]),
("LITE_make_network", [POINTER(_Cnetwork), LiteConfig, _LiteNetworkIO]),
("LITE_load_model_from_mem", [_Cnetwork, c_void_p, c_size_t]),
("LITE_load_model_from_path", [_Cnetwork, c_char_p]),
("LITE_shared_weight_with_network", [_Cnetwork, _Ctensor]),
("LITE_destroy_network", [_Cnetwork]),
("LITE_forward", [_Cnetwork]),
("LITE_wait", [_Cnetwork]),
("LITE_get_io_tensor", [_Cnetwork, c_char_p, c_int, POINTER(_Ctensor)]),
("LITE_get_input_name", [_Cnetwork, c_size_t, POINTER(c_char_p)]),
("LITE_get_output_name", [_Cnetwork, c_size_t, POINTER(c_char_p)]),
("LITE_get_all_input_name", [_Cnetwork, POINTER(c_size_t), POINTER(c_char_p)]),
("LITE_get_all_output_name", [_Cnetwork, POINTER(c_size_t), POINTER(c_char_p)]),
("LITE_is_cpu_inplace_mode", [_Cnetwork, POINTER(c_int)]),
("LITE_get_cpu_threads_number", [_Cnetwork, POINTER(c_size_t)]),
("LITE_get_device_id", [_Cnetwork, POINTER(c_int)]),
("LITE_set_device_id", [_Cnetwork, c_int]),
("LITE_set_cpu_inplace_mode", [_Cnetwork]),
("LITE_use_tensorrt", [_Cnetwork]),
("LITE_set_cpu_threads_number", [_Cnetwork, c_size_t]),
("LITE_set_stream_id", [_Cnetwork, c_int]),
("LITE_get_stream_id", [_Cnetwork, POINTER(c_int)]),
("LITE_set_network_algo_policy", [_Cnetwork, c_int]),
("LITE_set_network_algo_fastrun_config", [_Cnetwork, c_int, c_int]),
("LITE_set_network_algo_workspace_limit", [_Cnetwork, c_size_t]),
("LITE_share_runtime_memroy", [_Cnetwork, _Cnetwork]),
("LITE_enable_profile_performance", [_Cnetwork, c_char_p]),
("LITE_enable_io_txt_dump", [_Cnetwork, c_char_p]),
("LITE_enable_io_bin_dump", [_Cnetwork, c_char_p]),
("LITE_set_async_callback", [_Cnetwork, LiteAsyncCallback]),
("LITE_set_start_callback", [_Cnetwork]),
("LITE_set_finish_callback", [_Cnetwork]),
]


class LiteNetwork(object):
"""
the network to load a model and forward
"""

_api = _NetworkAPI()._lib

def __init__(self, config=None, io=None):
"""
create a network with config and networkio
"""
self._network = _Cnetwork()

if config:
self.config = config
else:
self.config = LiteConfig()

if io:
self.network_io = io
else:
self.network_io = LiteNetworkIO()

c_network_io = self.network_io._create_network_io()
self._api.LITE_make_network(byref(self._network), self.config, c_network_io)

def __repr__(self):
data = {"config": self.config, "IOs": self.network_io}
return data.__repr__()

def __del__(self):
self._api.LITE_destroy_network(self._network)

def load(self, path):
c_path = c_char_p(path.encode("utf-8"))
self._api.LITE_load_model_from_path(self._network, c_path)

def forward(self):
self._api.LITE_forward(self._network)

def wait(self):
self._api.LITE_wait(self._network)

def is_cpu_inplace_mode(self):
"""
whether the network runs in CPU inplace mode
"""
inplace = c_int()
self._api.LITE_is_cpu_inplace_mode(self._network, byref(inplace))
return bool(inplace.value)

def enable_cpu_inplace_mode(self):
"""
set CPU forward to inplace mode, in which CPU forward creates only one
thread
Note: this must be set before the network is loaded
"""
self._api.LITE_set_cpu_inplace_mode(self._network)

def use_tensorrt(self):
"""
Note: this must be set before the network is loaded
"""
self._api.LITE_use_tensorrt(self._network)

@property
def device_id(self):
"""
get the device id
"""
device_id = c_int()
self._api.LITE_get_device_id(self._network, byref(device_id))
return device_id.value

@device_id.setter
def device_id(self, device_id):
"""
set the device id
Note: this must be set before the network is loaded
"""
self._api.LITE_set_device_id(self._network, device_id)

@property
def stream_id(self):
"""
get the stream id
"""
stream_id = c_int()
self._api.LITE_get_stream_id(self._network, byref(stream_id))
return stream_id.value

@stream_id.setter
def stream_id(self, stream_id):
"""
set the stream id
Note: this must be set before the network is loaded
"""
self._api.LITE_set_stream_id(self._network, stream_id)

@property
def threads_number(self):
"""
get the thread number of the network
"""
nr_thread = c_size_t()
self._api.LITE_get_cpu_threads_number(self._network, byref(nr_thread))
return nr_thread.value

@threads_number.setter
def threads_number(self, nr_threads):
"""
set the network to forward in multithread mode, and set the thread number
Note: this must be set before the network is loaded
"""
self._api.LITE_set_cpu_threads_number(self._network, nr_threads)

def get_io_tensor(self, name, phase=LiteTensorPhase.LITE_IO):
"""
get input or output tensor by its name
"""
if type(name) == str:
c_name = c_char_p(name.encode("utf-8"))
else:
c_name = c_char_p(name)
tensor = LiteTensor()
self._api.LITE_get_io_tensor(
self._network, c_name, phase, byref(tensor._tensor)
)
tensor.update()
return tensor

def get_input_name(self, index):
"""
get the input name by the index in the network
"""
c_name = c_char_p()
self._api.LITE_get_input_name(self._network, index, byref(c_name))
return c_name.value.decode("utf-8")

def get_output_name(self, index):
"""
get the output name by the index in the network
"""
c_name = c_char_p()
self._api.LITE_get_output_name(self._network, index, byref(c_name))
return c_name.value.decode("utf-8")

def get_all_input_name(self):
"""
get all the input tensor name in the network
"""
nr_input = c_size_t()
self._api.LITE_get_all_input_name(self._network, byref(nr_input), None)

if nr_input.value > 0:
names = (c_char_p * nr_input.value)()
self._api.LITE_get_all_input_name(self._network, None, names)
ret_name = [names[i].decode("utf-8") for i in range(nr_input.value)]
return ret_name

def get_all_output_name(self):
"""
get all the output tensor name in the network
"""
nr_output = c_size_t()
self._api.LITE_get_all_output_name(self._network, byref(nr_output), None)

if nr_output.value > 0:
names = (c_char_p * nr_output.value)()
self._api.LITE_get_all_output_name(self._network, None, names)
ret_name = [names[i].decode("utf-8") for i in range(nr_output.value)]
return ret_name

def share_weights_with(self, src_network):
"""
share weights with the loaded network
"""
assert isinstance(src_network, LiteNetwork)
self._api.LITE_shared_weight_with_network(self._network, src_network._network)

def share_runtime_memroy(self, src_network):
"""
share runtime memory with the source network
"""
assert isinstance(src_network, LiteNetwork)
self._api.LITE_share_runtime_memroy(self._network, src_network._network)

def async_with_callback(self, async_callback):
async_callback = LiteAsyncCallback(async_callback)
self._api.LITE_set_async_callback(self._network, async_callback)

def set_start_callback(self, start_callback):
"""
when the network starts forward, the callback will be called;
the start_callback takes a parameter mapping from LiteIO to the
corresponding LiteTensor
"""
self._api.LITE_set_start_callback(self._network, start_callback)

def set_finish_callback(self, finish_callback):
"""
when the network finishes forward, the callback will be called;
the finish_callback takes a parameter mapping from LiteIO to the
corresponding LiteTensor
"""
self._api.LITE_set_finish_callback(self._network, finish_callback)

def enable_profile_performance(self, profile_file):
c_file = profile_file.encode("utf-8")
self._api.LITE_enable_profile_performance(self._network, c_file)

def set_network_algo_workspace_limit(self, size_limit):
self._api.LITE_set_network_algo_workspace_limit(self._network, size_limit)

def set_network_algo_policy(
self, policy, shared_batch_size=0, binary_equal_between_batch=False
):
"""
shared_batch_size: the batch size used by fastrun; a non-zero value
means that fastrun uses this batch size regardless of the batch size
of the model, zero means fastrun uses the batch size of the model
binary_equal_between_batch: if the content of each input batch is
binary equal, whether the content of each output batch is
promised to be equal
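usage sketch (the strategy values can be OR-ed together, the values below are only illustrative):
strategy = LiteAlgoSelectStrategy.LITE_ALGO_PROFILE | LiteAlgoSelectStrategy.LITE_ALGO_REPRODUCIBLE
network.set_network_algo_policy(strategy, shared_batch_size=1)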

"""
self._api.LITE_set_network_algo_policy(self._network, policy)
self._api.LITE_set_network_algo_fastrun_config(
self._network, shared_batch_size, binary_equal_between_batch
)

def io_txt_dump(self, txt_file):
c_file = txt_file.encode("utf-8")
self._api.LITE_enable_io_txt_dump(self._network, c_file)

def io_bin_dump(self, bin_dir):
c_dir = bin_dir.encode("utf-8")
self._api.LITE_enable_io_bin_dump(self._network, c_dir)

+ 90
- 0
lite/pylite/megenginelite/struct.py View File

@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-
# This file is part of MegEngine, a deep learning framework developed by
# Megvii.
#
# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved.

import logging
from ctypes import *
from enum import Enum, IntEnum


class LiteBackend(IntEnum):
LITE_DEFAULT = 0


class LiteDeviceType(IntEnum):
LITE_CPU = 0
LITE_CUDA = 1
LITE_ATLAS = 3
LITE_NPU = 4
LITE_DEVICE_DEFAULT = 5


class LiteDataType(IntEnum):
LITE_FLOAT = 0
LITE_HALF = 1
LITE_INT = 2
LITE_INT16 = 3
LITE_INT8 = 4
LITE_UINT8 = 5


class LiteTensorPhase(IntEnum):
LITE_IO = 0
LITE_INPUT = 1
LITE_OUTPUT = 2


class LiteIOType(IntEnum):
"""
the input and output type, including SHAPE and VALUE;
sometimes the user only needs the shape of the output tensor
"""

LITE_IO_VALUE = 0
LITE_IO_SHAPE = 1


class LiteAlgoSelectStrategy(IntEnum):
"""
operation algorithm selection strategy type; some operations have
multiple algorithms, and each algorithm has different attributes. According
to the strategy, the best algorithm will be selected.

Note: these strategies can be combined

LITE_ALGO_HEURISTIC | LITE_ALGO_PROFILE means: if the profile cache is not
valid, use heuristic instead

LITE_ALGO_HEURISTIC | LITE_ALGO_REPRODUCIBLE means: the heuristic chooses a
reproducible algorithm

LITE_ALGO_PROFILE | LITE_ALGO_REPRODUCIBLE means: profile the best
algorithm from the set of reproducible algorithms

LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED means: profile the best
algorithm from the optimized algorithms, so profiling runs fast

LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED | LITE_ALGO_REPRODUCIBLE means:
profile the best algorithm from the optimized and reproducible algorithms
"""

LITE_ALGO_HEURISTIC = 1
LITE_ALGO_PROFILE = 2
LITE_ALGO_REPRODUCIBLE = 4
LITE_ALGO_OPTIMIZED = 8


class LiteLogLevel(IntEnum):
"""
DEBUG: The most verbose level, printing debugging info
INFO: The default level
WARN: Printing warnings
ERROR: The least verbose level, printing errors only
"""

DEBUG = 0
INFO = 1
WARN = 2
ERROR = 3

+ 471
- 0
lite/pylite/megenginelite/tensor.py View File

@@ -0,0 +1,471 @@
# -*- coding: utf-8 -*-
# This file is part of MegEngine, a deep learning framework developed by
# Megvii.
#
# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved.

from ctypes import *

import numpy as np

from .base import _Ctensor, _lib, _LiteCObjBase
from .struct import LiteDataType, LiteDeviceType, LiteIOType, Structure

MAX_DIM = 7

_lite_type_to_nptypes = {
LiteDataType.LITE_INT: np.int32,
LiteDataType.LITE_FLOAT: np.float32,
LiteDataType.LITE_UINT8: np.uint8,
LiteDataType.LITE_INT8: np.int8,
LiteDataType.LITE_INT16: np.int16,
LiteDataType.LITE_HALF: np.float16,
}

_nptype_to_lite_type = {val: key for key, val in _lite_type_to_nptypes.items()}

_str_nptypes_to_lite_nptypes = {
np.dtype("int32"): LiteDataType.LITE_INT,
np.dtype("float32"): LiteDataType.LITE_FLOAT,
np.dtype("uint8"): LiteDataType.LITE_UINT8,
np.dtype("int8"): LiteDataType.LITE_INT8,
np.dtype("int16"): LiteDataType.LITE_INT16,
np.dtype("float16"): LiteDataType.LITE_HALF,
}

ctype_to_lite_dtypes = {
c_int: LiteDataType.LITE_INT,
c_uint: LiteDataType.LITE_INT,
c_float: LiteDataType.LITE_FLOAT,
c_ubyte: LiteDataType.LITE_UINT8,
c_byte: LiteDataType.LITE_INT8,
c_short: LiteDataType.LITE_INT16,
c_ushort: LiteDataType.LITE_INT16,
}


class LiteLayout(Structure):
"""
the simple layout description
"""

_fields_ = [
("shapes", c_size_t * MAX_DIM),
("ndim", c_size_t),
("data_type", c_int),
]

def __init__(self, shape=None, dtype=None):
if shape:
shape = list(shape)
assert len(shape) <= MAX_DIM, "Layout max dim is 7."
self.shapes = (c_size_t * MAX_DIM)(*shape)
self.ndim = len(shape)
else:
self.shapes = (c_size_t * MAX_DIM)()
self.ndim = 0
if not dtype:
self.data_type = LiteDataType.LITE_FLOAT
elif isinstance(dtype, LiteDataType):
self.data_type = dtype
elif type(dtype) == str:
self.data_type = _str_nptypes_to_lite_nptypes[np.dtype(dtype)]
elif isinstance(dtype, np.dtype):
ctype = np.ctypeslib.as_ctypes_type(dtype)
self.data_type = ctype_to_lite_dtypes[ctype]
elif isinstance(dtype, type):
self.data_type = _nptype_to_lite_type[dtype]
else:
raise RuntimeError("unknown data type")

def __repr__(self):
data = {
"shapes": list(self.shapes),
"ndim": self.ndim,
"data_type": _lite_type_to_nptypes[LiteDataType(self.data_type)],
}
return data.__repr__()


class _LiteTensorDesc(Structure):
"""
wrapper of the MegEngine Tensor

:is_pinned_host: when set, the storage memory of the tensor is pinned
memory; this is used to optimize H2D or D2H memory copies. If the device
or layout is not set, when copying from another device (CUDA) tensor, this
tensor will automatically be set to a pinned tensor
"""

_fields_ = [
("is_pinned_host", c_int),
("layout", LiteLayout),
("device_type", c_int),
("device_id", c_int),
]

def __init__(self):
self.layout = LiteLayout()
self.device_type = LiteDeviceType.LITE_CPU
self.is_pinned_host = False
self.device_id = 0

def __repr__(self):
data = {
"is_pinned_host": self.is_pinned_host,
"layout": LiteLayout(self.layout),
"device_type": LiteDeviceType(self.device_type.value),
"device_id": self.device_id,
}
return data.__repr__()


class _TensorAPI(_LiteCObjBase):
"""
get the api from the lib
"""

_api_ = [
("LITE_make_tensor", [_LiteTensorDesc, POINTER(_Ctensor)]),
("LITE_set_tensor_layout", [_Ctensor, LiteLayout]),
("LITE_reset_tensor_memory", [_Ctensor, c_void_p, c_size_t]),
("LITE_reset_tensor", [_Ctensor, LiteLayout, c_void_p]),
("LITE_tensor_reshape", [_Ctensor, POINTER(c_int), c_int]),
(
"LITE_tensor_slice",
[
_Ctensor,
POINTER(c_size_t),
POINTER(c_size_t),
POINTER(c_size_t),
c_size_t,
POINTER(_Ctensor),
],
),
(
"LITE_tensor_concat",
[POINTER(_Ctensor), c_int, c_int, c_int, c_int, POINTER(_Ctensor),],
),
("LITE_tensor_fill_zero", [_Ctensor]),
("LITE_tensor_copy", [_Ctensor, _Ctensor]),
("LITE_tensor_share_memory_with", [_Ctensor, _Ctensor]),
("LITE_get_tensor_memory", [_Ctensor, POINTER(c_void_p)]),
("LITE_get_tensor_total_size_in_byte", [_Ctensor, POINTER(c_size_t)]),
("LITE_get_tensor_layout", [_Ctensor, POINTER(LiteLayout)]),
("LITE_get_tensor_device_type", [_Ctensor, POINTER(c_int)]),
("LITE_get_tensor_device_id", [_Ctensor, POINTER(c_int)]),
("LITE_destroy_tensor", [_Ctensor]),
("LITE_is_pinned_host", [_Ctensor, POINTER(c_int)]),
]


class LiteTensor(object):
"""
the tensor to hold a block of data
"""

_api = _TensorAPI()._lib

def __init__(
self,
layout=None,
device_type=LiteDeviceType.LITE_CPU,
device_id=0,
is_pinned_host=False,
):
"""
create a Tensor with layout, device, is_pinned_host param
"""
self._tensor = _Ctensor()
if layout:
self._layout = layout
else:
self._layout = LiteLayout()
self._device_type = device_type
self._device_id = device_id
self._is_pinned_host = is_pinned_host

tensor_desc = _LiteTensorDesc()
tensor_desc.layout = self._layout
tensor_desc.device_type = device_type
tensor_desc.device_id = device_id
tensor_desc.is_pinned_host = is_pinned_host
self._api.LITE_make_tensor(tensor_desc, byref(self._tensor))

def __del__(self):
self._api.LITE_destroy_tensor(self._tensor)

def fill_zero(self):
"""
fill the buffer memory with zero
"""
self._api.LITE_tensor_fill_zero(self._tensor)
self.update()

def share_memory_with(self, src_tensor):
"""
share the same memory with src_tensor; the original memory of this tensor will be freed
"""
assert isinstance(src_tensor, LiteTensor)
self._api.LITE_tensor_share_memory_with(self._tensor, src_tensor._tensor)
self.update()

@property
def layout(self):
self._api.LITE_get_tensor_layout(self._tensor, byref(self._layout))
return self._layout

@layout.setter
def layout(self, layout):
assert isinstance(layout, LiteLayout)
self._layout = layout
self._api.LITE_set_tensor_layout(self._tensor, layout)

@property
def is_pinned_host(self):
"""
whether the tensor is a pinned tensor
"""
pinned = c_int()
self._api.LITE_is_pinned_host(self._tensor, byref(pinned))
self._is_pinned_host = pinned
return bool(self._is_pinned_host)

@property
def device_type(self):
"""
get device of the tensor
"""
device_type = c_int()
self._api.LITE_get_tensor_device_type(self._tensor, byref(device_type))
self._device_type = device_type
return LiteDeviceType(device_type.value)

@property
def device_id(self):
"""
get device id of the tensor
"""
device_id = c_int()
self._api.LITE_get_tensor_device_id(self._tensor, byref(device_id))
self._device_id = device_id.value
return device_id.value

@property
def is_continue(self):
"""
whether the tensor memory is contiguous
"""
is_continue = c_int()
self._api.LITE_is_memory_continue(self._tensor, byref(is_continue))
return bool(is_continue.value)

@property
def nbytes(self):
"""
get the length of the memory in bytes
"""
self.update()
length = c_size_t()
self._api.LITE_get_tensor_total_size_in_byte(self._tensor, byref(length))
return length.value

def update(self):
"""
update the members from C; this is automatically called after slice and share
"""
pinned = c_int()
self._api.LITE_is_pinned_host(self._tensor, byref(pinned))
self._is_pinned_host = pinned
device_type = c_int()
self._api.LITE_get_tensor_device_type(self._tensor, byref(device_type))
self._device_type = device_type
self._api.LITE_get_tensor_layout(self._tensor, byref(self._layout))

def copy_from(self, src_tensor):
"""
copy memory from the src_tensor
"""
assert isinstance(src_tensor, LiteTensor)
self._api.LITE_tensor_copy(self._tensor, src_tensor._tensor)
self.update()

def reshape(self, shape):
"""
reshape the tensor without changing the data, only the shape changes
:param shape: int array of the destination shape
"""
shape = list(shape)
length = len(shape)
c_shape = (c_int * length)(*shape)
self._api.LITE_tensor_reshape(self._tensor, c_shape, length)
self.update()

def slice(self, start, end, step=None):
"""
slice the tensor with the given start, end and step
:param start: slice begin index of each dim
:param end: slice end index of each dim
:param step: slice step of each dim
"""
start = list(start)
end = list(end)
length = len(start)
assert length == len(end), "slice with different length of start and end."
if step:
assert length == len(step), "slice with different length of start and step."
step = list(step)
else:
step = [1 for i in range(length)]
c_start = (c_size_t * length)(*start)
c_end = (c_size_t * length)(*end)
c_step = (c_size_t * length)(*step)
slice_tensor = LiteTensor()
self._api.LITE_tensor_slice(
self._tensor, c_start, c_end, c_step, length, byref(slice_tensor._tensor)
)
slice_tensor.update()
return slice_tensor

def get_ctypes_memory(self):
"""
get the memory of the tensor, return c_void_p of the tensor memory
"""
self.update()
mem = c_void_p()
self._api.LITE_get_tensor_memory(self._tensor, byref(mem))
return mem

def set_data_by_share(self, data, length=0, layout=None):
"""
share the data with the tensor
param data: the data to be shared with the tensor, it should be a
numpy.ndarray or ctypes data
"""
self.update()
if isinstance(data, np.ndarray):
assert (
self.is_continue
), "set_data_by_share can only apply in continue tensor."
assert (
self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU
), "set_data_by_share can only apply in cpu tensor or pinned tensor."

np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)]
c_type = np.ctypeslib.as_ctypes_type(np_type)

if self.nbytes != data.nbytes:
self.layout = LiteLayout(data.shape, ctype_to_lite_dtypes[c_type])

self._shared_data = data
data = data.ctypes.data_as(POINTER(c_type))

if layout is not None:
self.layout = layout
else:
assert length == 0 or length == self.nbytes, "the data length is not match."
self._api.LITE_reset_tensor_memory(self._tensor, data, self.nbytes)

def set_data_by_copy(self, data, data_length=0, layout=None):
"""
copy the data to the tensor
param data: the data to copy to the tensor, it should be a list,
numpy.ndarray or ctypes data with a length
"""
self.update()
if layout is not None:
self.layout = layout

assert self.is_continue, "set_data_by_copy can only apply in continue tensor."
assert (
self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU
), "set_data_by_copy can only apply in cpu tensor or pinned tensor."

np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)]
c_type = np.ctypeslib.as_ctypes_type(np_type)

tensor_memory = c_void_p()

if type(data) == list:
length = len(data)
self._api.LITE_get_tensor_memory(self._tensor, byref(tensor_memory))
tensor_length = self.nbytes
assert (
length * sizeof(c_type) <= tensor_length
), "the length of input data to set to the tensor is too large."
arr = (c_type * length)(*data)
memmove(tensor_memory, arr, sizeof(c_type) * length)

elif type(data) == np.ndarray:
if self.nbytes != data.nbytes:
self.layout = LiteLayout(data.shape, data.dtype)
arr = data.ctypes.data_as(POINTER(c_type))
self._api.LITE_get_tensor_memory(self._tensor, byref(tensor_memory))
assert self.nbytes == data.nbytes
memmove(tensor_memory, arr, self.nbytes)
else:
assert (
data_length == self.nbytes or layout is not None
), "when input data is ctypes, the length of input data or layout must set"
self._api.LITE_get_tensor_memory(self._tensor, byref(tensor_memory))
memmove(tensor_memory, data, data_length)

def to_numpy(self):
"""
copy the data of the tensor into a numpy array and return it
"""
self.update()
if self.nbytes <= 0:
return np.array([])
if self.is_continue and (
self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU
):
ptr = c_void_p()
self._api.LITE_get_tensor_memory(self._tensor, byref(ptr))

np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)]
shape = [self._layout.shapes[i] for i in range(self._layout.ndim)]
np_arr = np.zeros(shape, np_type)
if np_arr.nbytes:
memmove(np_arr.ctypes.data_as(c_void_p), ptr, np_arr.nbytes)
return np_arr
else:
tmp_tensor = LiteTensor(self.layout)
tmp_tensor.copy_from(self)
return tmp_tensor.to_numpy()

def __repr__(self):
self.update()
data = {
"layout": self._layout,
"device_type": LiteDeviceType(self._device_type.value),
"device_id": int(self.device_id),
"is_pinned_host": bool(self._is_pinned_host),
}
return data.__repr__()


def LiteTensorConcat(
tensors, dim, device_type=LiteDeviceType.LITE_DEVICE_DEFAULT, device_id=-1
):
"""
concat the input tensors along the given dim into one tensor
dim: the dim along which to concat
device_type: the result tensor device type
device_id: the result tensor device id
"""
api = _TensorAPI()._lib
length = len(tensors)
c_tensors = [t._tensor for t in tensors]
c_tensors = (_Ctensor * length)(*c_tensors)
result_tensor = LiteTensor()
api.LITE_tensor_concat(
cast(byref(c_tensors), POINTER(c_void_p)),
length,
dim,
device_type,
device_id,
byref(result_tensor._tensor),
)
result_tensor.update()
return result_tensor

+ 122
- 0
lite/pylite/megenginelite/utils.py View File

@@ -0,0 +1,122 @@
# -*- coding: utf-8 -*-
# This file is part of MegEngine, a deep learning framework developed by
# Megvii.
#
# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved.

import threading

import numpy as np

from .base import *
from .struct import *
from .tensor import *


class TensorBatchCollector:
"""
a tensor utility that collects sub-tensors into one contiguous batched tensor
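
a usage sketch (the shape, dtype and device below are example values):
collector = TensorBatchCollector([4, 8, 8], dtype=LiteDataType.LITE_INT8, device_type=LiteDeviceType.LITE_CPU)
for i in range(4):
    collector.collect(np.full([8, 8], i, "int8"))
batch_data = collector.to_numpy()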
"""

def __init__(
self,
shape,
dtype=LiteDataType.LITE_INT8,
device_type=LiteDeviceType.LITE_CUDA,
device_id=0,
is_pinned_host=False,
tensor=None,
):
self._mutex = threading.Lock()
self.dev_type = device_type
self.is_pinned_host = is_pinned_host
self.dev_id = 0
self.shape = shape
self.dtype = LiteLayout(dtype=dtype).data_type
self._free_list = list(range(self.shape[0]))

if tensor is not None:
assert (
tensor.layout.shapes[0 : tensor.layout.ndim] == shape
), "The tensor set to TensorBatchCollector is not right."
self._tensor = tensor
self.dtype = tensor.layout.data_type
self.device_type = tensor.device_type
self.device_id = tensor.device_type
else:
self._tensor = LiteTensor(
LiteLayout(shape, dtype), device_type, device_id, is_pinned_host
)

def collect_id(self, array, batch_id):
if isinstance(array, np.ndarray):
shape = array.shape
assert list(shape) == self.shape[1:]
in_dtype = ctype_to_lite_dtypes[np.ctypeslib.as_ctypes_type(array.dtype)]
assert in_dtype == self.dtype
# get the batch index
with self._mutex:
if batch_id in self._free_list:
self._free_list.remove(batch_id)
# get the subtensor
subtensor = self._tensor.slice([batch_id], [batch_id + 1])
if subtensor.device_type == LiteDeviceType.LITE_CPU:
subtensor.set_data_by_copy(array)
else:
pinned_tensor = LiteTensor(
subtensor.layout, self.dev_type, self.dev_id, True
)
pinned_tensor.set_data_by_share(array)
subtensor.copy_from(pinned_tensor)
else:
assert isinstance(array, LiteTensor)
ndim = array.layout.ndim
shape = list(array.layout.shapes)[0:ndim]
assert list(shape) == self.shape[1:]
in_dtype = array.layout.data_type
assert in_dtype == self.dtype
# get the batch index
with self._mutex:
if batch_id in self._free_list:
self._free_list.remove(batch_id)
# get the subtensor
subtensor = self._tensor.slice([batch_id], [batch_id + 1])
subtensor.copy_from(array)

return batch_id

def collect(self, array):
with self._mutex:
if len(self._free_list) == 0:
return -1
idx = self._free_list.pop(0)
return self.collect_id(array, idx)

def collect_by_ctypes(self, data, length):
"""
collect with ctypes data input
"""
with self._mutex:
if len(self._free_list) == 0:
return -1
idx = self._free_list.pop(0)
# get the subtensor
subtensor = self._tensor.slice([idx], [idx + 1])
if subtensor.device_type == LiteDeviceType.LITE_CPU:
subtensor.set_data_by_copy(data, length)
else:
pinned_tensor = LiteTensor(
subtensor.layout, self.dev_type, self.dev_id, True
)
pinned_tensor.set_data_by_share(data, length)
subtensor.copy_from(pinned_tensor)

def free(self, indexes):
with self._mutex:
self._free_list.extend(indexes)

def get(self):
return self._tensor

def to_numpy(self):
return self._tensor.to_numpy()

+ 199
- 0
lite/pylite/pylite.md View File

@@ -0,0 +1,199 @@
# PyLite

The Python API of Lite provides a more convenient and flexible way to run model inference with Lite. It supports running on various platforms: X86-CUDA, X86-CPU, Arm-CPU and Arm-CUDA.

## Installation
### Install from the whl package
The whl package of the Lite Python API is released together with megbrain releases and keeps the same version number as megbrain. The currently released Lite whl packages cover Linux, Windows and macOS, and on these platforms they can be installed directly with pip3.
```shell
python3 -m pip install --upgrade pip
python3 -m pip install megenginelite -i https://pypi.megvii-inc.com/simple
```
### Develop installation
In develop mode, you can build the Lite dynamic library (liblite.so / liblite.dll / liblite_shared.dylib) with CMake and use it for development and debugging. A pylite installed this way can only be used on the local machine and cannot be copied to other machines.
* Build liblite.so with CMake
* Clone the megbrain project to the local machine
```shell
git clone git@git-core.megvii-inc.com:brain-sdk/MegBrain.git
```
* Run the CMake build. It is the same as the megbrain CMake build and uses exactly the same parameters and macros
* Prepare the build
```shell
cd MegBrain
sh ./third_party/prepare.sh
mkdir build
cd build
```
* Build the X86-CUDA version
```shell
cmake .. -DMGE_WITH_CUDA=ON -DMGE_WITH_TEST=ON -DCMAKE_BUILD_TYPE=Release && make -j$(nproc)
```
* Build the X86 CPU-only version
```shell
cmake .. -DMGE_WITH_CUDA=OFF -DMGE_WITH_TEST=ON -DCMAKE_BUILD_TYPE=Release && make -j$(nproc)
```
* After the build finishes, liblite.so is placed under the lite directory inside the build directory
* Copy liblite.so into the Python source directory of megenginelite, and then megenginelite can be used.
```shell
# the MegBrain project directory is ${mgb_home}
cp ${mgb_hone}/build/lite/liblite.so ${mgb_home}/lite/pylite/megenginelite/
cd ${mgb_home}/lite/pylite
python3 -c "import megenginelite"
```
Then you can develop and debug the Lite Python API under the ${mgb_home}/lite/pylite directory.
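
A quick check after the copy can verify that the library is found and loaded. The following is a minimal sketch; the names come from the megenginelite package itself.
```python
import megenginelite as lite

# print the version reported by the loaded liblite
print(lite.version)
# only print warnings and errors from Lite
lite.set_log_level(lite.LiteLogLevel.WARN)
```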

## Using megenginelite in python3
The Python API of Lite is a thin wrapper of its C/C++ API, and they use the same model format. megenginelite provides two main interfaces: LiteTensor and LiteNetwork.

### LiteTensor
LiteTensor provides interfaces for users to operate on data, including:
* fill_zero: set the tensor memory to all zeros
* share_memory_with: share memory with another LiteTensor
* copy_from: copy data from another LiteTensor into its own memory
* reshape: change the shape of the LiteTensor while keeping the memory data unchanged
* slice: slice the data of the LiteTensor; the start, end and step of each dim need to be specified, as in the sketch after the examples below
* set_data_by_share: after calling, the memory of the LiteTensor is shared from the memory of the input array; the input array must be a numpy ndarray and the tensor must be on CPU
* set_data_by_copy: the LiteTensor copies data from the input data; data can be a list or a numpy ndarray, the amount of data must not exceed the capacity of the tensor, and the tensor must be on CPU
* to_numpy: copy the data of the LiteTensor into a numpy array and return it to the user; a non-contiguous LiteTensor, e.g. one produced by slice, is copied into a contiguous numpy array. This interface is mainly for debugging and has performance overhead.

#### Usage examples
* Example: setting data of a LiteTensor
```python
def test_tensor_set_data():
layout = LiteLayout([2, 16], "int8")
tensor = LiteTensor(layout)
assert tensor.nbytes == 2 * 16

data = [i for i in range(32)]
tensor.set_data_by_copy(data)
real_data = tensor.to_numpy()
for i in range(32):
assert real_data[i // 16][i % 16] == i

arr = np.ones([2, 16], "int8")
tensor.set_data_by_copy(arr)
real_data = tensor.to_numpy()
for i in range(32):
assert real_data[i // 16][i % 16] == 1

for i in range(32):
arr[i // 16][i % 16] = i
tensor.set_data_by_share(arr)
real_data = tensor.to_numpy()
for i in range(32):
assert real_data[i // 16][i % 16] == i

arr[0][8] = 100
arr[1][3] = 20
real_data = tensor.to_numpy()
assert real_data[0][8] == 100
assert real_data[1][3] == 20
```
* Example: sharing tensor memory
```python
def test_tensor_share_memory_with():
layout = LiteLayout([4, 32], "int16")
tensor = LiteTensor(layout)
assert tensor.nbytes == 4 * 32 * 2

arr = np.ones([4, 32], "int16")
for i in range(128):
arr[i // 32][i % 32] = i
tensor.set_data_by_share(arr)
real_data = tensor.to_numpy()
for i in range(128):
assert real_data[i // 32][i % 32] == i

tensor2 = LiteTensor(layout)
tensor2.share_memory_with(tensor)
real_data = tensor.to_numpy()
real_data2 = tensor2.to_numpy()
for i in range(128):
assert real_data[i // 32][i % 32] == i
assert real_data2[i // 32][i % 32] == i

arr[1][18] = 5
arr[3][7] = 345
real_data = tensor2.to_numpy()
assert real_data[1][18] == 5
assert real_data[3][7] == 345
```
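* Example: LiteTensor reshape and slice. The following is a minimal sketch of the interfaces listed above; the shapes and values are only illustrative.
```python
def test_tensor_reshape_slice():
    layout = LiteLayout([4, 8], "int32")
    tensor = LiteTensor(layout)
    tensor.set_data_by_copy(list(range(32)))

    # reshape keeps the data and only changes the shape
    tensor.reshape([2, 16])
    assert list(tensor.layout.shapes[0:2]) == [2, 16]

    # take the second row; to_numpy copies it into a contiguous numpy array
    sub = tensor.slice([1, 0], [2, 16])
    data = sub.to_numpy()
    assert data.shape == (1, 16)
    assert data[0][0] == 16
```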
For more usage, refer to test/test_tensor.py in pylite.
### LiteNetwork
LiteNetwork mainly provides model loading and running functionality for users. For the models used, see the model section in the Lite README.
* Example: basic model loading and running on CPU
```python
def test_network_basic():
source_dir = os.getenv("LITE_TEST_RESOUCE")
input_data_path = os.path.join(source_dir, "input_data.npy")
# read input to input_data
input_data = np.load(input_data_path)
model_path = os.path.join(source_dir, "shufflenet.mge")

network = LiteNetwork()
network.load(model_path)

input_name = network.get_input_name(0)
input_tensor = network.get_io_tensor(input_name)
output_name = network.get_output_name(0)
output_tensor = network.get_io_tensor(output_name)

assert input_tensor.layout.shapes[0] == 1
assert input_tensor.layout.shapes[1] == 3
assert input_tensor.layout.shapes[2] == 224
assert input_tensor.layout.shapes[3] == 224
assert input_tensor.layout.data_type == LiteDataType.LITE_FLOAT
assert input_tensor.layout.ndim == 4

# copy input data to input_tensor of the network
input_tensor.set_data_by_copy(input_data)
for i in range(3):
network.forward()
network.wait()

output_data = output_tensor.to_numpy()
print('shufflenet output max={}, sum={}'.format(output_data.max(), output_data.sum()))
```
* Example: using device memory as model input on CUDA; the config and IO information need to be configured when constructing the network
```python
def test_network_device_IO():
source_dir = os.getenv("LITE_TEST_RESOUCE")
input_data_path = os.path.join(source_dir, "input_data.npy")
model_path = os.path.join(source_dir, "shufflenet.mge")
# read input to input_data
input_data = np.load(input_data_path)
input_layout = LiteLayout([1, 3, 224, 224])
host_input_data = LiteTensor(layout=input_layout)
host_input_data.set_data_by_share(input_data)
dev_input_data = LiteTensor(layout=input_layout, device_type=LiteDeviceType.LITE_CUDA)
dev_input_data.copy_from(host_input_data)

# construct LiteOption
options = LiteOptions()
options.weight_preprocess = 1
options.var_sanity_check_first_run = 0
net_config = LiteConfig(device_type=LiteDeviceType.LITE_CUDA, option=options)

# construct LiteIO, is_host=False means the input tensor will use device memory
input_io = LiteIO("data", is_host=False)
ios = LiteNetworkIO()
ios.add_input(input_io)

network = LiteNetwork(config=net_config, io=ios)
network.load(model_path)

input_name = network.get_input_name(0)
dev_input_tensor = network.get_io_tensor(input_name)
output_name = network.get_output_name(0)
output_tensor = network.get_io_tensor(output_name)

# copy input data to input_tensor of the network
dev_input_tensor.share_memory_with(dev_input_data)
for i in range(3):
network.forward()
network.wait()

output_data = output_tensor.to_numpy()
print('shufflenet output max={}, sum={}'.format(output_data.max(), output_data.sum()))
```
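* Example: CPU multi-threading and fastrun algorithm selection. The following is a sketch; the cache file name is only an illustration, and the resource paths follow the earlier examples.
```python
def test_network_cpu_tuning():
    source_dir = os.getenv("LITE_TEST_RESOUCE")
    input_data = np.load(os.path.join(source_dir, "input_data.npy"))
    model_path = os.path.join(source_dir, "shufflenet.mge")

    network = LiteNetwork()
    # the thread number must be configured before the model is loaded
    network.threads_number = 4
    network.load(model_path)

    # persist fastrun results so that later runs can skip profiling
    LiteGlobal.set_persistent_cache("./algo_cache.bin")
    strategy = (
        LiteAlgoSelectStrategy.LITE_ALGO_PROFILE
        | LiteAlgoSelectStrategy.LITE_ALGO_REPRODUCIBLE
    )
    network.set_network_algo_policy(strategy)
    network.set_network_algo_workspace_limit(1 << 30)

    input_tensor = network.get_io_tensor(network.get_input_name(0))
    input_tensor.set_data_by_copy(input_data)
    for i in range(3):
        network.forward()
        network.wait()

    output_tensor = network.get_io_tensor(network.get_output_name(0))
    print('shufflenet output sum={}'.format(output_tensor.to_numpy().sum()))
    LiteGlobal.dump_persistent_cache("./algo_cache.bin")
```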
For more usage, refer to test/test_network.py and test/test_network_cuda.py in pylite.

+ 1
- 0
lite/pylite/requires.txt View File

@@ -0,0 +1 @@
numpy>=1.18

+ 20
- 0
lite/pylite/scripts/format.sh View File

@@ -0,0 +1,20 @@
#!/usr/bin/env bash
set -e
cd $(dirname $0)/..

ISORT_ARG=""
BLACK_ARG=""

while getopts 'd' OPT; do
case $OPT in
d)
ISORT_ARG="--diff --check-only"
BLACK_ARG="--diff --check"
;;
?)
echo "Usage: `basename $0` [-d]"
esac
done

isort $ISORT_ARG -j $(nproc) -rc megenginelite test
black $BLACK_ARG --target-version=py35 -- megenginelite test

+ 127
- 0
lite/pylite/setup.py View File

@@ -0,0 +1,127 @@
# -*- coding: utf-8 -*-
# This file is part of MegEngine, a deep learning framework developed by
# Megvii.
#
# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved.

import os
import re
import pathlib
import platform
from distutils.file_util import copy_file
from setuptools import setup, find_packages, Extension
from setuptools.command.build_ext import build_ext as _build_ext

class PrecompiledExtesion(Extension):
def __init__(self, name):
super().__init__(name, sources=[])

class build_ext(_build_ext):

def build_extension(self, ext):
if not isinstance(ext, PrecompiledExtesion):
return super().build_extension(ext)

if not self.inplace:
fullpath = self.get_ext_fullpath(ext.name)
extdir = pathlib.Path(fullpath)
extdir.parent.mkdir(parents=True, exist_ok=True)

modpath = self.get_ext_fullname(ext.name).split('.')
if platform.system() == 'Windows':
modpath[-1] += '.dll'
elif platform.system() == 'Darwin':
modpath[-1] += '.dylib'
else:
modpath[-1] += '.so'
modpath = str(pathlib.Path(*modpath).resolve())

copy_file(modpath, fullpath, verbose=self.verbose, dry_run=self.dry_run)

v = {}
with open("megenginelite/version.py") as fp:
exec(fp.read(), v)
__version__ = v['__version__']

# package name for the wheel (assumed default)
package_name = 'megenginelite'
email = 'megengine@megvii.com'
# https://www.python.org/dev/peps/pep-0440
# Public version identifiers: [N!]N(.N)*[{a|b|rc}N][.postN][.devN]
# Local version identifiers: <public version identifier>[+<local version label>]
# PUBLIC_VERSION_POSTFIX is used to carry rc or dev info
public_version_postfix = os.environ.get('PUBLIC_VERSION_POSTFIX')
if public_version_postfix:
__version__ = '{}{}'.format(__version__, public_version_postfix)

local_version = []
strip_sdk_info = os.environ.get('STRIP_SDK_INFO', 'False').lower()
sdk_name = os.environ.get('SDK_NAME', 'cpu')
if 'true' == strip_sdk_info:
print('wheel version strip sdk info')
else:
local_version.append(sdk_name)
local_postfix = os.environ.get('LOCAL_VERSION')
if local_postfix:
local_version.append(local_postfix)
if len(local_version):
__version__ = '{}+{}'.format(__version__, '.'.join(local_version))

packages = find_packages()
megenginelite_data = [
str(f.relative_to('megenginelite'))
for f in pathlib.Path('megenginelite').glob('**/*')
]

if platform.system() == 'Windows':
megenginelite_data.remove('libs\\liblite_shared.dll')
elif platform.system() == 'Darwin':
megenginelite_data.remove('libs/liblite_shared.dylib')
else:
megenginelite_data.remove('libs/liblite_shared.so')

with open('requires.txt') as f:
requires = f.read().splitlines()

prebuild_modules=[PrecompiledExtesion('megenginelite.libs.liblite_shared')]
setup_kwargs = dict(
name=package_name,
version=__version__,
description='Inference Framework for MegEngine',
author='Megvii Engine Team',
author_email=email,
packages=packages,
package_data={
'megenginelite': megenginelite_data,
},
ext_modules=prebuild_modules,
install_requires=requires,
cmdclass={'build_ext': build_ext},
)
setup_kwargs.update(dict(
classifiers=[
'Development Status :: 3 - Alpha',
'Intended Audience :: Developers',
'Intended Audience :: Education',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: Apache Software License',
'Programming Language :: C++',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Topic :: Scientific/Engineering',
'Topic :: Scientific/Engineering :: Mathematics',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Software Development',
'Topic :: Software Development :: Libraries',
'Topic :: Software Development :: Libraries :: Python Modules',
],
license='Apache 2.0',
keywords='megengine deep learning',
data_files = [("megengine", [
"../LICENSE",
"../ACKNOWLEDGMENTS",
])]
))

setup(**setup_kwargs)

+ 92
- 0
lite/pylite/test/test_global.py View File

@@ -0,0 +1,92 @@
# -*- coding: utf-8 -*-
# This file is part of MegEngine, a deep learning framework developed by
# Megvii.
#
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.

import os
import unittest

import numpy as np

from megenginelite import *

set_log_level(2)


class TestShuffleNet(unittest.TestCase):
source_dir = os.getenv("LITE_TEST_RESOUCE")
input_data_path = os.path.join(source_dir, "input_data.npy")
correct_data_path = os.path.join(source_dir, "output_data.npy")
correct_data = np.load(correct_data_path).flatten()
input_data = np.load(input_data_path)

def check_correct(self, out_data, error=1e-4):
out_data = out_data.flatten()
assert np.isfinite(out_data.sum())
assert self.correct_data.size == out_data.size
for i in range(out_data.size):
assert abs(out_data[i] - self.correct_data[i]) < error

def do_forward(self, network, times=3):
input_name = network.get_input_name(0)
input_tensor = network.get_io_tensor(input_name)
output_name = network.get_output_name(0)
output_tensor = network.get_io_tensor(output_name)

input_tensor.set_data_by_copy(self.input_data)
for i in range(times):
network.forward()
network.wait()

output_data = output_tensor.to_numpy()
self.check_correct(output_data)


class TestGlobal(TestShuffleNet):
def test_device_count(self):
LiteGlobal.try_coalesce_all_free_memory()
count = LiteGlobal.get_device_count(LiteDeviceType.LITE_CPU)
assert count > 0

def test_register_decryption_method(self):
@decryption_func
def function(in_arr, key_arr, out_arr):
if not out_arr:
return in_arr.size
else:
for i in range(in_arr.size):
out_arr[i] = in_arr[i] ^ key_arr[0] ^ key_arr[0]
return out_arr.size

LiteGlobal.register_decryption_and_key("just_for_test", function, [15])
config = LiteConfig()
config.bare_model_cryption_name = "just_for_test".encode("utf-8")

network = LiteNetwork()
model_path = os.path.join(self.source_dir, "shufflenet.mge")
network.load(model_path)

self.do_forward(network)

def test_update_decryption_key(self):
wrong_key = [0] * 32
LiteGlobal.update_decryption_key("AES_default", wrong_key)

with self.assertRaises(RuntimeError):
config = LiteConfig()
config.bare_model_cryption_name = "AES_default".encode("utf-8")
network = LiteNetwork(config)
model_path = os.path.join(self.source_dir, "shufflenet_crypt_aes.mge")
network.load(model_path)

right_key = [i for i in range(32)]
LiteGlobal.update_decryption_key("AES_default", right_key)

config = LiteConfig()
config.bare_model_cryption_name = "AES_default".encode("utf-8")
network = LiteNetwork(config)
model_path = os.path.join(self.source_dir, "shufflenet_crypt_aes.mge")
network.load(model_path)

self.do_forward(network)

+ 405
- 0
lite/pylite/test/test_network.py View File

@@ -0,0 +1,405 @@
# -*- coding: utf-8 -*-
# This file is part of MegEngine, a deep learning framework developed by
# Megvii.
#
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.

import os
import unittest

import numpy as np

from megenginelite import *

set_log_level(2)


def test_version():
print("Lite verson: {}".format(version))


def test_network_io():
input_io1 = LiteIO("data1", is_host=False, io_type=LiteIOType.LITE_IO_VALUE)
input_io2 = LiteIO(
"data2",
is_host=True,
io_type=LiteIOType.LITE_IO_SHAPE,
layout=LiteLayout([2, 4, 4]),
)
io = LiteNetworkIO()
io.add_input(input_io1)
io.add_input(input_io2)

output_io1 = LiteIO("out1", is_host=False)
output_io2 = LiteIO("out2", is_host=True, layout=LiteLayout([1, 1000]))

io.add_output(output_io1)
io.add_output(output_io2)

assert len(io.inputs) == 2
assert len(io.outputs) == 2

assert io.inputs[0] == input_io1
assert io.outputs[0] == output_io1

c_io = io._create_network_io()

assert c_io.input_size == 2
assert c_io.output_size == 2


class TestShuffleNet(unittest.TestCase):
source_dir = os.getenv("LITE_TEST_RESOUCE")
input_data_path = os.path.join(source_dir, "input_data.npy")
correct_data_path = os.path.join(source_dir, "output_data.npy")
model_path = os.path.join(source_dir, "shufflenet.mge")
correct_data = np.load(correct_data_path).flatten()
input_data = np.load(input_data_path)

def check_correct(self, out_data, error=1e-4):
out_data = out_data.flatten()
assert np.isfinite(out_data.sum())
assert self.correct_data.size == out_data.size
for i in range(out_data.size):
assert abs(out_data[i] - self.correct_data[i]) < error

def do_forward(self, network, times=3):
input_name = network.get_input_name(0)
input_tensor = network.get_io_tensor(input_name)
output_name = network.get_output_name(0)
output_tensor = network.get_io_tensor(output_name)

input_tensor.set_data_by_copy(self.input_data)
for i in range(times):
network.forward()
network.wait()

output_data = output_tensor.to_numpy()
self.check_correct(output_data)


class TestNetwork(TestShuffleNet):
def test_decryption(self):
model_path = os.path.join(self.source_dir, "shufflenet_crypt_aes.mge")
config = LiteConfig()
config.bare_model_cryption_name = "AES_default".encode("utf-8")
network = LiteNetwork(config)
network.load(model_path)
self.do_forward(network)

def test_pack_model(self):
model_path = os.path.join(self.source_dir, "test_packed_model_rc4.lite")
network = LiteNetwork()
network.load(model_path)
self.do_forward(network)

def test_network_basic(self):
network = LiteNetwork()
network.load(self.model_path)

input_name = network.get_input_name(0)
input_tensor = network.get_io_tensor(input_name)
output_name = network.get_output_name(0)
output_tensor = network.get_io_tensor(output_name)

assert input_tensor.layout.shapes[0] == 1
assert input_tensor.layout.shapes[1] == 3
assert input_tensor.layout.shapes[2] == 224
assert input_tensor.layout.shapes[3] == 224
assert input_tensor.layout.data_type == LiteDataType.LITE_FLOAT
assert input_tensor.layout.ndim == 4

self.do_forward(network)

def test_network_shared_data(self):
network = LiteNetwork()
network.load(self.model_path)

input_name = network.get_input_name(0)
input_tensor = network.get_io_tensor(input_name)
output_name = network.get_output_name(0)
output_tensor = network.get_io_tensor(output_name)

input_tensor.set_data_by_share(self.input_data)
for i in range(3):
network.forward()
network.wait()

output_data = output_tensor.to_numpy()
self.check_correct(output_data)

def test_network_get_name(self):
network = LiteNetwork()
network.load(self.model_path)

input_names = network.get_all_input_name()
assert input_names[0] == "data"
output_names = network.get_all_output_name()
assert output_names[0] == network.get_output_name(0)

self.do_forward(network)

def test_network_set_device_id(self):
network = LiteNetwork()
assert network.device_id == 0

network.device_id = 1
network.load(self.model_path)
assert network.device_id == 1

with self.assertRaises(RuntimeError):
network.device_id = 1

self.do_forward(network)

def test_network_set_stream_id(self):
network = LiteNetwork()
assert network.stream_id == 0

network.stream_id = 1
network.load(self.model_path)
assert network.stream_id == 1

with self.assertRaises(RuntimeError):
network.stream_id = 1

self.do_forward(network)

def test_network_set_thread_number(self):
network = LiteNetwork()
assert network.threads_number == 1

network.threads_number = 2
network.load(self.model_path)
assert network.threads_number == 2

with self.assertRaises(RuntimeError):
network.threads_number = 2

self.do_forward(network)

def test_network_cpu_inplace(self):
network = LiteNetwork()
assert network.is_cpu_inplace_mode() == False

network.enable_cpu_inplace_mode()
network.load(self.model_path)
assert network.is_cpu_inplace_mode() == True

with self.assertRaises(RuntimeError):
network.enable_cpu_inplace_mode()

self.do_forward(network)

def test_network_option(self):
option = LiteOptions()
option.weight_preprocess = 1
option.var_sanity_check_first_run = 0

config = LiteConfig(option=option)
network = LiteNetwork(config=config)
network.load(self.model_path)

self.do_forward(network)

def test_network_reset_io(self):
option = LiteOptions()
option.var_sanity_check_first_run = 0
config = LiteConfig(option=option)

input_io = LiteIO("data")
ios = LiteNetworkIO()
ios.add_input(input_io)
network = LiteNetwork(config=config, io=ios)
network.load(self.model_path)

input_tensor = network.get_io_tensor("data")
assert input_tensor.device_type == LiteDeviceType.LITE_CPU

self.do_forward(network)

def test_network_by_share(self):
network = LiteNetwork()
network.load(self.model_path)

input_name = network.get_input_name(0)
input_tensor = network.get_io_tensor(input_name)
output_name = network.get_output_name(0)
output_tensor = network.get_io_tensor(output_name)

assert input_tensor.device_type == LiteDeviceType.LITE_CPU
layout = LiteLayout(self.input_data.shape, self.input_data.dtype)
tensor_tmp = LiteTensor(layout=layout)
tensor_tmp.set_data_by_share(self.input_data)
input_tensor.share_memory_with(tensor_tmp)

for i in range(3):
network.forward()
network.wait()

output_data = output_tensor.to_numpy()
self.check_correct(output_data)

def test_network_share_weights(self):
option = LiteOptions()
option.var_sanity_check_first_run = 0
config = LiteConfig(option=option)

src_network = LiteNetwork(config=config)
src_network.load(self.model_path)

new_network = LiteNetwork()
new_network.enable_cpu_inplace_mode()
new_network.share_weights_with(src_network)

self.do_forward(src_network)
self.do_forward(new_network)

def test_network_share_runtime_memory(self):
option = LiteOptions()
option.var_sanity_check_first_run = 0
config = LiteConfig(option=option)

src_network = LiteNetwork(config=config)
src_network.load(self.model_path)

new_network = LiteNetwork()
new_network.enable_cpu_inplace_mode()
new_network.share_runtime_memroy(src_network)
new_network.load(self.model_path)

self.do_forward(src_network)
self.do_forward(new_network)

# def test_network_async(self):
# count = 0
# finished = False
#
# def async_callback():
# nonlocal finished
# finished = True
# return 0
#
# option = LiteOptions()
# option.var_sanity_check_first_run = 0
# config = LiteConfig(option=option)
#
# network = LiteNetwork(config=config)
# network.load(self.model_path)
#
# network.async_with_callback(async_callback)
#
# input_tensor = network.get_io_tensor(network.get_input_name(0))
# output_tensor = network.get_io_tensor(network.get_output_name(0))
#
# input_tensor.set_data_by_share(self.input_data)
# network.forward()
#
# while not finished:
# count += 1
#
# assert count > 0
# output_data = output_tensor.to_numpy()
# self.check_correct(output_data)
#
# def test_network_start_callback(self):
# network = LiteNetwork()
# network.load(self.model_path)
# start_checked = False
#
# @start_finish_callback
# def start_callback(ios):
# nonlocal start_checked
# start_checked = True
# assert len(ios) == 1
# for key in ios:
# io = key
# data = ios[key].to_numpy().flatten()
# input_data = self.input_data.flatten()
# assert data.size == input_data.size
# assert io.name.decode("utf-8") == "data"
# for i in range(data.size):
# assert data[i] == input_data[i]
# return 0
#
# network.set_start_callback(start_callback)
# self.do_forward(network, 1)
# assert start_checked == True
#
# def test_network_finish_callback(self):
# network = LiteNetwork()
# network.load(self.model_path)
# finish_checked = False
#
# @start_finish_callback
# def finish_callback(ios):
# nonlocal finish_checked
# finish_checked = True
# assert len(ios) == 1
# for key in ios:
# io = key
# data = ios[key].to_numpy().flatten()
# output_data = self.correct_data.flatten()
# assert data.size == output_data.size
# for i in range(data.size):
# assert data[i] == output_data[i]
# return 0
#
# network.set_finish_callback(finish_callback)
# self.do_forward(network, 1)
# assert finish_checked == True

def test_enable_profile(self):
network = LiteNetwork()
network.load(self.model_path)
network.enable_profile_performance("./profile.json")

self.do_forward(network)

fi = open("./profile.json", "r")
fi.close()
os.remove("./profile.json")

def test_io_txt_dump(self):
network = LiteNetwork()
network.load(self.model_path)
network.io_txt_dump("./io_txt.txt")
self.do_forward(network)

def test_io_bin_dump(self):
import shutil

folder = "./out"
network = LiteNetwork()
network.load(self.model_path)
if not os.path.exists(folder):
os.mkdir(folder)
network.io_bin_dump(folder)
self.do_forward(network)
shutil.rmtree(folder)

def test_algo_workspace_limit(self):
network = LiteNetwork()
network.load(self.model_path)
print("modify the workspace limit.")
network.set_network_algo_workspace_limit(10000)
self.do_forward(network)

def test_network_algo_policy(self):
network = LiteNetwork()
network.load(self.model_path)
network.set_network_algo_policy(
LiteAlgoSelectStrategy.LITE_ALGO_PROFILE
| LiteAlgoSelectStrategy.LITE_ALGO_REPRODUCIBLE
)
self.do_forward(network)

def test_network_algo_policy_ignore_batch(self):
network = LiteNetwork()
network.load(self.model_path)
network.set_network_algo_policy(
LiteAlgoSelectStrategy.LITE_ALGO_PROFILE,
shared_batch_size=1,
binary_equal_between_batch=True,
)
self.do_forward(network)

+ 220
- 0
lite/pylite/test/test_network_cuda.py View File

@@ -0,0 +1,220 @@
# -*- coding: utf-8 -*-
# This file is part of MegEngine, a deep learning framework developed by
# Megvii.
#
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.

import functools
import os
import unittest

import numpy as np

from megenginelite import *

set_log_level(2)


def require_cuda(ngpu=1):
"""a decorator that disables a testcase if cuda is not enabled"""

def dector(func):
@functools.wraps(func)
def wrapped(*args, **kwargs):
if LiteGlobal.get_device_count(LiteDeviceType.LITE_CUDA) >= ngpu:
return func(*args, **kwargs)

return wrapped

return dector


class TestShuffleNetCuda(unittest.TestCase):
source_dir = os.getenv("LITE_TEST_RESOUCE")
input_data_path = os.path.join(source_dir, "input_data.npy")
correct_data_path = os.path.join(source_dir, "output_data.npy")
model_path = os.path.join(source_dir, "shufflenet.mge")
correct_data = np.load(correct_data_path).flatten()
input_data = np.load(input_data_path)

def check_correct(self, out_data, error=1e-4):
out_data = out_data.flatten()
assert np.isfinite(out_data.sum())
assert self.correct_data.size == out_data.size
for i in range(out_data.size):
assert abs(out_data[i] - self.correct_data[i]) < error

def do_forward(self, network, times=3):
input_name = network.get_input_name(0)
input_tensor = network.get_io_tensor(input_name)
output_name = network.get_output_name(0)
output_tensor = network.get_io_tensor(output_name)

input_tensor.set_data_by_copy(self.input_data)
for i in range(times):
network.forward()
network.wait()

output_data = output_tensor.to_numpy()
self.check_correct(output_data)


class TestNetwork(TestShuffleNetCuda):
@require_cuda()
def test_network_basic(self):
config = LiteConfig()
config.device_type = LiteDeviceType.LITE_CUDA
network = LiteNetwork(config)
network.load(self.model_path)

input_name = network.get_input_name(0)
input_tensor = network.get_io_tensor(input_name)
output_name = network.get_output_name(0)
output_tensor = network.get_io_tensor(output_name)

assert input_tensor.layout.shapes[0] == 1
assert input_tensor.layout.shapes[1] == 3
assert input_tensor.layout.shapes[2] == 224
assert input_tensor.layout.shapes[3] == 224
assert input_tensor.layout.data_type == LiteDataType.LITE_FLOAT
assert input_tensor.layout.ndim == 4

self.do_forward(network)

@require_cuda()
def test_network_shared_data(self):
config = LiteConfig()
config.device_type = LiteDeviceType.LITE_CUDA
network = LiteNetwork(config)
network.load(self.model_path)

input_name = network.get_input_name(0)
input_tensor = network.get_io_tensor(input_name)
output_name = network.get_output_name(0)
output_tensor = network.get_io_tensor(output_name)

input_tensor.set_data_by_share(self.input_data)
for i in range(3):
network.forward()
network.wait()

output_data = output_tensor.to_numpy()
self.check_correct(output_data)

@require_cuda(2)
def test_network_set_device_id(self):
config = LiteConfig()
config.device_type = LiteDeviceType.LITE_CUDA
network = LiteNetwork(config)
assert network.device_id == 0

network.device_id = 1
network.load(self.model_path)
assert network.device_id == 1

with self.assertRaises(RuntimeError):
network.device_id = 1

self.do_forward(network)

@require_cuda()
def test_network_option(self):
option = LiteOptions()
option.weight_preprocess = 1
option.var_sanity_check_first_run = 0

config = LiteConfig(option=option)
config.device_type = LiteDeviceType.LITE_CUDA
network = LiteNetwork(config=config)
network.load(self.model_path)

self.do_forward(network)

@require_cuda()
def test_network_reset_io(self):
option = LiteOptions()
option.var_sanity_check_first_run = 0
config = LiteConfig(option=option)

config.device_type = LiteDeviceType.LITE_CUDA
input_io = LiteIO("data")
ios = LiteNetworkIO()
ios.add_input(input_io)
network = LiteNetwork(config=config, io=ios)
network.load(self.model_path)

input_tensor = network.get_io_tensor("data")
assert input_tensor.device_type == LiteDeviceType.LITE_CPU

self.do_forward(network)

@require_cuda()
def test_network_share_weights(self):
option = LiteOptions()
option.var_sanity_check_first_run = 0
config = LiteConfig(option=option)
config.device_type = LiteDeviceType.LITE_CUDA

src_network = LiteNetwork(config=config)
src_network.load(self.model_path)

new_network = LiteNetwork()
new_network.enable_cpu_inplace_mode()
new_network.share_weights_with(src_network)

self.do_forward(src_network)
self.do_forward(new_network)

@require_cuda()
def test_network_share_runtime_memory(self):
option = LiteOptions()
option.var_sanity_check_first_run = 0
config = LiteConfig(option=option)
config.device_type = LiteDeviceType.LITE_CUDA

src_network = LiteNetwork(config=config)
src_network.load(self.model_path)

new_network = LiteNetwork()
new_network.enable_cpu_inplace_mode()
new_network.share_runtime_memroy(src_network)
new_network.load(self.model_path)

self.do_forward(src_network)
self.do_forward(new_network)

@require_cuda()
def test_enable_profile(self):
config = LiteConfig()
config.device_type = LiteDeviceType.LITE_CUDA
network = LiteNetwork(config)
network.load(self.model_path)
network.enable_profile_performance("./profile.json")

self.do_forward(network)

fi = open("./profile.json", "r")
fi.close()
os.remove("./profile.json")

@require_cuda()
def test_algo_workspace_limit(self):
config = LiteConfig()
config.device_type = LiteDeviceType.LITE_CUDA
network = LiteNetwork(config)
network.load(self.model_path)
print("modify the workspace limit.")
network.set_network_algo_workspace_limit(10000)
self.do_forward(network)

@require_cuda()
def test_network_algo_policy(self):
config = LiteConfig()
config.device_type = LiteDeviceType.LITE_CUDA
network = LiteNetwork(config)
network.load(self.model_path)
network.set_network_algo_policy(
LiteAlgoSelectStrategy.LITE_ALGO_PROFILE
| LiteAlgoSelectStrategy.LITE_ALGO_REPRODUCIBLE
)
self.do_forward(network)

+ 291
- 0
lite/pylite/test/test_tensor.py View File

@@ -0,0 +1,291 @@
# -*- coding: utf-8 -*-
# This file is part of MegEngine, a deep learning framework developed by
# Megvii.
#
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.

import functools

import numpy as np

from megenginelite import *


def require_cuda(func):
"""a decorator that disables a testcase if cuda is not enabled"""

@functools.wraps(func)
def wrapped(*args, **kwargs):
if LiteGlobal.get_device_count(LiteDeviceType.LITE_CUDA):
return func(*args, **kwargs)

return wrapped


def test_tensor_make():
empty_layout = LiteLayout()
assert empty_layout.ndim == 0
assert empty_layout.data_type == int(LiteDataType.LITE_FLOAT)

empty_tensor = LiteTensor()
assert empty_tensor.layout.ndim == empty_layout.ndim
assert empty_tensor.layout.data_type == empty_layout.data_type

layout = LiteLayout([4, 16])
layout = LiteLayout(dtype="float32")
layout = LiteLayout([4, 16], "float32")
layout = LiteLayout([4, 16], "float16")
layout = LiteLayout([4, 16], np.float32)
layout = LiteLayout([4, 16], np.int8)
layout = LiteLayout([4, 16], LiteDataType.LITE_FLOAT)

tensor = LiteTensor(layout)
tensor = LiteTensor(layout, LiteDeviceType.LITE_CPU)
assert tensor.layout == layout
assert tensor.device_type == LiteDeviceType.LITE_CPU
assert tensor.is_continue == True
assert tensor.is_pinned_host == False
assert tensor.nbytes == 4 * 16 * 4
assert tensor.device_id == 0

tensor = LiteTensor(layout, device_id=1)
assert tensor.device_id == 1


def test_tensor_set_data():
layout = LiteLayout([2, 16], "int8")
tensor = LiteTensor(layout)
assert tensor.nbytes == 2 * 16

data = [i for i in range(32)]
tensor.set_data_by_copy(data)
real_data = tensor.to_numpy()
for i in range(32):
assert real_data[i // 16][i % 16] == i

arr = np.ones([2, 16], "int8")
tensor.set_data_by_copy(arr)
real_data = tensor.to_numpy()
for i in range(32):
assert real_data[i // 16][i % 16] == 1

for i in range(32):
arr[i // 16][i % 16] = i
tensor.set_data_by_share(arr)
real_data = tensor.to_numpy()
for i in range(32):
assert real_data[i // 16][i % 16] == i

arr[0][8] = 100
arr[1][3] = 20
real_data = tensor.to_numpy()
assert real_data[0][8] == 100
assert real_data[1][3] == 20


def test_fill_zero():
layout = LiteLayout([4, 8], "int16")
tensor1 = LiteTensor(layout)
assert tensor1.nbytes == 4 * 8 * 2

tensor1.set_data_by_copy([i for i in range(32)])
real_data = tensor1.to_numpy()
for i in range(32):
assert real_data[i // 8][i % 8] == i

tensor1.fill_zero()
real_data = tensor1.to_numpy()
for i in range(32):
assert real_data[i // 8][i % 8] == 0


def test_copy_from():
layout = LiteLayout([4, 8], "int16")
tensor1 = LiteTensor(layout)
tensor2 = LiteTensor(layout)
assert tensor1.nbytes == 4 * 8 * 2
assert tensor2.nbytes == 4 * 8 * 2

tensor1.set_data_by_copy([i for i in range(32)])
tensor2.copy_from(tensor1)
real_data = tensor2.to_numpy()
for i in range(32):
assert real_data[i // 8][i % 8] == i

tensor1.set_data_by_copy([i + 5 for i in range(32)])
tensor2.copy_from(tensor1)
real_data = tensor2.to_numpy()
for i in range(32):
assert real_data[i // 8][i % 8] == i + 5


def test_reshape():
layout = LiteLayout([4, 8], "int16")
tensor1 = LiteTensor(layout)
assert tensor1.nbytes == 4 * 8 * 2

tensor1.set_data_by_copy([i for i in range(32)])
real_data = tensor1.to_numpy()
for i in range(32):
assert real_data[i // 8][i % 8] == i

tensor1.reshape([8, 4])
real_data = tensor1.to_numpy()
for i in range(32):
assert real_data[i // 4][i % 4] == i


def test_slice():
layout = LiteLayout([4, 8], "int32")
tensor1 = LiteTensor(layout)
assert tensor1.nbytes == 4 * 8 * 4

tensor1.set_data_by_copy([i for i in range(32)])
real_data_org = tensor1.to_numpy()
for i in range(32):
assert real_data_org[i // 8][i % 8] == i

tensor2 = tensor1.slice([1, 4], [3, 8])
assert tensor2.layout.shapes[0] == 2
assert tensor2.layout.shapes[1] == 4
assert tensor2.is_continue == False

real_data = tensor2.to_numpy()
for i in range(8):
row = i // 4
col = i % 4
assert real_data[row][col] == real_data_org[row + 1][col + 4]


def test_tensor_share_memory():
layout = LiteLayout([4, 8], "int16")
tensor1 = LiteTensor(layout)
tensor2 = LiteTensor(layout)
assert tensor1.nbytes == 4 * 8 * 2
assert tensor2.nbytes == 4 * 8 * 2

tensor1.set_data_by_copy([i for i in range(32)])
tensor2.share_memory_with(tensor1)
real_data = tensor2.to_numpy()
for i in range(32):
assert real_data[i // 8][i % 8] == i

tensor1.set_data_by_copy([i + 5 for i in range(32)])
real_data = tensor2.to_numpy()
for i in range(32):
assert real_data[i // 8][i % 8] == i + 5


def test_tensor_share_ctype_memory():
layout = LiteLayout([4, 8], "int16")
tensor1 = LiteTensor(layout)
assert tensor1.nbytes == 4 * 8 * 2

arr = np.ones([4, 8], "int16")
for i in range(32):
arr[i // 8][i % 8] = i
tensor1.set_data_by_share(arr.ctypes.data, 4 * 8 * 2)
real_data = tensor1.to_numpy()
for i in range(32):
assert real_data[i // 8][i % 8] == i


@require_cuda
def test_tensor_share_ctype_memory_device():
layout = LiteLayout([4, 8], "int16")
tensor_cpu = LiteTensor(
layout=layout, device_type=LiteDeviceType.LITE_CUDA, is_pinned_host=True
)
tensor_cuda1 = LiteTensor(layout=layout, device_type=LiteDeviceType.LITE_CUDA)
tensor_cuda2 = LiteTensor(layout=layout, device_type=LiteDeviceType.LITE_CUDA)
assert tensor_cpu.nbytes == 4 * 8 * 2
assert tensor_cuda1.nbytes == 4 * 8 * 2
assert tensor_cuda2.nbytes == 4 * 8 * 2

arr = np.ones([4, 8], "int16")
for i in range(32):
arr[i // 8][i % 8] = i
tensor_cpu.set_data_by_share(arr.ctypes.data, 4 * 8 * 2)
tensor_cuda1.copy_from(tensor_cpu)
device_mem = tensor_cuda1.get_ctypes_memory()
tensor_cuda2.set_data_by_share(device_mem, tensor_cuda1.nbytes)
real_data1 = tensor_cuda1.to_numpy()
real_data2 = tensor_cuda2.to_numpy()
for i in range(32):
assert real_data1[i // 8][i % 8] == i
assert real_data2[i // 8][i % 8] == i


def test_tensor_share_memory_with():
layout = LiteLayout([4, 32], "int16")
tensor = LiteTensor(layout)
assert tensor.nbytes == 4 * 32 * 2

arr = np.ones([4, 32], "int16")
for i in range(128):
arr[i // 32][i % 32] = i
tensor.set_data_by_share(arr)
real_data = tensor.to_numpy()
for i in range(128):
assert real_data[i // 32][i % 32] == i

tensor2 = LiteTensor(layout)
tensor2.share_memory_with(tensor)
real_data = tensor.to_numpy()
real_data2 = tensor2.to_numpy()
for i in range(128):
assert real_data[i // 32][i % 32] == i
assert real_data2[i // 32][i % 32] == i

arr[1][18] = 5
arr[3][7] = 345
real_data = tensor2.to_numpy()
assert real_data[1][18] == 5
assert real_data[3][7] == 345


def test_empty_tensor():
empty_tensor = LiteTensor()
assert empty_tensor.layout.ndim == 0
assert empty_tensor.layout.data_type == int(LiteDataType.LITE_FLOAT)
# check empty tensor to numpy
data = empty_tensor.to_numpy()


def test_tensor_by_set_copy_with_new_layout():
layout = LiteLayout([4, 32], "int16")
tensor = LiteTensor(layout)
assert tensor.nbytes == 4 * 32 * 2

arr = np.ones([8, 64], "int32")
tensor.set_data_by_copy(arr)
new_layout = tensor.layout
assert new_layout.ndim == 2
assert new_layout.shapes[0] == 8
assert new_layout.shapes[1] == 64

tensor = LiteTensor(layout)
tensor.set_data_by_share(arr)
new_layout = tensor.layout
assert new_layout.ndim == 2
assert new_layout.shapes[0] == 8
assert new_layout.shapes[1] == 64


def test_tensor_concat():
layout = LiteLayout([4, 32], "int16")
tensors = []
arr = np.ones([4, 32], "int16")
for j in range(4):
for i in range(128):
arr[i // 32][i % 32] = j
tensor = LiteTensor(layout)
tensor.set_data_by_copy(arr)
tensors.append(tensor)
new_tensor = LiteTensorConcat(tensors, 0)

real_data = new_tensor.to_numpy()
for j in range(4):
for i in range(128):
index = j * 128 + i
assert real_data[index // 32][index % 32] == j

+ 199
- 0
lite/pylite/test/test_utils.py View File

@@ -0,0 +1,199 @@
# -*- coding: utf-8 -*-
# This file is part of MegEngine, a deep learning framework developed by
# Megvii.
#
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.

import functools

import numpy as np

from megenginelite import *


def require_cuda(func):
"""a decorator that disables a testcase if cuda is not enabled"""

@functools.wraps(func)
def wrapped(*args, **kwargs):
if LiteGlobal.get_device_count(LiteDeviceType.LITE_CUDA):
return func(*args, **kwargs)

return wrapped


@require_cuda
def test_tensor_collect_batch():
batch_tensor = TensorBatchCollector(
[4, 8, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA
)
arr = np.ones([8, 8], "int32")
for i in range(4):
batch_tensor.collect(arr)
arr += 1
data = batch_tensor.to_numpy()
assert data.shape[0] == 4
assert data.shape[1] == 8
assert data.shape[2] == 8
for i in range(4):
for j in range(64):
assert data[i][j // 8][j % 8] == i + 1


def test_tensor_collect_batch_cpu():
batch_tensor = TensorBatchCollector(
[4, 8, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CPU
)
arr = np.ones([8, 8], "int32")
for i in range(4):
batch_tensor.collect(arr)
arr += 1
data = batch_tensor.to_numpy()
assert data.shape[0] == 4
assert data.shape[1] == 8
assert data.shape[2] == 8
for i in range(4):
for j in range(64):
assert data[i][j // 8][j % 8] == i + 1


@require_cuda
def test_tensor_collect_batch_by_index():
batch_tensor = TensorBatchCollector(
[4, 8, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA
)
arr = np.ones([8, 8], "int32")
arr += 1 # ==2
batch_tensor.collect_id(arr, 1)
arr -= 1 # ==1
batch_tensor.collect_id(arr, 0)
arr += 2 # ==3
batch_tensor.collect_id(arr, 2)
arr += 1 # ==4
batch_tensor.collect_id(arr, 3)

data = batch_tensor.to_numpy()
assert data.shape[0] == 4
assert data.shape[1] == 8
assert data.shape[2] == 8
for i in range(4):
for j in range(64):
assert data[i][j // 8][j % 8] == i + 1


@require_cuda
def test_tensor_collect_batch_tensor():
batch_tensor = TensorBatchCollector(
[4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA
)
nparr = np.ones([6, 8], "int32")
tensor = LiteTensor(LiteLayout([6, 8], LiteDataType.LITE_INT))
for i in range(4):
tensor.set_data_by_share(nparr)
batch_tensor.collect(tensor)
nparr += 1
data = batch_tensor.to_numpy()
assert data.shape[0] == 4
assert data.shape[1] == 6
assert data.shape[2] == 8
for i in range(4):
for j in range(48):
assert data[i][j // 8][j % 8] == i + 1


def test_tensor_collect_batch_tensor_cpu():
batch_tensor = TensorBatchCollector(
[4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CPU
)
nparr = np.ones([6, 8], "int32")
tensor = LiteTensor(LiteLayout([6, 8], LiteDataType.LITE_INT))
for i in range(4):
tensor.set_data_by_share(nparr)
batch_tensor.collect(tensor)
nparr += 1
data = batch_tensor.to_numpy()
assert data.shape[0] == 4
assert data.shape[1] == 6
assert data.shape[2] == 8
for i in range(4):
for j in range(48):
assert data[i][j // 8][j % 8] == i + 1


@require_cuda
def test_tensor_collect_batch_ctypes():
batch_tensor = TensorBatchCollector(
[4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA
)
nparr = np.ones([6, 8], "int32")
for i in range(4):
in_data = nparr.ctypes.data
batch_tensor.collect_by_ctypes(in_data, nparr.nbytes)
nparr += 1
data = batch_tensor.to_numpy()
assert data.shape[0] == 4
assert data.shape[1] == 6
assert data.shape[2] == 8
for i in range(4):
for j in range(48):
assert data[i][j // 8][j % 8] == i + 1


def test_tensor_collect_batch_ctypes_cpu():
batch_tensor = TensorBatchCollector(
[4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CPU
)
nparr = np.ones([6, 8], "int32")
for i in range(4):
in_data = nparr.ctypes.data
batch_tensor.collect_by_ctypes(in_data, nparr.nbytes)
nparr += 1
data = batch_tensor.to_numpy()
assert data.shape[0] == 4
assert data.shape[1] == 6
assert data.shape[2] == 8
for i in range(4):
for j in range(48):
assert data[i][j // 8][j % 8] == i + 1


@require_cuda
def test_tensor_collect_batch_device_tensor():
all_tensor = LiteTensor(
LiteLayout([4, 6, 8], dtype=LiteDataType.LITE_INT),
device_type=LiteDeviceType.LITE_CUDA,
)
batch_tensor = TensorBatchCollector([4, 6, 8], tensor=all_tensor)
nparr = np.ones([6, 8], "int32")
tensor = LiteTensor(LiteLayout([6, 8], LiteDataType.LITE_INT))
for i in range(4):
tensor.set_data_by_share(nparr)
batch_tensor.collect(tensor)
nparr += 1
data = batch_tensor.to_numpy()
assert data.shape[0] == 4
assert data.shape[1] == 6
assert data.shape[2] == 8
for i in range(4):
for j in range(48):
assert data[i][j // 8][j % 8] == i + 1


@require_cuda
def test_tensor_collect_batch_device_numpy():
all_tensor = LiteTensor(
LiteLayout([4, 6, 8], dtype=LiteDataType.LITE_INT),
device_type=LiteDeviceType.LITE_CUDA,
)
batch_tensor = TensorBatchCollector([4, 6, 8], tensor=all_tensor)
nparr = np.ones([6, 8], "int32")
for i in range(4):
batch_tensor.collect(nparr)
nparr += 1
data = batch_tensor.to_numpy()
assert data.shape[0] == 4
assert data.shape[1] == 6
assert data.shape[2] == 8
for i in range(4):
for j in range(48):
assert data[i][j // 8][j % 8] == i + 1

+ 53
- 0
lite/src/decryption/aes_decrypt.h View File

@@ -0,0 +1,53 @@
/**
* \file src/decryption/aes_decrypt.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "./mbedtls/aes.h"
#include "decrypt_base.h"

namespace lite {

class AESDcryption {
public:
static std::vector<uint8_t> decrypt_model(const void* model_mem,
size_t size,
const std::vector<uint8_t>& key) {
mbedtls_aes_context ctx;
mbedtls_aes_init(&ctx);
mbedtls_aes_setkey_dec(&ctx, key.data(), 256);

auto data = static_cast<const uint8_t*>(model_mem);
//! first 16 bytes is IV
uint8_t iv[16];
//! last 8 bytes is file size(length)
auto length_ptr = data + size - 8;
size_t length = 0;
for (int i = 0; i < 8; i++) {
length |= static_cast<size_t>(length_ptr[i]) << (8 * (7 - i));
}
std::copy(data, data + 16, iv);
auto output = std::vector<uint8_t>(size - 24);
mbedtls_aes_crypt_cbc(&ctx, MBEDTLS_AES_DECRYPT, size - 24, iv,
data + 16, output.data());
mbedtls_aes_free(&ctx);
output.erase(output.begin() + length, output.end());
return output;
}

static std::vector<uint8_t> get_decrypt_key() {
std::vector<uint8_t> key = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14,
0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B,
0x1C, 0x1D, 0x1E, 0x1F};
return key;
}
};
} // namespace lite

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 49
- 0
lite/src/decryption/decrypt_base.h View File

@@ -0,0 +1,49 @@
/**
* \file src/decryption/decrypt_base.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once
#include "lite/global.h"
#include "misc.h"

namespace lite {

struct DecryptionStaticData {
std::unordered_map<
std::string,
std::pair<DecryptionFunc, std::shared_ptr<std::vector<uint8_t>>>>
decryption_methods;
LITE_MUTEX map_mutex;
};

DecryptionStaticData& decryption_static_data();

template <int count>
struct DecryptionRegister;

} // namespace lite

#define CONCAT_IMPL(a, b) a##b
#define MACRO_CONCAT(a, b) CONCAT_IMPL(a, b)

#define REGIST_DECRYPTION_METHOD(name_, func_, key_) \
REGIST_DECRYPTION_METHOD_WITH_NUM(__COUNTER__, name_, func_, key_)

#define REGIST_DECRYPTION_METHOD_WITH_NUM(number_, name_, func_, key_) \
template <> \
struct DecryptionRegister<number_> { \
DecryptionRegister() { \
register_decryption_and_key(name_, func_, key_); \
} \
}; \
namespace { \
DecryptionRegister<number_> MACRO_CONCAT(decryption_, number_); \
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 1363
- 0
lite/src/decryption/mbedtls/aes.cc
File diff suppressed because it is too large
View File


+ 349
- 0
lite/src/decryption/mbedtls/aes.h View File

@@ -0,0 +1,349 @@
/**
* \file aes.h
*
* \brief AES block cipher
*
* Copyright (C) 2006-2015, ARM Limited, All Rights Reserved
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* This file is part of mbed TLS (https://tls.mbed.org)
*/

/**
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#ifndef MBEDTLS_AES_H
#define MBEDTLS_AES_H

#if !defined(MBEDTLS_CONFIG_FILE)
#include "config.h"
#else
#include MBEDTLS_CONFIG_FILE
#endif

#include <stddef.h>
#include <stdint.h>

/* padlock.c and aesni.c rely on these values! */
#define MBEDTLS_AES_ENCRYPT 1
#define MBEDTLS_AES_DECRYPT 0

#define MBEDTLS_ERR_AES_INVALID_KEY_LENGTH -0x0020 /**< Invalid key length. */
#define MBEDTLS_ERR_AES_INVALID_INPUT_LENGTH \
-0x0022 /**< Invalid data input length. */

#if (defined(__ARMCC_VERSION) || defined(_MSC_VER)) && !defined(inline) && \
!defined(__cplusplus)
#define inline __inline
#endif

#if !defined(MBEDTLS_AES_ALT)
// Regular implementation
//

#ifdef __cplusplus
extern "C" {
#endif

/**
* \brief AES context structure
*
* \note buf is able to hold 32 extra bytes, which can be used:
* - for alignment purposes if VIA padlock is used, and/or
* - to simplify key expansion in the 256-bit case by
* generating an extra round key
*/
typedef struct {
int nr; /*!< number of rounds */
uint32_t* rk; /*!< AES round keys */
uint32_t buf[68]; /*!< unaligned data */
} mbedtls_aes_context;

/**
* \brief Initialize AES context
*
* \param ctx AES context to be initialized
*/
void mbedtls_aes_init(mbedtls_aes_context* ctx);

/**
* \brief Clear AES context
*
* \param ctx AES context to be cleared
*/
void mbedtls_aes_free(mbedtls_aes_context* ctx);

/**
* \brief AES key schedule (encryption)
*
* \param ctx AES context to be initialized
* \param key encryption key
* \param keybits must be 128, 192 or 256
*
* \return 0 if successful, or MBEDTLS_ERR_AES_INVALID_KEY_LENGTH
*/
int mbedtls_aes_setkey_enc(mbedtls_aes_context* ctx, const unsigned char* key,
unsigned int keybits);

/**
* \brief AES key schedule (decryption)
*
* \param ctx AES context to be initialized
* \param key decryption key
* \param keybits must be 128, 192 or 256
*
* \return 0 if successful, or MBEDTLS_ERR_AES_INVALID_KEY_LENGTH
*/
int mbedtls_aes_setkey_dec(mbedtls_aes_context* ctx, const unsigned char* key,
unsigned int keybits);

/**
* \brief AES-ECB block encryption/decryption
*
* \param ctx AES context
* \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT
* \param input 16-byte input block
* \param output 16-byte output block
*
* \return 0 if successful
*/
int mbedtls_aes_crypt_ecb(mbedtls_aes_context* ctx, int mode,
const unsigned char input[16],
unsigned char output[16]);

#if defined(MBEDTLS_CIPHER_MODE_CBC)
/**
* \brief AES-CBC buffer encryption/decryption
* Length should be a multiple of the block
* size (16 bytes)
*
* \note Upon exit, the content of the IV is updated so that you can
* call the same function again on the following
* block(s) of data and get the same result as if it was
* encrypted in one call. This allows a "streaming" usage.
* If on the other hand you need to retain the contents of the
* IV, you should either save it manually or use the cipher
* module instead.
*
* \param ctx AES context
* \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT
* \param length length of the input data
* \param iv initialization vector (updated after use)
* \param input buffer holding the input data
* \param output buffer holding the output data
*
* \return 0 if successful, or MBEDTLS_ERR_AES_INVALID_INPUT_LENGTH
*/
int mbedtls_aes_crypt_cbc(mbedtls_aes_context* ctx, int mode, size_t length,
unsigned char iv[16], const unsigned char* input,
unsigned char* output);
#endif /* MBEDTLS_CIPHER_MODE_CBC */

#if defined(MBEDTLS_CIPHER_MODE_CFB)
/**
* \brief AES-CFB128 buffer encryption/decryption.
*
* Note: Due to the nature of CFB you should use the same key schedule for
* both encryption and decryption. So a context initialized with
* mbedtls_aes_setkey_enc() for both MBEDTLS_AES_ENCRYPT and
* MBEDTLS_AES_DECRYPT.
*
* \note Upon exit, the content of the IV is updated so that you can
* call the same function again on the following
* block(s) of data and get the same result as if it was
* encrypted in one call. This allows a "streaming" usage.
* If on the other hand you need to retain the contents of the
* IV, you should either save it manually or use the cipher
* module instead.
*
* \param ctx AES context
* \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT
* \param length length of the input data
* \param iv_off offset in IV (updated after use)
* \param iv initialization vector (updated after use)
* \param input buffer holding the input data
* \param output buffer holding the output data
*
* \return 0 if successful
*/
int mbedtls_aes_crypt_cfb128(mbedtls_aes_context* ctx, int mode, size_t length,
size_t* iv_off, unsigned char iv[16],
const unsigned char* input, unsigned char* output);

/**
* \brief AES-CFB8 buffer encryption/decryption.
*
* Note: Due to the nature of CFB you should use the same key schedule for
* both encryption and decryption. So a context initialized with
* mbedtls_aes_setkey_enc() for both MBEDTLS_AES_ENCRYPT and
* MBEDTLS_AES_DECRYPT.
*
* \note Upon exit, the content of the IV is updated so that you can
* call the same function again on the following
* block(s) of data and get the same result as if it was
* encrypted in one call. This allows a "streaming" usage.
* If on the other hand you need to retain the contents of the
* IV, you should either save it manually or use the cipher
* module instead.
*
* \param ctx AES context
* \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT
* \param length length of the input data
* \param iv initialization vector (updated after use)
* \param input buffer holding the input data
* \param output buffer holding the output data
*
* \return 0 if successful
*/
int mbedtls_aes_crypt_cfb8(mbedtls_aes_context* ctx, int mode, size_t length,
unsigned char iv[16], const unsigned char* input,
unsigned char* output);
#endif /*MBEDTLS_CIPHER_MODE_CFB */

#if defined(MBEDTLS_CIPHER_MODE_CTR)
/**
* \brief AES-CTR buffer encryption/decryption
*
* Warning: You have to keep the maximum use of your counter in mind!
*
* Note: Due to the nature of CTR you should use the same key schedule for
* both encryption and decryption. So a context initialized with
* mbedtls_aes_setkey_enc() for both MBEDTLS_AES_ENCRYPT and
* MBEDTLS_AES_DECRYPT.
*
* \param ctx AES context
* \param length The length of the data
* \param nc_off The offset in the current stream_block (for resuming
* within the current cipher stream). The offset pointer
* should be 0 at the start of a stream.
* \param nonce_counter The 128-bit nonce and counter.
* \param stream_block The saved stream-block for resuming. Is overwritten
* by the function.
* \param input The input data stream
* \param output The output data stream
*
* \return 0 if successful
*/
int mbedtls_aes_crypt_ctr(mbedtls_aes_context* ctx, size_t length,
size_t* nc_off, unsigned char nonce_counter[16],
unsigned char stream_block[16],
const unsigned char* input, unsigned char* output);
#endif /* MBEDTLS_CIPHER_MODE_CTR */

/**
* \brief Internal AES block encryption function
* (Only exposed to allow overriding it,
* see MBEDTLS_AES_ENCRYPT_ALT)
*
* \param ctx AES context
* \param input Plaintext block
* \param output Output (ciphertext) block
*
* \return 0 if successful
*/
int mbedtls_internal_aes_encrypt(mbedtls_aes_context* ctx,
const unsigned char input[16],
unsigned char output[16]);

/**
* \brief Internal AES block decryption function
* (Only exposed to allow overriding it,
* see MBEDTLS_AES_DECRYPT_ALT)
*
* \param ctx AES context
* \param input Ciphertext block
* \param output Output (plaintext) block
*
* \return 0 if successful
*/
int mbedtls_internal_aes_decrypt(mbedtls_aes_context* ctx,
const unsigned char input[16],
unsigned char output[16]);

#if !defined(MBEDTLS_DEPRECATED_REMOVED)
#if defined(MBEDTLS_DEPRECATED_WARNING)
#define MBEDTLS_DEPRECATED __attribute__((deprecated))
#else
#define MBEDTLS_DEPRECATED
#endif
/**
* \brief Internal AES block encryption function
* (Only exposed to allow overriding it,
* see MBEDTLS_AES_ENCRYPT_ALT)
*
* \deprecated Superseded by mbedtls_aes_encrypt_ext() in 2.5.0
*
* \param ctx AES context
* \param input Plaintext block
* \param output Output (ciphertext) block
*/
MBEDTLS_DEPRECATED static inline void mbedtls_aes_encrypt(
mbedtls_aes_context* ctx, const unsigned char input[16],
unsigned char output[16]) {
mbedtls_internal_aes_encrypt(ctx, input, output);
}

/**
* \brief Internal AES block decryption function
* (Only exposed to allow overriding it,
* see MBEDTLS_AES_DECRYPT_ALT)
*
* \deprecated Superseded by mbedtls_aes_decrypt_ext() in 2.5.0
*
* \param ctx AES context
* \param input Ciphertext block
* \param output Output (plaintext) block
*/
MBEDTLS_DEPRECATED static inline void mbedtls_aes_decrypt(
mbedtls_aes_context* ctx, const unsigned char input[16],
unsigned char output[16]) {
mbedtls_internal_aes_decrypt(ctx, input, output);
}

#undef MBEDTLS_DEPRECATED
#endif /* !MBEDTLS_DEPRECATED_REMOVED */

#ifdef __cplusplus
}
#endif

#else /* MBEDTLS_AES_ALT */
#include "aes_alt.h"
#endif /* MBEDTLS_AES_ALT */

#ifdef __cplusplus
extern "C" {
#endif

/**
* \brief Checkup routine
*
* \return 0 if successful, or 1 if the test failed
*/
int mbedtls_aes_self_test(int verbose);

#ifdef __cplusplus
}
#endif

#endif /* aes.h */

+ 5
- 0
lite/src/decryption/mbedtls/config.h View File

@@ -0,0 +1,5 @@
#pragma once

#define MBEDTLS_AES_C
#define MBEDTLS_AES_ROM_TABLES
#define MBEDTLS_CIPHER_MODE_CBC

+ 156
- 0
lite/src/decryption/rc4/rc4_cryption_base.h View File

@@ -0,0 +1,156 @@
/**
* \file src/decryption/rc4/rc4_cryption_base.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once

#include <algorithm>
#include <cstdint>

namespace lite {
namespace rc4 {

#define m256(x) static_cast<uint8_t>(x)

/*! \brief Pseudo-random byte stream for RC4.
*/
class RC4RandStream {
public:
RC4RandStream() = default;

RC4RandStream(uint64_t key) { reset(key); }

void reset(uint64_t init_key) {
i_ = j_ = 0;
for (int i = 0; i < 256; i++)
s_[i] = i;
uint8_t j = 0;
for (int i = 0; i < 256; i++) {
j = j + s_[i] + m256(init_key >> ((i % 8) * 8));
std::swap(s_[i], s_[j]);
}
// drop
for (int i = 0; i < 768; i++) {
next8();
}
for (int i = 0, t = next8(); i < t; i++) {
next8();
}
}

uint8_t next8() {
i_++;
uint8_t a = s_[i_];
j_ += a;
uint8_t b = s_[j_];
s_[i_] = b;
s_[j_] = a;
uint8_t c = s_[m256((i_ << 5) ^ (j_ >> 3))] +
s_[m256((j_ << 5) ^ (i_ >> 3))];
return (s_[m256(a + b)] + s_[c ^ 0xAA]) ^ s_[m256(j_ + b)];
}

uint64_t next64() {
uint64_t rst;
uint8_t* buf = reinterpret_cast<uint8_t*>(&rst);
for (int i = 0; i < 8; i++) {
buf[i] = next8();
}
return rst;
}

private:
uint8_t s_[256], i_ = 0, j_ = 0;
};
#undef m256

/*!
* \brief fast and secure 64-bit hash
* see https://code.google.com/p/fast-hash/
*/
class FastHash64 {
public:
FastHash64(uint64_t seed)
: hash_{seed},
mul0_{key_gen_hash_mul0()},
mul1_{key_gen_hash_mul1()} {}

void feed(uint64_t val) {
val ^= val >> 23;
val *= mul0_;
val ^= val >> 47;
hash_ ^= val;
hash_ *= mul1_;
}

uint64_t get() { return hash_; }

private:
uint64_t hash_;
const uint64_t mul0_, mul1_;

static uint64_t key_gen_hash_mul0() {
uint64_t rst;
uint8_t volatile* buf = reinterpret_cast<uint8_t*>(&rst);
buf[2] = 50;
buf[3] = 244;
buf[6] = 39;
buf[1] = 92;
buf[5] = 89;
buf[4] = 155;
buf[0] = 55;
buf[7] = 33;
return rst;
}

static uint64_t key_gen_hash_mul1() {
uint64_t rst;
uint8_t volatile* buf = reinterpret_cast<uint8_t*>(&rst);
buf[6] = 3;
buf[2] = 109;
buf[7] = 136;
buf[1] = 25;
buf[5] = 85;
buf[0] = 101;
buf[4] = 242;
buf[3] = 30;
return rst;
}
};

// The encryption keys are always inlined.
static inline uint64_t key_gen_enc_key() {
uint64_t rst;
uint8_t volatile* buf = reinterpret_cast<uint8_t*>(&rst);
buf[4] = 120;
buf[3] = 121;
buf[7] = 122;
buf[6] = 123;
buf[0] = 124;
buf[5] = 125;
buf[2] = 126;
buf[1] = 127;
return rst;
}

static inline uint64_t key_gen_hash_key() {
uint64_t rst;
uint8_t volatile* buf = reinterpret_cast<uint8_t*>(&rst);
buf[2] = 101;
buf[5] = 102;
buf[4] = 103;
buf[7] = 104;
buf[1] = 105;
buf[3] = 106;
buf[6] = 107;
buf[0] = 108;
return rst;
}
} // namespace rc4
} // namespace lite

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 219
- 0
lite/src/decryption/rc4/rc4_cryption_impl.cpp View File

@@ -0,0 +1,219 @@
/**
* \file src/decryption/rc4/rc4_cryption_impl.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "rc4_cryption_impl.h"
#include "../../misc.h"

#include <cstring>

using namespace lite;

/*!
* \brief Read the input stream once in order to initialize the decryption
* state.
*/
void RC4Impl::init_rc4_state() {
rc4::RC4RandStream enc_stream(m_enc_key);
rc4::FastHash64 dechash(m_hash_key);

size_t offset = 0;

std::vector<uint64_t> buffer(128);
size_t remaining = m_model_length - sizeof(uint64_t);
while (remaining > 0) {
size_t toread = std::min(remaining, buffer.size() * sizeof(uint64_t));
memcpy(buffer.data(), static_cast<const uint8_t*>(m_model_mem) + offset,
toread);
offset += toread;
remaining -= toread;

for (size_t i = 0; i < toread / sizeof(uint64_t); ++i) {
uint64_t value = buffer[i];
value ^= enc_stream.next64();
dechash.feed(value);
}
}

uint64_t hashvalue;
memcpy(&hashvalue, static_cast<const uint8_t*>(m_model_mem) + offset,
sizeof(hashvalue));
offset += sizeof(hashvalue);

hashvalue ^= dechash.get() ^ enc_stream.next64();
m_state.hash_stream.reset(hashvalue);
m_state.enc_stream.reset(m_enc_key);
}

std::vector<uint8_t> RC4Impl::decrypt_model() {
std::vector<uint8_t> result(m_model_length, 0);

uint8_t* ptr = result.data();
for (size_t i = 0; i < m_model_length; ++i) {
ptr[i] = static_cast<const uint8_t*>(m_model_mem)[i];
ptr[i] ^= m_state.hash_stream.next8() ^ m_state.enc_stream.next8();
}
return result;
}

/*! \brief Encrypt the data in m_buffer.
*
* The basic idea is to calculate a 64-bit hash from the buffer and append
* it to the end of the buffer. The requirement is that changing any single
* byte, including the hash value itself, corrupts the whole decrypted
* model.
*
* Encryption:
*
* 1. First calculate a 64-bit hash, called plain hash value, from the
* buffer.
* 2. Initialize a RC4 stream with the plain hash value.
* 3. Obfuscate the model body with the RC4 stream defined in step 2.
* 4. Calculate the hash value of the obfuscated model, called hash value
* after hashing.
* 5. Encrypt the model body with a RC4 stream made from the encryption key.
* 6. Bit-xor the hash value after hashing with the plain hash value, called
* mixed hash.
* 7. Encrypt the mixed hash with the RC4 stream defined in step 5, called
* the protected hash.
* 8. Append the protected hash to the buffer.
*
* Decryption:
* 1. Decrypt the model body with a RC4 stream made from the encryption key,
* which is the reverse of step 5 and 7 of encryption and get the mixed
* hash.
* 2. Calculate the hash value of the decrypted model, which equals to the
* hash value after hashing in step 4 of encryption.
* 3. Bit-xor the hash value after hashing and the mixed hash to get the
* plain hash value, which is the reverse of step 6 of encryption.
* 4. Un-obfuscate the model body with the plain hash value, which is the
* reverse of step 3 of encryption.
*
* Think:
* 1. If any byte in the model body is broken, the hash value after hashing
* will be broken in step 2, and hence the plain hash value in step 3
* will also be broken, and finally the model body will be broken in
* step 4.
* 2. If the protected hash is broken, the plain hash value in step 3 will
* be broken, and finally the model body will be broken.
*/
std::vector<uint8_t> RC4Impl::encrypt_model() {
size_t total_length = (m_model_length + (sizeof(size_t) - 1)) /
sizeof(size_t) * sizeof(size_t);
std::vector<uint8_t> pad_model(total_length, 0);
memcpy(pad_model.data(), m_model_mem, m_model_length);

// Calculate the hash of the model.
rc4::FastHash64 plainhash(m_hash_key);
uint64_t* ptr = reinterpret_cast<uint64_t*>(pad_model.data());
size_t len = pad_model.size() / sizeof(uint64_t);

for (size_t i = 0; i < len; ++i)
plainhash.feed(ptr[i]);
uint64_t plainhash_value = plainhash.get();

// Encrypt the model.
rc4::RC4RandStream hash_enc(plainhash_value);
rc4::RC4RandStream outmost_enc(m_enc_key);
rc4::FastHash64 afterhashenc_hash(m_hash_key);

for (size_t i = 0; i < len; ++i) {
uint64_t value = ptr[i] ^ hash_enc.next64();
afterhashenc_hash.feed(value);
ptr[i] = value ^ outmost_enc.next64();
}

uint64_t protected_hash =
plainhash_value ^ afterhashenc_hash.get() ^ outmost_enc.next64();

size_t end = pad_model.size();
pad_model.resize(pad_model.size() + sizeof(uint64_t));
ptr = reinterpret_cast<uint64_t*>(&pad_model[end]);
*ptr = protected_hash;
return pad_model;
}

/*!
* \brief Read the input stream once in order to initialize the decryption
* state.
*/
void SimpleFastRC4Impl::init_sfrc4_state() {
rc4::RC4RandStream enc_stream(m_enc_key);
rc4::FastHash64 dechash(m_hash_key);

size_t offset = 0;
std::vector<uint64_t> buffer(128);
size_t remaining = m_model_length - sizeof(uint64_t);
while (remaining > 0) {
size_t toread = std::min(remaining, buffer.size() * sizeof(uint64_t));
memcpy(buffer.data(), static_cast<const uint8_t*>(m_model_mem) + offset,
toread);
offset += toread;
remaining -= toread;

for (size_t i = 0; i < toread / sizeof(uint64_t); ++i) {
uint64_t value = buffer[i];
dechash.feed(value);
}
}

uint64_t hashvalue;
memcpy(&hashvalue, static_cast<const uint8_t*>(m_model_mem) + offset,
sizeof(hashvalue));

offset += sizeof(hashvalue);

/*! \brief test the hash_val. */
if (hashvalue != dechash.get())
LITE_THROW(
"The checksum of the file cannot be verified. The file may "
"be encrypted in the wrong algorithm or different keys.");

m_state.hash_stream.reset(m_hash_key);
m_state.enc_stream.reset(m_enc_key);
}

std::vector<uint8_t> SimpleFastRC4Impl::decrypt_model() {
std::vector<uint8_t> result(m_model_length, 0);
uint8_t* ptr = result.data();
for (size_t i = 0; i < m_model_length; ++i) {
ptr[i] = static_cast<const uint8_t*>(m_model_mem)[i];
ptr[i] ^= m_state.enc_stream.next8();
}
return result;
}

std::vector<uint8_t> SimpleFastRC4Impl::encrypt_model() {
size_t total_length = (m_model_length + (sizeof(size_t) - 1)) /
sizeof(size_t) * sizeof(size_t);
std::vector<uint8_t> pad_model(total_length, 0);
memcpy(pad_model.data(), m_model_mem, m_model_length);

// Calculate the hash of the model.
rc4::FastHash64 enchash(m_hash_key);
uint64_t* ptr = reinterpret_cast<uint64_t*>(pad_model.data());
size_t len = pad_model.size() / sizeof(uint64_t);

// Encrypt the model.
rc4::RC4RandStream out_enc(m_enc_key);
for (size_t i = 0; i < len; ++i) {
ptr[i] = ptr[i] ^ out_enc.next64();
enchash.feed(ptr[i]);
}

uint64_t hash_value = enchash.get();

size_t end = pad_model.size();
pad_model.resize(pad_model.size() + sizeof(uint64_t));
ptr = reinterpret_cast<uint64_t*>(&pad_model[end]);
*ptr = hash_value;

return pad_model;
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
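A minimal standalone sketch of the XOR bookkeeping behind the protected hash computed above: because XOR is its own inverse, mixing the plain hash with the hash of the hash-encrypted body and one keystream word can be undone by anyone able to recompute the latter two. The three 64-bit values below are made-up stand-ins, not real FastHash64 or RC4RandStream output.

#include <cassert>
#include <cstdint>

int main() {
    // Stand-in values; the real code derives these from FastHash64 and
    // RC4RandStream as shown in RC4Impl::encrypt_model().
    uint64_t plain_hash = 0x0123456789abcdefULL;      // hash of the plain model
    uint64_t after_enc_hash = 0xfedcba9876543210ULL;  // hash of the hash-encrypted body
    uint64_t keystream_word = 0x55aa55aa55aa55aaULL;  // outermost stream's next64()

    // What encrypt_model() appends to the padded buffer.
    uint64_t protected_hash = plain_hash ^ after_enc_hash ^ keystream_word;

    // Decryption recomputes after_enc_hash and the keystream word, so the
    // plain hash is recovered by XOR-ing them back out.
    uint64_t recovered = protected_hash ^ after_enc_hash ^ keystream_word;
    assert(recovered == plain_hash);
    return 0;
}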

+ 79
- 0
lite/src/decryption/rc4/rc4_cryption_impl.h View File

@@ -0,0 +1,79 @@
/**
* \file src/decryption/rc4/rc4_cryption_impl.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once
#include "rc4_cryption_base.h"

#include <memory>
#include <vector>

namespace lite {

class RC4Impl {
struct RC4State {
rc4::RC4RandStream enc_stream;
rc4::RC4RandStream hash_stream;
} m_state;

public:
RC4Impl(const void* model_mem, size_t size, const std::vector<uint8_t>& key)
: m_model_mem(model_mem), m_model_length(size) {
const uint8_t* data = key.data();
m_hash_key = *reinterpret_cast<const uint64_t*>(data);
m_enc_key = *reinterpret_cast<const uint64_t*>(data + 8);
}

std::vector<uint8_t> encrypt_model();
std::vector<uint8_t> decrypt_model();

/*! \brief Read the input stream once in order to initialize the decryption
* state.
*/
void init_rc4_state();

private:
const void* m_model_mem;
size_t m_model_length;

uint64_t m_hash_key;
uint64_t m_enc_key;
};

class SimpleFastRC4Impl {
struct SFRC4State {
rc4::RC4RandStream enc_stream;
rc4::RC4RandStream hash_stream;
} m_state;

public:
SimpleFastRC4Impl(const void* model_mem, size_t size,
const std::vector<uint8_t>& key)
: m_model_mem(model_mem), m_model_length(size) {
const uint8_t* data = key.data();
m_hash_key = *reinterpret_cast<const uint64_t*>(data);
m_enc_key = *reinterpret_cast<const uint64_t*>(data + 8);
}
std::vector<uint8_t> encrypt_model();
std::vector<uint8_t> decrypt_model();

/*! \brief Read the input stream once in order to initialize the decryption
* state.
*/
void init_sfrc4_state();

private:
const void* m_model_mem;
size_t m_model_length;

uint64_t m_hash_key;
uint64_t m_enc_key;
};

} // namespace lite

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 58
- 0
lite/src/decryption/rc4_cryption.cpp View File

@@ -0,0 +1,58 @@
/**
* \file src/decryption/rc4_cryption.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "rc4_cryption.h"
#include "rc4/rc4_cryption_impl.h"

#include <vector>

using namespace lite;

std::vector<uint8_t> RC4::decrypt_model(const void* model_mem, size_t size,
const std::vector<uint8_t>& key) {
RC4Impl rc4_impl(model_mem, size, key);
rc4_impl.init_rc4_state();
return rc4_impl.decrypt_model();
}

std::vector<uint8_t> RC4::encrypt_model(const void* model_mem, size_t size,
const std::vector<uint8_t>& key) {
RC4Impl rc4_impl(model_mem, size, key);
return rc4_impl.encrypt_model();
}

std::vector<uint8_t> RC4::get_decrypt_key() {
std::vector<uint8_t> keys(128, 0);
uint64_t* data = reinterpret_cast<uint64_t*>(keys.data());
data[0] = rc4::key_gen_hash_key();
data[1] = rc4::key_gen_enc_key();
return keys;
};

std::vector<uint8_t> SimpleFastRC4::decrypt_model(
const void* model_mem, size_t size, const std::vector<uint8_t>& key) {
SimpleFastRC4Impl simple_fast_rc4_impl(model_mem, size, key);
simple_fast_rc4_impl.init_sfrc4_state();
return simple_fast_rc4_impl.decrypt_model();
}
std::vector<uint8_t> SimpleFastRC4::encrypt_model(
const void* model_mem, size_t size, const std::vector<uint8_t>& key) {
SimpleFastRC4Impl simple_fast_rc4_impl(model_mem, size, key);
return simple_fast_rc4_impl.encrypt_model();
}

std::vector<uint8_t> SimpleFastRC4::get_decrypt_key() {
std::vector<uint8_t> keys(128, 0);
uint64_t* data = reinterpret_cast<uint64_t*>(keys.data());
data[0] = rc4::key_gen_hash_key();
data[1] = rc4::key_gen_enc_key();
return keys;
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 44
- 0
lite/src/decryption/rc4_cryption.h View File

@@ -0,0 +1,44 @@
/**
* \file src/decryption/rc4_cryption.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#pragma once

#include "rc4/rc4_cryption_base.h"

#include <vector>

namespace lite {

class RC4 {
public:
static std::vector<uint8_t> decrypt_model(const void* model_mem,
size_t size,
const std::vector<uint8_t>& key);

static std::vector<uint8_t> encrypt_model(const void* model_mem,
size_t size,
const std::vector<uint8_t>& key);

static std::vector<uint8_t> get_decrypt_key();
};

class SimpleFastRC4 {
public:
static std::vector<uint8_t> decrypt_model(const void* model_mem,
size_t size,
const std::vector<uint8_t>& key);
static std::vector<uint8_t> encrypt_model(const void* model_mem,
size_t size,
const std::vector<uint8_t>& key);

static std::vector<uint8_t> get_decrypt_key();
};

} // namespace lite

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
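A hedged usage sketch of the static helpers declared above: generate a key, encrypt a model buffer, and decrypt it back with the same key. The include path is assumed to be relative to lite/src; the snippet is illustrative and not part of the public lite API.

#include "decryption/rc4_cryption.h"

#include <cstdint>
#include <vector>

std::vector<uint8_t> rc4_round_trip(const std::vector<uint8_t>& model) {
    using namespace lite;
    // 128-byte key buffer; only the first 16 bytes (hash key + encryption key)
    // are consumed by the RC4 implementation.
    std::vector<uint8_t> key = RC4::get_decrypt_key();
    std::vector<uint8_t> enc = RC4::encrypt_model(model.data(), model.size(), key);
    // The decrypted buffer may still carry the 8-byte alignment padding that
    // encrypt_model() added.
    return RC4::decrypt_model(enc.data(), enc.size(), key);
}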

+ 53
- 0
lite/src/function_base.h View File

@@ -0,0 +1,53 @@
/**
* \file src/function_base.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once
#include <unordered_map>
#include "misc.h"
#include "type_info.h"
// template <typename tensor_type, typename ...Arg>
namespace lite {
class TensorImplDft;
class NetworkImplDft;
namespace {

template <typename class_type>
struct class_type_name {
std::string operator()() { return ""; }
};
#define ADD_STATEMENT(class_name, backend_name) \
template <> \
struct class_type_name<class_name> { \
std::string operator()() { return #backend_name; } \
}
ADD_STATEMENT(TensorImplDft, Dft);
ADD_STATEMENT(NetworkImplDft, Dft);
#undef ADD_STATEMENT
} // namespace

// if no matching function is found, ignore the call and return a default value
template <typename tensor_type, typename ret_type, typename... Args>
ret_type try_call_func(std::string func_name, Args... args) {
mark_used_variable(func_name);
mark_used_variable(args...);
return nullptr;
}

// if no matching function is found, throw an error
template <typename tensor_type, typename ret_type, typename... Args>
ret_type call_func(std::string func_name, Args... args) {
mark_used_variable(args...);
auto backend_name = class_type_name<tensor_type>()();
auto msg_info =
func_name + " is not aviliable in " + backend_name + " backend.";
LITE_THROW(msg_info.c_str());
}
} // namespace lite

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
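A minimal, self-contained sketch of the dispatch idea used above: a generic fallback template that reports "not available", plus an explicit specialization that supplies the real behaviour for one backend. The names DemoBackend and do_work are illustrative only; the real specializations live in function_dft.h.

#include <stdexcept>
#include <string>

struct DemoBackend {};  // stands in for TensorImplDft / NetworkImplDft

// Generic fallback: any backend without a matching specialization throws.
template <typename backend_type, typename ret_type, typename... Args>
ret_type demo_call_func(std::string func_name, Args...) {
    throw std::runtime_error(func_name + " is not available in this backend.");
}

// Full specialization providing the actual implementation for DemoBackend.
template <>
int demo_call_func<DemoBackend, int>(std::string func_name) {
    if (func_name == "do_work")
        return 42;
    throw std::runtime_error(func_name + " is not available in DemoBackend.");
}

// usage: int v = demo_call_func<DemoBackend, int>("do_work");  // v == 42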

+ 256
- 0
lite/src/global.cpp View File

@@ -0,0 +1,256 @@
/**
* \file src/global.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include <lite_build_config.h>

#include "lite/global.h"
#include "decryption/aes_decrypt.h"
#include "decryption/decrypt_base.h"
#include "decryption/rc4_cryption.h"
#include "misc.h"
#include "parse_info/parse_info_base.h"
#include "parse_info/default_parse.h"

#if LITE_BUILD_WITH_MGE
#include "megbrain/common.h"
#include "megbrain/comp_node.h"
#include "megbrain/serialization/extern_c_opr.h"
#include "megbrain/version.h"
#include "megcore_opencl.h"
#include "mge/algo_cache/file_cache.h"
#include "mge/common.h"
#if MGB_ENABLE_TENSOR_RT
#include "megbrain/tensorrt/tensorrt_engine_cache.h"
#endif
#if LITE_WITH_CUDA
#include "mge/algo_cache/redis_cache.h"
#endif
#endif

#include <mutex>
#include <unordered_map>

using namespace lite;

lite::DecryptionStaticData& lite::decryption_static_data() {
static lite::DecryptionStaticData global_map;
return global_map;
}

void lite::get_version(int& major, int& minor, int& patch) {
#if LITE_BUILD_WITH_MGE
auto version = mgb::get_version();
major = version.major;
minor = version.minor;
patch = version.patch;
#else
    //! without mge, report the maximum version number
major = 8;
minor = 9999;
patch = 0;
#endif
}

size_t lite::get_device_count(LiteDeviceType device_type) {
#if LITE_BUILD_WITH_MGE
auto mgb_device_type = to_compnode_locator(device_type).type;
return mgb::CompNode::get_device_count(mgb_device_type);
#else
LITE_MARK_USED_VAR(device_type);
LITE_THROW("no lite backend avialible, please check build macro.");
#endif
}

bool lite::register_decryption_and_key(std::string decrypt_name,
const DecryptionFunc& func,
const std::vector<uint8_t>& key) {
LITE_LOCK_GUARD(decryption_static_data().map_mutex);
auto& global_map = decryption_static_data().decryption_methods;
if (global_map.find(decrypt_name) != global_map.end()) {
LITE_THROW(ssprintf("The decryption method %s is already registered.",
decrypt_name.c_str()));
return false;
} else {
auto key_pointer = std::make_shared<std::vector<uint8_t>>(key);
global_map[decrypt_name] = {func, key_pointer};
LITE_LOG("Registered ecryption method %s.", decrypt_name.c_str());
return true;
}
}

bool lite::update_decryption_or_key(std::string decrypt_name,
const DecryptionFunc& func,
const std::vector<uint8_t>& key) {
LITE_LOCK_GUARD(decryption_static_data().map_mutex);
auto& global_map = decryption_static_data().decryption_methods;
if (global_map.find(decrypt_name) != global_map.end()) {
std::shared_ptr<std::vector<uint8_t>> key_pointer;
DecryptionFunc new_func;
if (func) {
new_func = func;
LITE_LOG("%s decryption function is updated.",
decrypt_name.c_str());
} else {
new_func = global_map[decrypt_name].first;
}
if (key.size()) {
key_pointer = std::make_shared<std::vector<uint8_t>>(key);
LITE_LOG("%s decryption key is updated.", decrypt_name.c_str());
} else {
key_pointer = global_map[decrypt_name].second;
}
global_map[decrypt_name] = {new_func, key_pointer};
return true;
} else {
LITE_THROW(ssprintf("The decryption method %s is not registered.",
decrypt_name.c_str()));
return false;
}
}

lite::ParseInfoStaticData& lite::parse_info_static_data() {
static lite::ParseInfoStaticData global_map;
return global_map;
}

bool lite::register_parse_info_func(std::string info_type,
const ParseInfoFunc& parse_func) {
LITE_LOCK_GUARD(parse_info_static_data().map_mutex);
auto& global_map = parse_info_static_data().parse_info_methods;
if (global_map.find(info_type) != global_map.end()) {
LITE_THROW(ssprintf("The parse info method %s is already registered.",
info_type.c_str()));
return false;
} else {
global_map[info_type] = parse_func;
LITE_LOG("Registered infomation parser method %s.", info_type.c_str());
return true;
}
}

#if LITE_BUILD_WITH_MGE

namespace {
struct CacheControl {
LITE_MUTEX cache_mutex;
std::string cache_type = "file";
std::atomic_size_t config_algo_times{0};
std::atomic_size_t config_trt_times{0};
};
CacheControl cache_control;
} // namespace


void lite::try_coalesce_all_free_memory() {
mgb::CompNode::try_coalesce_all_free_memory();
}

void lite::set_loader_lib_path(const std::string& loader_path) {
const char* lib_path = loader_path.c_str();
LITE_LOG("load a device loader of path %s.", lib_path);
auto handle = dlopen(lib_path, RTLD_LAZY);
LITE_ASSERT(handle, "failed to open c opr lib %s: %s", lib_path, dlerror());
const char* entry = MGB_C_OPR_INIT_FUNC_STR;
auto func = dlsym(handle, entry);
LITE_ASSERT(func, "can not resolve %s: %s", entry, dlerror());
typedef void (*entry_f_t)(void*);
reinterpret_cast<entry_f_t>(func)(
reinterpret_cast<void*>(&mgb_get_extern_c_opr_api_versioned));
}

void lite::set_persistent_cache(const std::string& cache_path,
bool always_sync) {
LITE_LOCK_GUARD(cache_control.cache_mutex);
cache_control.cache_type = "file";
if (cache_control.config_algo_times >= 1) {
LITE_WARN(
"The cache has been set,maybe some model is using now, change "
"it now may cause unknow error!!");
}
cache_control.config_algo_times++;
mgb::PersistentCache::set_impl(std::make_shared<InFilePersistentCache>(
cache_path.c_str(), always_sync));
}

void lite::dump_persistent_cache(const std::string& cache_path) {
LITE_LOCK_GUARD(cache_control.cache_mutex);
LITE_ASSERT(cache_control.cache_type == "file",
"now cache type is redis, it can't be dumped.");
static_cast<InFilePersistentCache&>(mgb::PersistentCache::inst())
.dump_cache(cache_path.c_str());
}

//! Set the TensorRT engine cache path for serialized prebuilt ICudaEngine
void lite::set_tensor_rt_cache(std::string tensorrt_cache_path) {
#if MGB_ENABLE_TENSOR_RT
LITE_LOCK_GUARD(cache_control.cache_mutex);
if (cache_control.config_trt_times >= 1) {
LITE_WARN(
"The trt cache has been set,maybe some model is using now, "
"change it now may cause unknow error!!");
}
cache_control.config_trt_times++;
mgb::TensorRTEngineCache::enable_engine_cache(true);
mgb::TensorRTEngineCache::set_impl(
std::make_shared<mgb::TensorRTEngineCacheIO>(tensorrt_cache_path));
#else
LITE_MARK_USED_VAR(tensorrt_cache_path);
LITE_THROW("TensorRT is disable at compile time.");
#endif
}

void lite::dump_tensor_rt_cache() {
#if MGB_ENABLE_TENSOR_RT
if (mgb::TensorRTEngineCache::enable_engine_cache()) {
mgb::TensorRTEngineCache::inst().dump_cache();
}
#else
LITE_THROW("TensorRT is disable at compile time.");
#endif
}

#else //LITE_BUILD_WITH_MGE
void lite::try_coalesce_all_free_memory() {}

void lite::set_loader_lib_path(const std::string& ) {
LITE_THROW("mge is disbale at build time, please build with mge");
}

void lite::set_persistent_cache(const std::string&, bool) {
LITE_THROW("mge is disbale at build time, please build with mge");
}

void lite::dump_persistent_cache(const std::string& ) {
LITE_THROW("mge is disbale at build time, please build with mge");
}

//! Set the TensorRT engine cache path for serialized prebuilt ICudaEngine
void lite::set_tensor_rt_cache(std::string ) {
LITE_THROW("mge is disbale at build time, please build with mge");
}

void lite::dump_tensor_rt_cache() {
LITE_THROW("mge is disbale at build time, please build with mge");
}
#endif
namespace lite {
REGIST_DECRYPTION_METHOD("AES_default", lite::AESDcryption::decrypt_model,
lite::AESDcryption::get_decrypt_key());

REGIST_DECRYPTION_METHOD("RC4_default", lite::RC4::decrypt_model,
lite::RC4::get_decrypt_key());

REGIST_DECRYPTION_METHOD("SIMPLE_FAST_RC4_default",
lite::SimpleFastRC4::decrypt_model,
lite::SimpleFastRC4::get_decrypt_key());

REGIST_PARSE_INFO_FUNCTION("LITE_default", lite::default_parse_info);
} // namespace lite

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
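A hedged sketch of registering a user-supplied decryption method through the API above. It assumes DecryptionFunc accepts a callable with the same shape as RC4::decrypt_model; the method name "my_xor", the trivial XOR cipher, and the one-byte key are illustrative only.

#include "lite/global.h"

#include <cstddef>
#include <cstdint>
#include <vector>

static std::vector<uint8_t> my_xor_decrypt(const void* model_mem, size_t size,
                                           const std::vector<uint8_t>& key) {
    const uint8_t* src = static_cast<const uint8_t*>(model_mem);
    std::vector<uint8_t> result(src, src + size);
    for (size_t i = 0; i < size; ++i)
        result[i] ^= key[i % key.size()];
    return result;
}

void register_my_xor_method() {
    // "my_xor" is a hypothetical method name; pick something unique per scheme.
    lite::register_decryption_and_key("my_xor", my_xor_decrypt, {0x5a});
}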

+ 37
- 0
lite/src/lite_build_config.h.in View File

@@ -0,0 +1,37 @@
/**
* \file lite/src/lite_build_config.h.in
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/
#ifndef _HEADER_LITE_BUILD_CONFIG
#define _HEADER_LITE_BUILD_CONFIG

#cmakedefine01 LITE_ENABLE_LOGGING
#cmakedefine01 LITE_ENABLE_EXCEPTION
#cmakedefine01 LITE_WITH_CUDA
#cmakedefine01 LITE_ASSERT_LOC

#ifndef LITE_ENABLE_LOGGING
#define LITE_ENABLE_LOGGING 1
#endif

#ifndef LITE_ENABLE_EXCEPTION
#if __cpp_exceptions || __EXCEPTIONS || \
(defined(_MSC_VER) && defined(_CPPUNWIND))
#define LITE_ENABLE_EXCEPTION 1
#else
#define LITE_ENABLE_EXCEPTION 0
#endif
#endif

#ifndef LITE_WITH_CUDA
#define LITE_WITH_CUDA 0
#endif

#ifndef LITE_ASSERT_LOC
#define LITE_ASSERT_LOC 0
#endif
#endif // _HEADER_LITE_BUILD_CONFIG

+ 254
- 0
lite/src/mge/algo_cache/file_cache.cpp View File

@@ -0,0 +1,254 @@
/**
* \file lite/src/mge/algo_cache/file_cache.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved.
*/

#include "lite_build_config.h"

#if LITE_BUILD_WITH_MGE
#include "../common.h"
#include "file_cache.h"

using namespace lite;

//////////////////////// InFilePersistentCache::InputMemory ///////////////
class InFilePersistentCache::InputMemory {
const uint8_t* m_ptr;
size_t m_offset = 0;
size_t m_size;

public:
InputMemory(const uint8_t* bin, size_t size) : m_ptr{bin}, m_size{size} {}

template <typename T>
void read(T& val) {
static_assert(std::is_trivially_copyable<T>::value,
"only support trivially copyable type");
LITE_ASSERT(m_offset + sizeof(T) <= m_size);
memcpy(&val, m_ptr, sizeof(T));
m_offset += sizeof(T);
m_ptr += sizeof(T);
}

template <typename T>
void read(T* buf, size_t size) {
static_assert(std::is_trivially_copyable<T>::value && sizeof(T) == 1,
"only support read bytes");
LITE_ASSERT(m_offset + size <= m_size);
memcpy(buf, m_ptr, size);
m_offset += size;
m_ptr += size;
}
};

//////////////////////// InFilePersistentCache::InputFile ///////////////
class InFilePersistentCache::InputFile {
FILE* m_fp;

public:
InputFile(const char* path) : m_fp{fopen(path, "rb")} {
LITE_ASSERT(m_fp, "failed to open %s: %s", path, strerror(errno));
}
~InputFile() {
if (m_fp) {
fclose(m_fp);
}
}

template <typename T>
void read(T& val) {
static_assert(std::is_trivially_copyable<T>::value,
"only support trivially copyable type");
auto ret = fread(&val, sizeof(T), 1, m_fp);
LITE_ASSERT(ret == 1);
}

template <typename T>
void read(T* buf, size_t size) {
static_assert(std::is_trivially_copyable<T>::value && sizeof(T) == 1,
"only support read bytes");
auto ret = fread(buf, size, 1, m_fp);
LITE_ASSERT(ret == 1);
}
};

//////////////////////// InFilePersistentCache::OutputFile ///////////////
class InFilePersistentCache::OutputFile {
FILE* m_fp;

public:
OutputFile(const char* path) : m_fp{fopen(path, "wb")} {
LITE_ASSERT(m_fp, "failed to open %s: %s", path, strerror(errno));
}
~OutputFile() {
if (m_fp) {
fclose(m_fp);
}
}

template <typename T>
void write(T val) {
auto ret = fwrite(&val, sizeof(T), 1, m_fp);
LITE_ASSERT(ret == 1);
}

template <typename T>
void write(const T* buf, size_t size) {
static_assert(sizeof(T) == 1, "only support write bytes");
auto ret = fwrite(buf, size, 1, m_fp);
LITE_ASSERT(ret == 1);
}

void flush() { fflush(m_fp); }

void set_head() { fseek(m_fp, 0, SEEK_SET); }
};

//////////////////////// InFilePersistentCache::BlobStorage ///////////////

template <typename Input>
InFilePersistentCache::BlobStorage&
InFilePersistentCache::BlobStorage::init_from_input(Input& inp) {
uint32_t data_size;
inp.read(data_size);
size = data_size;
data_refhold = std::make_unique<uint8_t[]>(size);
inp.read(data_refhold.get(), size);
ptr = data_refhold.get();
return *this;
}

void InFilePersistentCache::BlobStorage::write_to_file(
OutputFile& out_file) const {
uint32_t u_size = size;
out_file.write(u_size);
out_file.write(data_refhold.get(), u_size);
}

InFilePersistentCache::BlobStorage&
InFilePersistentCache::BlobStorage::init_data_ref(const Blob& b) {
data_refhold = std::make_unique<uint8_t[]>(b.size + 1);
memcpy(data_refhold.get(), b.ptr, b.size);
data_refhold.get()[b.size] = 0; // for C-string safety
ptr = data_refhold.get();
size = b.size;
return *this;
}

//////////////////////// InFilePersistentCache //////////////////////

template <typename Input>
void InFilePersistentCache::read_cache(Input& inp) {
uint32_t nr_category;
inp.read(nr_category);
char category_buf[256];
for (uint32_t i = 0; i < nr_category; i++) {
uint32_t category_size;
inp.read(category_size);
inp.read(category_buf, category_size);
category_buf[category_size] = '\0';

std::string category(category_buf);
mgb_log_debug("load new category: %s", category_buf);

// read bobs
uint32_t nr_bobs;
inp.read(nr_bobs);
for (uint32_t j = 0; j < nr_bobs; j++) {
BlobStorage key_storage;
key_storage.init_from_input(inp).init_hash();
mgb_log_debug("read key: %zu", key_storage.hash);
m_cache[category][std::move(key_storage)].init_from_input(inp);
}
}
}

InFilePersistentCache::InFilePersistentCache(const char* path,
bool always_open) {
if (!access(path, F_OK)) {
mgb_log_debug("use fastrun cache: %s", path);
InputFile inp(path);
read_cache<InputFile>(inp);
}
if (always_open) {
m_always_open_file = std::make_shared<OutputFile>(path);
}
}

InFilePersistentCache::InFilePersistentCache(const uint8_t* bin, size_t size) {
LITE_ASSERT(bin);
InputMemory inp(bin, size);
read_cache<InputMemory>(inp);
}

void InFilePersistentCache::dump_cache(const char* path) {
OutputFile out_file(path);
dump_cache(&out_file);
}

void InFilePersistentCache::dump_cache(OutputFile* out_file) {
uint32_t nr_category = m_cache.size();
out_file->write(nr_category);

for (const auto& cached_category : m_cache) {
uint32_t category_size = cached_category.first.size();
out_file->write(category_size);
out_file->write(cached_category.first.data(), category_size);
mgb_log_debug("write new category: %s", cached_category.first.c_str());

uint32_t nr_bobs = cached_category.second.size();
out_file->write(nr_bobs);
for (const auto& item : cached_category.second) {
mgb_log_debug("dump key: %zu", item.first.hash);
item.first.write_to_file(*out_file);
item.second.write_to_file(*out_file);
}
}
}

mgb::Maybe<InFilePersistentCache::Blob> InFilePersistentCache::get(
const std::string& category, const Blob& key) {
decltype(m_cache.begin()) iter0;
{
MGB_LOCK_GUARD(m_mtx);
iter0 = m_cache.find(category);
if (iter0 == m_cache.end())
return mgb::None;
}

BlobStorage key_storage;
key_storage.Blob::operator=(key);
key_storage.init_hash();

MGB_LOCK_GUARD(m_mtx);

auto iter1 = iter0->second.find(key_storage);
if (iter1 == iter0->second.end())
return mgb::None;
return iter1->second;
}

void InFilePersistentCache::put(const std::string& category, const Blob& key,
const Blob& value) {
BlobStorage key_storage;
key_storage.init_data_ref(key).init_hash();

MGB_LOCK_GUARD(m_mtx);
auto size0 = m_cache.size();
m_cache[category][std::move(key_storage)].init_data_ref(value);
if (m_cache.size() > size0) {
mgb_log_debug("new cache category: %s", category.c_str());
}
if (m_always_open_file) {
m_always_open_file->set_head();
dump_cache(m_always_open_file.get());
m_always_open_file->flush();
}
}
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 85
- 0
lite/src/mge/algo_cache/file_cache.h View File

@@ -0,0 +1,85 @@
/**
* \file lite/src/mge/algo_cache/file_cache.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved.
*/

#pragma once

#include "lite_build_config.h"
#if LITE_BUILD_WITH_MGE

#include "megbrain/utils/persistent_cache.h"

namespace lite {

/**
 * dump format:
 *
 * all integers are stored in the local endianness (effectively little
 * endian on the supported platforms)
 *
 * <nr_category|uint32_t>[<category_size|uint32_t><category|uint8_t*>
 * <nr_blob|uint32_t>[<key_size|uint32_t><key|uint8_t*><data_size|
 * uint32_t><data|uint8_t*>]*]*
*/
//! TODO: handle one thread setting the cache while other threads are still using the old cache
class InFilePersistentCache final : public mgb::PersistentCache {
class InputFile;
class InputMemory;
class OutputFile;
struct BlobStorage : public Blob {
std::unique_ptr<uint8_t[]> data_refhold;
size_t hash = 0;

template <typename Input>
BlobStorage& init_from_input(Input& inp);
void write_to_file(OutputFile& out_file) const;
BlobStorage& init_data_ref(const Blob& b);

BlobStorage& init_hash() {
hash = mgb::XXHash{}.update(ptr, size).digest();
return *this;
}

bool operator==(const BlobStorage& rhs) const {
return size == rhs.size && !memcmp(ptr, rhs.ptr, size);
}

struct Hash {
size_t operator()(const BlobStorage& b) const { return b.hash; }
};
};
std::unordered_map<std::string, std::unordered_map<BlobStorage, BlobStorage,
BlobStorage::Hash>>
m_cache;
LITE_MUTEX m_mtx;
std::shared_ptr<OutputFile> m_always_open_file;

template <typename Input>
void read_cache(Input& inp);

public:
InFilePersistentCache() = default;
InFilePersistentCache(const char* path, bool always_open = false);
InFilePersistentCache(const uint8_t* bin, size_t size);

/**
     * \warning You should invoke \c dump_cache manually to save the cache
* file.
*/
void dump_cache(const char* path);
void dump_cache(OutputFile* out_file);

mgb::Maybe<Blob> get(const std::string& category, const Blob& key) override;
void put(const std::string& category, const Blob& key,
const Blob& value) override;
};

} // namespace lite
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
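A hedged usage sketch for the cache class above, mirroring what lite::set_persistent_cache() does in global.cpp: load any existing entries from a file and install the cache as the global fast-run cache. The include path is assumed to be relative to lite/src.

#include "mge/algo_cache/file_cache.h"

#include <memory>

void install_file_cache(const char* cache_path) {
    // With always_open = true every put() rewrites the whole cache back to
    // the file, so the on-disk copy stays in sync without an explicit dump.
    auto cache = std::make_shared<lite::InFilePersistentCache>(
            cache_path, /*always_open=*/true);
    mgb::PersistentCache::set_impl(cache);
}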

+ 241
- 0
lite/src/mge/algo_cache/redis_cache.cpp View File

@@ -0,0 +1,241 @@
/**
* \file lite/src/mge/algo_cache/redis_cache.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved.
*/

#include "lite_build_config.h"

#if !defined(WIN32) && LITE_BUILD_WITH_MGE && LITE_WITH_CUDA
#include "../../misc.h"
#include "redis_cache.h"

#include <iostream>
#include <vector>

namespace {

/*
** Translation Table as described in RFC1113
*/
static const char cb64[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/*
** Translation Table to decode:
*https://github.com/dgiardini/imgcalkap/blob/master/base64.c
*/
static const char cd64[] =
"|$$$}rstuvwxyz{$$$$$$$>?@ABCDEFGHIJKLMNOPQRSTUVW$$$$$$XYZ[\\]^_`"
"abcdefghijklmnopq";

/*
** encodeblock
**
** encode 3 8-bit binary bytes as 4 '6-bit' characters
*/
void encodeblock(unsigned char in[3], unsigned char out[4], int len) {
out[0] = cb64[in[0] >> 2];
out[1] = cb64[((in[0] & 0x03) << 4) | ((in[1] & 0xf0) >> 4)];
out[2] = (unsigned char)(len > 1 ? cb64[((in[1] & 0x0f) << 2) |
((in[2] & 0xc0) >> 6)]
: '=');
out[3] = (unsigned char)(len > 2 ? cb64[in[2] & 0x3f] : '=');
}

/*
** decodeblock
**
** decode 4 '6-bit' characters into 3 8-bit binary bytes
*/
void decodeblock(unsigned char in[4], unsigned char out[3]) {
out[0] = (unsigned char)(in[0] << 2 | in[1] >> 4);
out[1] = (unsigned char)(in[1] << 4 | in[2] >> 2);
out[2] = (unsigned char)(((in[2] << 6) & 0xc0) | in[3]);
}

/**
 * Encode binary data to a base64 byte buffer
 * @param input - source data
 * @param outdata - target base64 data
 * @param linesize - max number of characters per output line
*/
void encode(const std::vector<std::uint8_t>& input,
std::vector<std::uint8_t>& outdata, int linesize = 76) {
outdata.clear();

unsigned char in[3], out[4];
int i, len, blocksout = 0;
size_t j = 0;

auto* indata = reinterpret_cast<const unsigned char*>(input.data());
unsigned int insize = input.size();

while (j <= insize) {
len = 0;
for (i = 0; i < 3; i++) {
in[i] = (unsigned char)indata[j];
j++;
if (j <= insize) {
len++;
} else {
in[i] = 0;
}
}
if (len) {
encodeblock(in, out, len);
for (i = 0; i < 4; i++) {
outdata.push_back(out[i]);
}
blocksout++;
}
if (blocksout >= (linesize / 4) || (j == insize)) {
if (blocksout) {
outdata.push_back('\r');
outdata.push_back('\n');
}
blocksout = 0;
}
}
}

/**
 * Decode a base64 byte buffer back to the source data
 * @param input - base64 data
 * @param outdata - decoded source data
*/
void decode(const std::vector<std::uint8_t>& input,
std::vector<std::uint8_t>& outdata) {
outdata.clear();

unsigned char in[4], out[3], v;
int i, len;
size_t j = 0;

auto* indata = reinterpret_cast<const unsigned char*>(input.data());
unsigned int insize = input.size();

while (j <= insize) {
for (len = 0, i = 0; i < 4 && (j <= insize); i++) {
v = 0;
while ((j <= insize) && v == 0) {
v = (unsigned char)indata[j++];
v = (unsigned char)((v < 43 || v > 122) ? 0 : cd64[v - 43]);
if (v) {
v = (unsigned char)((v == '$') ? 0 : v - 61);
}
}
if (j <= insize) {
len++;
if (v) {
in[i] = (unsigned char)(v - 1);
}
} else {
in[i] = 0;
}
}
if (len) {
decodeblock(in, out);
for (i = 0; i < len - 1; i++) {
outdata.push_back(out[i]);
}
}
}
}

/**
 * Encode a string to a base64 string
 * @param input - source string
 * @param outdata - target base64 string
 * @param linesize - max number of characters per output line
*/
void encode(const std::string& input, std::string& outdata, int linesize = 76) {
std::vector<std::uint8_t> out;
std::vector<std::uint8_t> in(input.begin(), input.end());
encode(in, out, linesize);
outdata = std::string(out.begin(), out.end());
}

/**
 * Decode a base64 string back to the source string
 * @param input - base64 string
 * @param outdata - decoded source string
*/
void decode(const std::string& input, std::string& outdata) {
std::vector<std::uint8_t> in(input.begin(), input.end());
std::vector<std::uint8_t> out;
decode(in, out);
outdata = std::string(out.begin(), out.end());
}

} // namespace

using namespace lite;

RedisCache::RedisCache(std::string redis_ip, size_t port, std::string password)
: m_ip(redis_ip), m_port(port), m_password(password) {
m_client.auth(password);
m_client.connect(
m_ip, m_port,
[](const std::string& host, std::size_t port,
cpp_redis::connect_state status) {
if (status == cpp_redis::connect_state::dropped) {
LITE_LOG("client disconnected from %s.", host.c_str());
LITE_LOG("Redis server connect to %s :%zu failed.",
host.c_str(), port);
}
},
std::uint32_t(200));
}

mgb::Maybe<mgb::PersistentCache::Blob> RedisCache::get(
const std::string& category, const mgb::PersistentCache::Blob& key) {
LITE_LOCK_GUARD(m_mtx);
if (m_old == nullptr) {
return mgb::None;
}
auto mem_result = m_old->get(category, key);
if (mem_result.valid())
return mem_result;

std::string key_str(static_cast<const char*>(key.ptr), key.size);
std::string redis_key_str;
encode(category + '@' + key_str, redis_key_str, 24);
auto result = m_client.get(redis_key_str);
m_client.sync_commit<double, std::milli>(std::chrono::milliseconds(100));
LITE_ASSERT(is_valid());
auto content = result.get();
if (content.is_null())
return mgb::None;
std::string decode_content;
decode(content.as_string(), decode_content);
m_old->put(category, key, {decode_content.data(), decode_content.length()});

return m_old->get(category, key);
}

void RedisCache::put(const std::string& category, const Blob& key,
const mgb::PersistentCache::Blob& value) {
// ScopedTimer t1(std::string("put") + category);
LITE_LOCK_GUARD(m_mtx);
std::string key_str(static_cast<const char*>(key.ptr), key.size);
std::string redis_key_str;
encode(category + '@' + key_str, redis_key_str);
std::string value_str(static_cast<const char*>(value.ptr), value.size);
std::string redis_value_str;
encode(value_str, redis_value_str);

auto result = m_client.set(redis_key_str, redis_value_str);
if (m_old == nullptr) {
return;
}
m_old->put(category, key, value);
m_client.sync_commit<double, std::milli>(std::chrono::milliseconds(100));
LITE_ASSERT(is_valid());
}
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 47
- 0
lite/src/mge/algo_cache/redis_cache.h View File

@@ -0,0 +1,47 @@
/**
* \file lite/src/mge/algo_cache/redis_cache.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved.
*/

#pragma once

#include "lite_build_config.h"

#if !defined(WIN32) && LITE_BUILD_WITH_MGE && LITE_WITH_CUDA
#include <cpp_redis/cpp_redis>
#include <string>
#include <vector>
#include "megbrain/utils/persistent_cache.h"

namespace lite {

//! TODO: handle one thread setting the cache while other threads are still using the old cache
class RedisCache final : public mgb::PersistentCache {
public:
RedisCache(std::string redis_ip, size_t port, std::string password);

bool is_valid() { return m_client.is_connected(); }
~RedisCache() {}
void init(std::shared_ptr<mgb::PersistentCache> old) { m_old = old; }

mgb::Maybe<Blob> get(const std::string& category, const Blob& key) override;

void put(const std::string& category, const Blob& key,
const Blob& value) override;

private:
std::shared_ptr<mgb::PersistentCache> m_old;
LITE_MUTEX m_mtx;
cpp_redis::client m_client;
const std::string m_ip;
const size_t m_port;
const std::string m_password;
};

} // namespace lite
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
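A hedged sketch of layering the Redis-backed cache above over an existing local cache: init() stores the previous PersistentCache, get() consults it first and only falls back to Redis on a miss, and put() writes to both. Host, port and password below are placeholders.

#include "mge/algo_cache/redis_cache.h"

#include <memory>

void install_redis_cache(std::shared_ptr<mgb::PersistentCache> local_cache) {
    auto redis = std::make_shared<lite::RedisCache>(
            "127.0.0.1", 6379, "password-placeholder");
    // Keep the local cache as the first level; Redis serves cache misses and
    // receives every put() as well.
    redis->init(local_cache);
    mgb::PersistentCache::set_impl(redis);
}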

+ 191
- 0
lite/src/mge/common.cpp View File

@@ -0,0 +1,191 @@
/**
* \file src/mge/common.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "lite_build_config.h"

#if LITE_BUILD_WITH_MGE
#include "common.h"
#include "megdnn/dtype.h"

using namespace lite;
using namespace mgb;

enum class CompressionMethod {
NO_COMPRESSION = 0,
FLOAT32_STRIDE_FLOAT32_BASE_UINT8_WEIGHTS = 1,
FLOAT32_STRIDE_FLOAT32_BASE_UINT16_WEIGHTS = 2,
};

void lite::decompressed_tensor_value_loader(
void* ptr_, const mgb::TensorLayout& layout,
mgb::serialization::InputFile& fin) {
uint8_t compress_flag;
fin.read(&compress_flag, sizeof(compress_flag));
size_t num_weights = layout.total_nr_elems();
switch (CompressionMethod(compress_flag)) {
case CompressionMethod::NO_COMPRESSION: {
mgb::serialization::GraphLoadConfig::default_tensor_value_loader(
ptr_, layout, fin);
break;
}
case CompressionMethod::FLOAT32_STRIDE_FLOAT32_BASE_UINT8_WEIGHTS: {
if (ptr_) {
float stride, base;
std::vector<uint8_t> weights(num_weights);
fin.read(&stride, sizeof(stride));
fin.read(&base, sizeof(base));
fin.read(weights.data(), num_weights * sizeof(uint8_t));
auto* ptr = static_cast<float*>(ptr_);
for (size_t i = 0; i < num_weights; ++i)
ptr[i] = stride * weights[i] + base;
} else {
fin.skip(sizeof(float) * 2 + num_weights * sizeof(uint8_t));
}
break;
}
case CompressionMethod::FLOAT32_STRIDE_FLOAT32_BASE_UINT16_WEIGHTS: {
if (ptr_) {
float stride, base;
std::vector<uint16_t> weights(num_weights);
fin.read(&stride, sizeof(stride));
fin.read(&base, sizeof(base));
fin.read(weights.data(), num_weights * sizeof(uint16_t));
auto* ptr = static_cast<float*>(ptr_);
for (size_t i = 0; i < num_weights; ++i)
ptr[i] = stride * weights[i] + base;
} else {
fin.skip(sizeof(float) * 2 + num_weights * sizeof(uint16_t));
}
break;
}
default:
LITE_THROW("Unexpected compression method");
}
}

LTensorLayout lite::to_impl_layout(const Layout& layout) {
mgb::TensorLayout mge_layout;
mge_layout.ndim = layout.ndim;
LITE_ASSERT(layout.ndim < TensorShape::MAX_NDIM,
"lite layout ndim is to large");
for (size_t i = 0; i < layout.ndim; i++) {
mge_layout.shape[i] = layout.shapes[i];
}
mge_layout.init_contiguous_stride();
switch (layout.data_type) {
case LiteDataType::LITE_FLOAT:
mge_layout.dtype = mgb::dtype::Float32();
break;
case LiteDataType::LITE_HALF:
mge_layout.dtype = mgb::dtype::Float16();
break;
case LiteDataType::LITE_INT:
mge_layout.dtype = mgb::dtype::Int32();
break;
case LiteDataType::LITE_INT8:
mge_layout.dtype = mgb::dtype::Int8();
break;
case LiteDataType::LITE_UINT8:
mge_layout.dtype = mgb::dtype::Uint8();
break;
case LiteDataType::LITE_INT16:
mge_layout.dtype = mgb::dtype::Int16();
break;
default:
LITE_THROW(mgb::ssprintf("unsupport dtype in lite enum id is %d.",
static_cast<int>(layout.data_type)));
}
return mge_layout;
}

Layout lite::to_lite_layout(const LTensorLayout& mge_layout) {
Layout layout;
if (!mge_layout.dtype.valid()) {
return layout;
}
layout.ndim = mge_layout.ndim;
    LITE_ASSERT(layout.ndim < layout.MAXDIM, "tensor layout ndim is too large");
for (size_t i = 0; i < layout.ndim; i++) {
layout.shapes[i] = mge_layout.shape[i];
}
switch (mge_layout.dtype.enumv()) {
case mgb::DTypeEnum::Float32:
layout.data_type = LiteDataType::LITE_FLOAT;
break;
case mgb::DTypeEnum::Float16:
layout.data_type = LiteDataType::LITE_HALF;
break;
case mgb::DTypeEnum::Int32:
layout.data_type = LiteDataType::LITE_INT;
break;
case mgb::DTypeEnum::Int16:
layout.data_type = LiteDataType::LITE_INT16;
break;
case mgb::DTypeEnum::Int8:
layout.data_type = LiteDataType::LITE_INT8;
break;
case mgb::DTypeEnum::Uint8:
layout.data_type = LiteDataType::LITE_UINT8;
break;
default:
LITE_THROW(mgb::ssprintf("unsupport dtype in lite : %s.",
mge_layout.to_string().c_str()));
}
return layout;
}

mgb::CompNode::Locator lite::to_compnode_locator(const LiteDeviceType& device) {
mgb::CompNode::Locator loc;
switch (device) {
case LiteDeviceType::LITE_CPU:
loc.type = mgb::CompNode::DeviceType::CPU;
break;
case LiteDeviceType::LITE_CUDA:
loc.type = mgb::CompNode::DeviceType::CUDA;
break;
case LiteDeviceType::LITE_ATLAS:
loc.type = mgb::CompNode::DeviceType::ATLAS;
break;
case LiteDeviceType::LITE_OPENCL:
loc.type = mgb::CompNode::DeviceType::OPENCL;
break;
case LiteDeviceType::LITE_DEVICE_DEFAULT:
loc.type = mgb::CompNode::DeviceType::UNSPEC;
break;
default:
LITE_THROW(
ssprintf("lite unsupported compnode type: enum value: %d.",
(int)(device)));
}
return loc;
}

LiteDeviceType lite::get_device_from_locator(
const mgb::CompNode::Locator& locator) {
switch (locator.type) {
case mgb::CompNode::DeviceType::CPU:
case mgb::CompNode::DeviceType::MULTITHREAD:
return LiteDeviceType::LITE_CPU;
case mgb::CompNode::DeviceType::CUDA:
return LiteDeviceType::LITE_CUDA;
case mgb::CompNode::DeviceType::ATLAS:
return LiteDeviceType::LITE_ATLAS;
case mgb::CompNode::DeviceType::OPENCL:
return LiteDeviceType::LITE_OPENCL;
case mgb::CompNode::DeviceType::UNSPEC:
return LiteDeviceType::LITE_DEVICE_DEFAULT;
default:
LITE_THROW(
ssprintf("lite unsupported compnode type: enum value: %d.",
(int)(locator.type)));
}
}
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
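A small hedged sketch of the layout conversion helpers defined above: build a lite Layout, convert it to a megbrain TensorLayout and back. The field names (ndim, shapes, data_type) follow their use in to_impl_layout(); the snippet is assumed to be compiled inside lite/src/mge where "common.h" is reachable.

#include "common.h"  // lite::to_impl_layout / lite::to_lite_layout

void layout_round_trip_demo() {
    lite::Layout lite_layout;
    lite_layout.ndim = 4;
    lite_layout.shapes[0] = 1;
    lite_layout.shapes[1] = 3;
    lite_layout.shapes[2] = 224;
    lite_layout.shapes[3] = 224;
    lite_layout.data_type = LiteDataType::LITE_FLOAT;

    // Layout -> mgb::TensorLayout (contiguous strides, Float32 dtype) -> Layout
    mgb::TensorLayout mge_layout = lite::to_impl_layout(lite_layout);
    lite::Layout round_tripped = lite::to_lite_layout(mge_layout);
    (void)round_tripped;  // ndim == 4 and data_type == LITE_FLOAT again
}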

+ 66
- 0
lite/src/mge/common.h View File

@@ -0,0 +1,66 @@
/**
* \file src/mge/common.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once

#include "lite_build_config.h"

#if LITE_BUILD_WITH_MGE
#include "../misc.h"
#include "lite/network.h"
#include "lite/tensor.h"
#include "megbrain/comp_node.h"
#include "megbrain/serialization/serializer.h"
#include "megbrain/tensor.h"

//! alias mgb types with an L* prefix
namespace lite {
using LTensorLayout = mgb::TensorLayout;
using LComputingGraph = mgb::ComputingGraph;
using LDeviceTensorStorage = mgb::DeviceTensorStorage;
} // namespace lite

namespace lite {
/*!
* \brief transform mgelite Layout to mgb TensorLayout
*/
LTensorLayout to_impl_layout(const Layout& layout);

/*!
* \brief transform mgb TensorLayout to mgelite Layout
*/
Layout to_lite_layout(const mgb::TensorLayout& mge_layout);

/*!
* \brief transform mgelite device to mgb CompNode Locator
*/
mgb::CompNode::Locator to_compnode_locator(const LiteDeviceType& device);

/*!
* \brief transform mgb CompNode Locator to lite Device
*/
LiteDeviceType get_device_from_locator(const mgb::CompNode::Locator& locator);

/*! \brief A megbrain tensor loader with weight decompression.
*
* The weight to be compressed must start with a byte of compression flag (CF).
*
* 1. CF = 0: no compression.
* 2. CF = 1: float32 stride + float32 base + uint8 weight (return s*w+b)
* 3. CF = 2: float32 stride + float32 base + uint16 weight (return s*w+b)
*
*/
void decompressed_tensor_value_loader(void* ptr_,
const mgb::TensorLayout& layout,
mgb::serialization::InputFile& fin);

} // namespace lite
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
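A standalone hedged sketch of the CF = 1 rule documented above (float32 stride + float32 base + uint8 weights, reconstructed as stride * w + base), using made-up numbers instead of a real serialized stream.

#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<float> decompress_uint8_demo() {
    // Pretend these were read from the stream right after the CF = 1 flag byte.
    float stride = 0.5f;
    float base = -1.0f;
    std::vector<uint8_t> weights = {0, 2, 4, 255};

    std::vector<float> out(weights.size());
    for (std::size_t i = 0; i < weights.size(); ++i)
        out[i] = stride * weights[i] + base;  // yields {-1.0, 0.0, 1.0, 126.5}
    return out;
}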

+ 212
- 0
lite/src/mge/function_dft.h View File

@@ -0,0 +1,212 @@

/**
* \file src/mge/function_dft.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once
#if LITE_BUILD_WITH_MGE
#include "function_base.h"
#include "network_impl.h"
#include "network_impl_base.h"
#include "tensor_impl.h"
namespace lite {

#define THROW_FUNC_ERROR(func_name) \
auto msg_info = func_name + " is not aviliable in Dft backend."; \
LITE_THROW(msg_info.c_str())

// the functions used by dft's tensor.cpp are as follows:

template <>
inline std::shared_ptr<Tensor::TensorImplBase>
call_func<TensorImplDft, std::shared_ptr<Tensor::TensorImplBase>>(
std::string func_name) {
if (func_name == "create_tensor") {
return std::make_shared<TensorImplDft>();
}
THROW_FUNC_ERROR(func_name);
}

template <>
inline std::shared_ptr<Tensor::TensorImplBase>
call_func<TensorImplDft, std::shared_ptr<Tensor::TensorImplBase>>(
std::string func_name, LiteDeviceType device_type,
bool is_pinned_host) {
if (func_name == "create_tensor") {
return std::make_shared<TensorImplDft>(device_type, is_pinned_host);
}
THROW_FUNC_ERROR(func_name);
}

template <>
inline std::shared_ptr<Tensor::TensorImplBase>
call_func<TensorImplDft, std::shared_ptr<Tensor::TensorImplBase>>(
std::string func_name, int device_id, LiteDeviceType device_type,
const Layout layout, bool is_pinned_host) {
if (func_name == "create_tensor") {
return std::make_shared<TensorImplDft>(device_id, device_type, layout,
is_pinned_host);
}
THROW_FUNC_ERROR(func_name);
}

template <>
inline std::shared_ptr<Tensor::TensorImplBase>
call_func<TensorImplDft, std::shared_ptr<Tensor::TensorImplBase>>(
std::string func_name, LiteDeviceType device_type, const Layout layout,
bool is_pinned_host) {
if (func_name == "create_tensor") {
return std::make_shared<TensorImplDft>(device_type, layout,
is_pinned_host);
}
THROW_FUNC_ERROR(func_name);
}

template <>
inline std::shared_ptr<Tensor::TensorImplBase>
call_func<TensorImplDft, std::shared_ptr<Tensor::TensorImplBase>>(
std::string func_name, int device_id, int stream_id,
LiteDeviceType device_type, bool is_pinned_host) {
if (func_name == "create_tensor") {
return std::make_shared<TensorImplDft>(device_id, stream_id,
device_type, is_pinned_host);
}
THROW_FUNC_ERROR(func_name);
}

// the functions used by dft's network.cpp are as follows:

template <>
inline std::unique_ptr<Network::NetworkImplBase>
call_func<NetworkImplDft, std::unique_ptr<Network::NetworkImplBase>>(
std::string func_name) {
if (func_name == "create_network") {
return std::make_unique<NetworkImplDft>();
}
THROW_FUNC_ERROR(func_name);
}

template <>
inline Network::NetworkImplBase*
try_call_func<NetworkImplDft, Network::NetworkImplBase*>(
std::string func_name) {
if (func_name == "parse_model") {
return new NetworkImplDft();
}
THROW_FUNC_ERROR(func_name);
}

#define CALL_FUNC(func_name, ...) \
network_impl->cast_final_safe<NetworkImplDft>().func_name(__VA_ARGS__)

template <>
inline void call_func<NetworkImplDft, void>(
std::string func_name, Network::NetworkImplBase* network_impl,
size_t num) {
if (func_name == "set_cpu_threads_number") {
CALL_FUNC(set_cpu_threads_number, num);
} else if (func_name == "set_network_algo_workspace_limit") {
CALL_FUNC(set_network_algo_workspace_limit, num);
} else {
THROW_FUNC_ERROR(func_name);
}
}

template <>
inline void call_func<NetworkImplDft, void>(
std::string func_name, Network::NetworkImplBase* network_impl) {
if (func_name == "use_tensorrt") {
CALL_FUNC(use_tensorrt);
} else if (func_name == "set_cpu_inplace_mode") {
CALL_FUNC(set_cpu_inplace_mode);
} else {
THROW_FUNC_ERROR(func_name);
}
}

template <>
inline size_t call_func<NetworkImplDft, size_t>(
std::string func_name, Network::NetworkImplBase* network_impl) {
if (func_name == "get_cpu_threads_number") {
return CALL_FUNC(get_cpu_threads_number);
}
THROW_FUNC_ERROR(func_name);
}

template <>
inline bool call_func<NetworkImplDft, bool>(
std::string func_name, Network::NetworkImplBase* network_impl) {
if (func_name == "is_cpu_inplace_mode") {
return CALL_FUNC(is_cpu_inplace_mode);
}
THROW_FUNC_ERROR(func_name);
}

template <>
inline void call_func<NetworkImplDft, void>(
std::string func_name, Network::NetworkImplBase* network_impl,
ThreadAffinityCallback thread_affinity_callback) {
if (func_name == "set_runtime_thread_affinity") {
return CALL_FUNC(set_runtime_thread_affinity,
std::move(thread_affinity_callback));
}
THROW_FUNC_ERROR(func_name);
}

template <>
inline void call_func<NetworkImplDft, void>(
std::string func_name, Network::NetworkImplBase* network_impl,
LiteAlgoSelectStrategy strategy, uint32_t shared_batch_size,
bool binary_equal_between_batch) {
if (func_name == "set_network_algo_policy") {
return CALL_FUNC(set_network_algo_policy, strategy, shared_batch_size,
binary_equal_between_batch);
}
THROW_FUNC_ERROR(func_name);
}

template <>
inline void call_func<NetworkImplDft, void>(
std::string func_name, Network::NetworkImplBase* network_impl,
std::shared_ptr<Allocator> user_allocator) {
if (func_name == "set_memory_allocator") {
return CALL_FUNC(set_memory_allocator, user_allocator);
}
THROW_FUNC_ERROR(func_name);
}

template <>
inline void call_func<NetworkImplDft, void>(
std::string func_name, Network::NetworkImplBase* network_impl,
std::string file_name) {
if (func_name == "enable_io_txt_dump") {
return CALL_FUNC(enable_io_txt_dump, file_name);
} else if (func_name == "enable_io_bin_dump") {
return CALL_FUNC(enable_io_bin_dump, file_name);
}
THROW_FUNC_ERROR(func_name);
}

template <>
inline void call_func<NetworkImplDft, void>(
std::string func_name, Network::NetworkImplBase* network_impl,
Network::NetworkImplBase* src_network_impl) {
if (func_name == "share_runtime_memory_with") {
CALL_FUNC(share_runtime_memory_with, src_network_impl);
} else if (func_name == "shared_weight_with") {
CALL_FUNC(shared_weight_with, src_network_impl);
} else {
THROW_FUNC_ERROR(func_name);
}
}
#undef THROW_FUNC_ERROR

} // namespace lite
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 69
- 0
lite/src/mge/memory_allocator.h View File

@@ -0,0 +1,69 @@
/**
 * \file src/mge/memory_allocator.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once

#include "lite_build_config.h"

#if LITE_BUILD_WITH_MGE
#include "common.h"
#include "megbrain/dtype.h"
#include "network_impl.h"

#include "megbrain/graph/cg.h"

namespace lite {

class UserStaticMemAlloc final : public mgb::cg::DeviceMemoryAllocator {
std::shared_ptr<Allocator> m_allocator = nullptr;

public:
UserStaticMemAlloc(std::shared_ptr<Allocator> allocator)
: m_allocator(allocator) {}

void alloc_static(LComputingGraph*, LDeviceTensorStorage& dest,
size_t size) override {
if (size < dest.size()) {
return;
}
auto cn = dest.comp_node_allow_invalid();
LITE_ASSERT(cn.valid(), "The compnode is invalid when alloc memory.");
LiteDeviceType device_type =
get_device_from_locator(cn.locator_logical());
int device_id = cn.locator_logical().device;
auto ptr_alloc = static_cast<mgb::dt_byte*>(m_allocator->allocate(
device_type, device_id, size, cn.get_mem_addr_alignment()));
auto storage = std::shared_ptr<mgb::dt_byte>(
ptr_alloc,
[allocator = m_allocator, device_type, device_id](void* ptr) {
allocator->free(device_type, device_id, ptr);
});
dest.reset(cn, size, storage);
}
void alloc_dynamic(mgb::VarNode*, mgb::DeviceTensorStorage& dest,
size_t size) override {
alloc_static(nullptr, dest, size);
}

void defrag_prealloc_contig(mgb::ComputingGraph*, mgb::CompNode comp_node,
size_t size) override {
LiteDeviceType device_type =
get_device_from_locator(comp_node.locator_logical());
int device_id = comp_node.locator_logical().device;
auto ptr_tmp =
m_allocator->allocate(device_type, device_id, size,
comp_node.get_mem_addr_alignment());
m_allocator->free(device_type, device_id, ptr_tmp);
}
};

} // namespace lite
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 781
- 0
lite/src/mge/network_impl.cpp View File

@@ -0,0 +1,781 @@
/**
* \file src/mge/network_impl.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "lite_build_config.h"

#if LITE_BUILD_WITH_MGE
#include "network_impl.h"
#include "common.h"
#include "lite/network.h"
#include "memory_allocator.h"
#include "parse_model/model_parser.h"
#include "parse_info/parse_info_base.h"

#include "megbrain/common.h"
#include "megbrain/comp_node.h"
#include "megbrain/comp_node_env.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/graph.h"
#include "megbrain/graph/cg.h"
#include "megbrain/opr/io.h"
#include "megbrain/tensor.h"

#if MGB_OPENCL
#include "megcore_opencl.h"
#endif

#include <fstream>
#include <memory>
#include <set>

using namespace lite;
using namespace mgb;

LITE_DYN_TYPE_OBJ_FINAL_IMPL(NetworkImplDft);

void NetworkImplDft::set_config(const Config& config) {
m_user_config = std::make_unique<Config>();
*m_user_config = config;
m_load_config.comp_graph = mgb::ComputingGraph::make();
m_compnode_locator = to_compnode_locator(m_user_config->device_type);
m_compnode_locator.device = config.device_id;
}

void NetworkImplDft::shared_weight_with(const NetworkImplBase* src_network) {
application_config();
const auto& src_impl = src_network->cast_final_safe<NetworkImplDft>();
LITE_ASSERT(src_impl.m_loader,
"Clone network must after the network is loaded.");
m_load_result = src_impl.m_loader->load(m_load_config, true);

    //! flag whether the model is a cross-compnode model
cross_compnode_model_detect();

//! update the IO of the network
update_io();

//! replace the IO when there is device input or output
compile_graph();
}

void NetworkImplDft::application_config() {
auto device_type = m_user_config->device_type;
m_compnode_locator.type = to_compnode_locator(device_type).type;
m_compnode_locator.device = m_user_config->device_id;
if (m_nr_threads > 1 && device_type == LiteDeviceType::LITE_CPU) {
m_compnode_locator.type = mgb::CompNode::DeviceType::MULTITHREAD;
m_compnode_locator.device = m_user_config->device_id;
}
//! model options
#define ConfigOption(mge_name, lite_name) \
options.mge_name = m_user_config->options.lite_name;

auto&& options = m_load_config.comp_graph->options();
ConfigOption(graph_opt.weight_preprocess, weight_preprocess);
ConfigOption(graph_opt.fuse_preprocess, fuse_preprocess);
ConfigOption(fake_next_exec, fake_next_exec);
ConfigOption(var_sanity_check_first_run, var_sanity_check_first_run);
m_load_config.const_var_shape = m_user_config->options.const_shape;
ConfigOption(force_dynamic_alloc, force_dynamic_alloc);
ConfigOption(force_output_dynamic_alloc, force_output_dynamic_alloc);
ConfigOption(no_profiling_on_shape_change, no_profiling_on_shape_change);
LITE_ASSERT(m_user_config->options.jit_level == 0 ||
(m_user_config->options.jit_level > 0 &&
device_type == LiteDeviceType::LITE_CUDA),
"jit only support in cuda device.");
ConfigOption(graph_opt.jit, jit_level);
ConfigOption(comp_node_seq_record_level, comp_node_seq_record_level);
ConfigOption(graph_opt_level, graph_opt_level);
ConfigOption(async_exec_level, async_exec_level);

#undef ConfigOption
#define ConfigOptionLayoutTransform(name) \
if (m_user_config->options.name) { \
options.graph_opt.name(); \
}
ConfigOptionLayoutTransform(enable_nchw44);
ConfigOptionLayoutTransform(enable_nchw44_dot);
ConfigOptionLayoutTransform(enable_nchw88);
ConfigOptionLayoutTransform(enable_nhwcd4);
ConfigOptionLayoutTransform(enable_nchw4);
ConfigOptionLayoutTransform(enable_nchw32);
ConfigOptionLayoutTransform(enable_nchw64);
#undef ConfigOptionLayoutTransform
if (m_user_config->has_compression) {
m_load_config.tensor_value_loader = decompressed_tensor_value_loader;
}

    //! if the device is LITE_DEVICE_DEFAULT, the compnode information stored in the model is used
if (device_type != LiteDeviceType::LITE_DEVICE_DEFAULT) {
        //! currently do not set the Locator type, because an atlas mgb model
        //! is a cross-compnode graph
if (device_type == LiteDeviceType::LITE_ATLAS) {
m_load_config.comp_node_mapper =
[this](mgb::CompNode::Locator& loc) {
if (loc.type == mgb::CompNode::DeviceType::ATLAS) {
loc.device = m_compnode_locator.device;
loc.stream = m_compnode_locator.stream;
} else if (loc.type ==
mgb::CompNode::DeviceType::MULTITHREAD) {
loc.stream = m_nr_threads;
}
};
} else {
m_load_config.comp_node_mapper =
[this](mgb::CompNode::Locator& loc) {
loc = m_compnode_locator;
};
}
}
}

void NetworkImplDft::set_memory_allocator(
std::shared_ptr<Allocator> user_allocator) {
auto allocator = std::make_shared<UserStaticMemAlloc>(user_allocator);
LITE_ASSERT(m_load_config.comp_graph);
m_load_config.comp_graph->set_device_memory_allocator(allocator);
}

//! share the runtime memory with another network; the weights are not shared
void NetworkImplDft::share_runtime_memory_with(
Network::NetworkImplBase* network_impl) {
LITE_ASSERT(network_impl);
LITE_ASSERT(m_load_config.comp_graph);
m_load_config.comp_graph->share_device_memory_with(
*(network_impl->cast_final_safe<NetworkImplDft>()
.m_load_config.comp_graph));
}

void NetworkImplDft::set_cpu_inplace_mode() {
LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU,
"cpu inplace mode is only avaliable in CPU.");
m_is_cpu_inplace_mode = true;
if (m_compnode_locator.type == mgb::CompNode::DeviceType::CPU) {
m_compnode_locator.device = mgb::CompNode::Locator::DEVICE_CPU_DEFAULT;
} else {
LITE_ASSERT(
m_compnode_locator.type == CompNode::DeviceType::MULTITHREAD,
"cpu inplace mode is only avaliable in CPU.");
m_compnode_locator.device =
mgb::CompNode::Locator::DEVICE_MULTITHREAD_DEFAULT;
}
}

void NetworkImplDft::set_cpu_threads_number(size_t nr_threads) {
LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU,
"multi threads mode is only avaliable in CPU.");
if (nr_threads > 1) {
m_nr_threads = nr_threads;
m_compnode_locator.type = mgb::CompNode::DeviceType::MULTITHREAD;
m_compnode_locator.nr_threads = nr_threads;
}
}

void NetworkImplDft::set_runtime_thread_affinity(
const ThreadAffinityCallback& thread_affinity_callback) {
LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU,
"multi threads mode is only avaliable in CPU.");
mgb::CompNode::Locator loc;
m_load_config.comp_node_mapper(loc);
auto cn = mgb::CompNode::load(loc);
if (m_nr_threads > 1) {
mgb::CompNodeEnv::from_comp_node(cn).cpu_env().set_affinity(
thread_affinity_callback);
} else {
mgb::CompNodeEnv::from_comp_node(cn).cpu_env().dispatch(
[thread_affinity_callback](void) {
thread_affinity_callback(0);
});
}
}

void NetworkImplDft::set_device_id(int device_id) {
m_compnode_locator.device = device_id;
m_user_config->device_id = device_id;
}

void NetworkImplDft::set_stream_id(int stream_id) {
m_compnode_locator.stream = stream_id;
}

void NetworkImplDft::use_tensorrt() {
auto&& options = m_load_config.comp_graph->options();
options.graph_opt.tensorrt = true;
}

//! set the callback in async mode
void NetworkImplDft::set_async_callback(const AsyncCallback& callback) {
LITE_ASSERT(!m_is_cpu_inplace_mode,
"cpu inplace mode not support async mode");
LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU ||
m_user_config->device_type == LiteDeviceType::LITE_CUDA,
"Now only cpu and cuda>10.0 support async mode");
m_async = true;
m_async_callback = std::move(callback);
}

void NetworkImplDft::make_output_spec() {
m_output_spec.clear();
for (auto&& out : m_network_io->outputs) {
if (m_load_result.output_var_map.count(out.name)) {
auto&& load_out = m_load_result.output_var_map[out.name];
auto cb = [&out, this](const mgb::DeviceTensorND& dv) mutable {
mgb::CompNode comp_node = dv.comp_node();
if (out.io_type == LiteIOType::LITE_IO_SHAPE) {
auto mgb_layout = dv.layout();
out.lite_tensor->set_layout(to_lite_layout(mgb_layout));
} else {
TensorHelper::implement(out.lite_tensor)
->cast_final_safe<TensorImplDft>()
.copy_from_mge_tensor(dv);
out.lite_tensor->update_from_implement();
}
if (m_async) {
out.have_sync = true;
bool need_exec_cb = true;
for (auto&& j : m_network_io->outputs) {
if (!j.have_sync) {
need_exec_cb = false;
}
}
if (need_exec_cb) {
for (auto&& j : m_network_io->outputs) {
j.have_sync = false;
}
comp_node.add_callback([this]() { finish(); });
}
}
};
m_output_spec.emplace_back(load_out, std::move(cb));
} else {
LITE_THROW(ssprintf("no output named : %s in the mode",
out.name.c_str()));
}
}
}

void NetworkImplDft::replace_dev_input_pass() {
mgb::CompNode::Locator locator;
m_load_config.comp_node_mapper(locator);
    //! CPU does not need to use device input
if (locator.type == mgb::CompNode::DeviceType::CPU) {
return;
}
    //! replace the H2D copy with VolatileSharedDeviceTensor, and keep the dev
    //! tensor in m_network_io.input; the user can directly change the dev
    //! tensor storage through m_network_io.input.lite_tensor->reset() before
    //! forward
using DeviceTensorMap =
std::unordered_map<std::string,
std::shared_ptr<mgb::DeviceTensorND>>;
DeviceTensorMap name2dev_tensor;

mgb::ThinHashMap<mgb::HostTensorND*, mgb::SymbolVar> host_val2var;

//! construct host_val2var that maps from host tensor to corresponding var
auto on_opr = [&](mgb::cg::OperatorNodeBase* opr) {
if (opr->same_type<mgb::opr::Host2DeviceCopy>()) {
mgb::HostTensorND* tensor =
opr->cast_final<mgb::opr::Host2DeviceCopy>()
.host_data()
.get();
host_val2var[tensor] = opr->output(0);
}
};
mgb::cg::DepOprIter dep_iter{on_opr};
for (auto i : m_load_result.output_var_list) {
dep_iter.add(i.node()->owner_opr());
}

mgb::ThinHashMap<mgb::SymbolVar, mgb::SymbolVar> inp_var_map, out_var_map;

mgb::SmallVector<std::string> to_clear;
for (auto&& config_in : m_network_io->inputs) {
if (!config_in.is_host) {
auto host_val = m_load_result.tensor_map[config_in.name];
auto dev_val = TensorHelper::implement(config_in.lite_tensor)
->cast_final_safe<TensorImplDft>()
.m_dev_tensor;
auto dev_var = mgb::opr::VolatileSharedDeviceTensor::make(
*m_load_result.graph, dev_val, {config_in.name});
inp_var_map[host_val2var.at(host_val.get())] = dev_var;
name2dev_tensor[config_in.name] = dev_val;
}
}
auto new_ovar =
mgb::cg::replace_vars(m_load_result.output_var_list, inp_var_map);
for (size_t i = 0; i < new_ovar.size(); ++i) {
out_var_map[m_load_result.output_var_list[i]] = new_ovar[i];
}
for (auto&& i : m_load_result.output_var_map) {
i.second = out_var_map.at(i.second);
}
for (auto&& i : m_load_result.output_var_map_id) {
i.second = out_var_map.at(i.second);
}
for (size_t i = 0; i < m_load_result.output_var_list.size(); i++) {
new_ovar[i].rename(m_load_result.output_var_list[i].node()->name());
}
m_load_result.output_var_list = std::move(new_ovar);
}

void NetworkImplDft::cross_compnode_model_detect() {
mgb::ThinHashSet<LiteDeviceType> nr_used_device_type;
auto on_opr = [&](mgb::cg::OperatorNodeBase* opr) {
for (auto j : opr->output()) {
if (j->comp_node() != mgb::CompNode::default_cpu()) {
nr_used_device_type.insert(
get_device_from_locator(j->comp_node().locator()));
}
}
};
mgb::cg::DepOprIter dep_iter{on_opr};
for (auto i : m_load_result.output_var_list) {
dep_iter.add(i.node()->owner_opr());
}
m_nr_device_type = nr_used_device_type.size();
}

void NetworkImplDft::load_model(
std::shared_ptr<void> model_mem, size_t size,
std::unordered_map<std::string, LiteAny> separate_config_map) {
if (!m_loader) {
m_input_file = mgb::serialization::InputFile::make_mem_proxy(
model_mem, size, false);
auto format =
mgb::serialization::GraphLoader::identify_graph_dump_format(
*m_input_file);
if (!format.valid()) {
LITE_THROW("invalid model format");
}
m_loader = mgb::serialization::GraphLoader::make(
std::move(m_input_file), format.val());
}


    //! apply the user configuration to the mge model
application_config();

    //! configure some flags obtained from the json config file
if (separate_config_map.find("device_id") != separate_config_map.end()) {
set_device_id(separate_config_map["device_id"].unsafe_cast<int>());
}
if (separate_config_map.find("number_threads") !=
separate_config_map.end() &&
separate_config_map["number_threads"].unsafe_cast<size_t>() > 1) {
set_cpu_threads_number(
separate_config_map["number_threads"].unsafe_cast<size_t>());
}
if (separate_config_map.find("enable_inplace_model") !=
separate_config_map.end() &&
separate_config_map["enable_inplace_model"].unsafe_cast<bool>()) {
set_cpu_inplace_mode();
}
if (separate_config_map.find("use_tensorrt") != separate_config_map.end() &&
separate_config_map["use_tensorrt"].unsafe_cast<bool>()) {
use_tensorrt();
}
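    //! e.g. (illustrative only, the exact json layout is defined by the model
    //! info parser): a packed model whose info section carries
    //!     {"device_id": 0, "number_threads": 4,
    //!      "enable_inplace_model": false, "use_tensorrt": false}
    //! would reach this point with those keys filled in separate_config_map.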

m_load_result = m_loader->load(m_load_config, true);

cross_compnode_model_detect();

//! update the IO of the network
update_io();

//! replace the IO when there is device input or output
compile_graph();
}

void NetworkImplDft::compile_graph() {
modify_exection_policy();
replace_dev_input_pass();
make_output_spec();
m_execute_func = m_load_result.graph_compile(m_output_spec);
}

void NetworkImplDft::start() const {
if (m_start_callback) {
std::unordered_map<std::string, std::pair<IO, std::shared_ptr<Tensor>>>
input_io_map;
for (auto&& io_inner : m_network_io->inputs) {
input_io_map[io_inner.name] = {
IO{io_inner.name, io_inner.is_host, io_inner.io_type,
io_inner.config_layout},
io_inner.lite_tensor};
}
m_start_callback(input_io_map);
}
}

void NetworkImplDft::forward() {
start();
    LITE_ASSERT(m_execute_func,
                "forward must be called after the network is loaded.");
m_execute_func->execute();
}

void NetworkImplDft::wait() {
if (!m_async) {
m_execute_func->wait();
}
finish();
}

void NetworkImplDft::finish() const {
if (m_async) {
        LITE_ASSERT(m_async_callback,
                    "The callback function must be set in async mode.");
m_async_callback();
}
if (m_finish_callback) {
std::unordered_map<std::string, std::pair<IO, std::shared_ptr<Tensor>>>
output_io_map;
for (auto&& io_inner : m_network_io->outputs) {
output_io_map[io_inner.name] = {
IO{io_inner.name, io_inner.is_host, io_inner.io_type,
io_inner.config_layout},
io_inner.lite_tensor};
}
m_finish_callback(output_io_map);
}
output_plugin_result();
}

void NetworkImplDft::set_io(const NetworkIO& network_io) {
m_network_io = std::make_unique<NetworkIOInner>();
for (auto&& in : network_io.inputs) {
m_network_io->inputs.emplace_back(in);
}
for (auto&& out : network_io.outputs) {
m_network_io->outputs.emplace_back(out);
}
}

void NetworkImplDft::update_io() {
update_input();
update_output();
}

void NetworkImplDft::update_input() {
auto device_type = m_user_config->device_type;
auto device_id = m_compnode_locator.device;
auto stream_id = m_compnode_locator.stream;
    //! if device is CPU, all inputs and outputs are host tensors
if (device_type == LiteDeviceType::LITE_CPU) {
for (auto&& in : m_network_io->inputs) {
in.is_host = true;
}
}
    //! for a cross-compnode model, modify the device input if it is not valid
if (m_nr_device_type > 1) {
for (auto&& in_tensor_iter : m_load_result.tensor_map) {
for (auto&& config_in : m_network_io->inputs) {
//! if tensor is set to device input
if (in_tensor_iter.first == config_in.name &&
!config_in.is_host) {
                    //! if the original compnode of the tensor is not on the
                    //! device, set the input to host
if (get_device_from_locator(
in_tensor_iter.second->comp_node().locator()) ==
LiteDeviceType::LITE_CPU) {
config_in.is_host = true;
                        LITE_WARN(
                                "The input tensor %s of the cross device model "
                                "should not come from device.",
                                config_in.name.c_str());
}
}
}
}
}
for (auto&& in_tensor_iter : m_load_result.tensor_map) {
bool found = false;
for (auto&& config_in : m_network_io->inputs) {
if (in_tensor_iter.first == config_in.name) {
found = true;
if (config_in.is_host) {
config_in.lite_tensor = std::make_shared<Tensor>(
device_id, stream_id, device_type, true);
TensorHelper::implement(config_in.lite_tensor)
->cast_final_safe<TensorImplDft>()
.m_host_tensor = in_tensor_iter.second;
config_in.lite_tensor->update_from_implement();
} else {
config_in.lite_tensor = std::make_shared<Tensor>(
device_id, stream_id, device_type);
config_in.lite_tensor->set_layout(
to_lite_layout(in_tensor_iter.second->layout()));
}
if (config_in.config_layout.ndim &&
!(config_in.config_layout ==
config_in.lite_tensor->get_layout())) {
config_in.lite_tensor->set_layout(config_in.config_layout);
}
}
}
if (!found) {
IOInner io_in;
io_in.name = in_tensor_iter.first;
io_in.lite_tensor = std::make_shared<Tensor>(device_id, stream_id,
device_type, true);
TensorHelper::implement(io_in.lite_tensor)
->cast_final_safe<TensorImplDft>()
.m_host_tensor = in_tensor_iter.second;
io_in.lite_tensor->update_from_implement();
m_network_io->inputs.push_back(io_in);
}
}
    //! delete the inputs that are not in the network
for (auto it = m_network_io->inputs.begin();
it != m_network_io->inputs.end();) {
if (it->lite_tensor == nullptr) {
            LITE_LOG("%s is not a network input, ignore it.",
it->name.c_str());
it = m_network_io->inputs.erase(it);
} else {
it++;
}
}
}

void NetworkImplDft::update_output() {
auto device_type = m_user_config->device_type;
auto device_id = m_compnode_locator.device;
auto stream_id = m_compnode_locator.stream;
if (device_type == LiteDeviceType::LITE_CPU) {
for (auto&& out : m_network_io->outputs) {
out.is_host = true;
}
}
    //! delete the outputs that are not in the network
for (auto out_it = m_network_io->outputs.begin();
out_it != m_network_io->outputs.end();) {
if (std::find_if(m_load_result.output_var_list.begin(),
m_load_result.output_var_list.end(),
[out_it](const mgb::SymbolVar var) {
return var.node()->name() == out_it->name;
}) == m_load_result.output_var_list.end()) {
            LITE_LOG("%s is not a network output, ignore it.",
out_it->name.c_str());
out_it = m_network_io->outputs.erase(out_it);
} else {
out_it++;
}
}
    //! the user configured the outputs, so only compute the configured outputs
if (m_compute_configured_output_only) {
        LITE_ASSERT(m_network_io->outputs.size() > 0,
                    "compute_only_configured_output is set, but no output is "
                    "configured.");
for (auto out_it = m_network_io->outputs.begin();
out_it != m_network_io->outputs.end(); out_it++) {
            //! use pinned memory to copy from device
if (out_it->is_host) {
out_it->lite_tensor = std::make_shared<Tensor>(
device_id, stream_id, device_type, true);
} else {
out_it->lite_tensor = std::make_shared<Tensor>(
device_id, stream_id, device_type);
}
}
        //! the user did not configure outputs, use the default network outputs
} else {
for (auto&& out : m_load_result.output_var_list) {
auto it = std::find_if(m_network_io->outputs.begin(),
m_network_io->outputs.end(),
[&out](const IOInner io) {
return io.name == out.node()->name();
});
if (it != m_network_io->outputs.end()) {
if (it->is_host) {
it->lite_tensor = std::make_shared<Tensor>(
device_id, stream_id, device_type, true);
} else {
it->lite_tensor = std::make_shared<Tensor>(
device_id, stream_id, device_type);
}
} else {
IOInner output;
output.name = out.node()->name();
output.lite_tensor = std::make_shared<Tensor>(
device_id, stream_id, device_type, true);
m_network_io->outputs.push_back({output});
}
}
}
}

std::shared_ptr<Tensor> NetworkImplDft::get_io_tensor(std::string io_name,
LiteTensorPhase phase) {
if (phase == LiteTensorPhase::LITE_INPUT ||
phase == LiteTensorPhase::LITE_IO) {
for (auto&& config_in : m_network_io->inputs) {
if (io_name == config_in.name) {
return config_in.lite_tensor;
}
}
}
if (phase == LiteTensorPhase::LITE_OUTPUT ||
phase == LiteTensorPhase::LITE_IO) {
for (auto&& config_out : m_network_io->outputs) {
if (io_name == config_out.name) {
config_out.lite_tensor->update_from_implement();
return config_out.lite_tensor;
}
}
}
    LITE_THROW(mgb::ssprintf(
            "can not find io tensor named %s: the name must be an input "
            "tensor name or a registered output tensor name. If NetworkIO is "
            "not set, all the network outputs are available; otherwise only "
            "the registered output tensors are.",
            io_name.c_str()));
return nullptr;
}

std::shared_ptr<Tensor> NetworkImplDft::get_input_tensor(size_t index) {
return get_io_tensor(get_input_name(index));
}

std::shared_ptr<Tensor> NetworkImplDft::get_output_tensor(size_t index) {
return get_io_tensor(get_output_name(index));
}

//! set opr algorithm selection strategy in the network
void NetworkImplDft::set_network_algo_policy(LiteAlgoSelectStrategy strategy,
uint32_t shared_batch_size,
bool binary_equal_between_batch) {
using S = megdnn::param::ExecutionPolicy::Strategy;
auto dst_strategy = static_cast<S>(0);
if (static_cast<uint32_t>(strategy) &
LiteAlgoSelectStrategy::LITE_ALGO_HEURISTIC) {
dst_strategy = dst_strategy | S::HEURISTIC;
}
if (static_cast<uint32_t>(strategy) &
LiteAlgoSelectStrategy::LITE_ALGO_PROFILE) {
dst_strategy = dst_strategy | S::PROFILE;
}
if (static_cast<uint32_t>(strategy) &
LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE) {
dst_strategy = dst_strategy | S::REPRODUCIBLE;
}
if (static_cast<uint32_t>(strategy) &
LiteAlgoSelectStrategy::LITE_ALGO_OPTIMIZED) {
dst_strategy = dst_strategy | S::OPTIMIZED;
}
m_execution_policy = dst_strategy;

auto&& fast_run_config =
m_load_config.comp_graph->options().fast_run_config;
fast_run_config.binary_equal_between_batch = binary_equal_between_batch;
fast_run_config.shared_batch_size = shared_batch_size;

if (m_execute_func) {
        LITE_WARN(
                "set_network_algo_policy may cause errors when called after "
                "the network is loaded.");
modify_exection_policy();
}
}

void NetworkImplDft::modify_exection_policy() {
mgb::SymbolVarArray vars;
for (auto i : m_output_spec) {
vars.push_back(i.first);
}
if (static_cast<uint32_t>(m_execution_policy) != 0)
mgb::gopt::modify_opr_algo_strategy_inplace(vars, m_execution_policy);
}

//! set the opr algorithm workspace limit in the network
void NetworkImplDft::set_network_algo_workspace_limit(size_t workspace_limit) {
mgb::SymbolVarArray vars;
for (auto i : m_output_spec) {
vars.push_back(i.first);
}
mgb::gopt::set_opr_algo_workspace_limit_inplace(vars, workspace_limit);
}

//! get all the output tensor names in the order of graph
std::vector<const char*> NetworkImplDft::get_all_output_name() const {
std::vector<const char*> output_names;
for (auto& output : m_network_io->outputs) {
output_names.push_back(output.name.c_str());
}
return output_names;
}

//! get all the input tensor names in the order of graph
std::vector<const char*> NetworkImplDft::get_all_input_name() const {
std::vector<const char*> input_names;
for (auto& input : m_load_result.tensor_map) {
input_names.push_back(input.first.c_str());
}
return input_names;
}

//! get the output tensor name in the order of graph
const char* NetworkImplDft::get_output_name(size_t index) const {
    LITE_ASSERT(
            index < m_load_result.output_var_list.size(),
            "The output tensor index is larger than the number of outputs.");
return m_load_result.output_var_list[index].node()->name().c_str();
}

//! get the input tensor name in the order of graph
const char* NetworkImplDft::get_input_name(size_t index) const {
    LITE_ASSERT(
            index < m_load_result.tensor_map.size(),
            "The input tensor index is larger than the number of inputs.");
size_t i = 0;
for (auto& input : m_load_result.tensor_map) {
if (i == index) {
return input.first.c_str();
}
i++;
}
LITE_THROW(ssprintf("no input tensor of index %zu.", index));
}

//! Plugin part
void NetworkImplDft::enable_profile_performance(std::string profile_json_file) {
#if MGB_ENABLE_JSON
#if MGB_OPENCL
mgb::CompNode::enable_opencl_profile(true);
#endif
m_profiler = std::make_unique<mgb::GraphProfiler>(
m_load_config.comp_graph.get());
m_profiler_output_file = profile_json_file;
#else
LITE_MARK_USED_VAR(profile_json_file);
    LITE_THROW("JSON is disabled at compile time.");
#endif
}

void NetworkImplDft::enable_io_txt_dump(std::string io_txt_out_file) {
auto iodump = std::make_unique<mgb::TextOprIODump>(
m_load_config.comp_graph.get(), io_txt_out_file.c_str());
iodump->print_addr(false);
m_iodump = std::move(iodump);
}

void NetworkImplDft::enable_io_bin_dump(std::string io_bin_out_dir) {
m_iodump = std::make_unique<mgb::BinaryOprIODump>(
m_load_config.comp_graph.get(), io_bin_out_dir.c_str());
}

void inline NetworkImplDft::output_plugin_result() const {
#if MGB_ENABLE_JSON
if (m_profiler && m_execute_func) {
m_profiler->to_json_full(m_execute_func.get())
->writeto_fpath(m_profiler_output_file);
}
#endif
}
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 242
- 0
lite/src/mge/network_impl.h View File

@@ -0,0 +1,242 @@
/**
* \file src/mge/network_impl.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once

#include "lite_build_config.h"

#if LITE_BUILD_WITH_MGE
#include "lite/network.h"
#include "network_impl_base.h"
#include "tensor_impl.h"

#include "megbrain/graph/bases.h"
#include "megbrain/plugin/opr_io_dump.h"
#include "megbrain/plugin/profiler.h"
#include "megbrain/serialization/extern_c_opr.h"
#include "megbrain/serialization/file.h"
#include "megbrain/serialization/load_dump_config.h"
#include "megbrain/serialization/serializer.h"
#include "megbrain/utils/thin/hash_table.h"

#include <memory>
#include <unordered_map>

namespace lite {

/*!
 * \brief implement the Network, containing the mgb related members
*/
class NetworkImplDft final : public Network::NetworkImplBase {
LITE_DYN_TYPE_OBJ_FINAL_DECL;

public:
using S = megdnn::param::ExecutionPolicy::Strategy;
//! set the config of the network, include:
//! the inference device
//! the other inference options, such as record_level, weight_preprocess...
void set_config(const Config& config) override;

    //! set the special io information, if not set, the default io tensors will
    //! be used, this is mainly for the case where the input/output is not a
    //! host tensor, by default the input/output tensors are host tensors
void set_io(const NetworkIO& network_io) override;

    //! only compute the output tensors configured by the user
void compute_only_configured_output() override {
m_compute_configured_output_only = true;
}

    //! get the network input and output tensor, the layout of which is
    //! synced from the mge tensor
std::shared_ptr<Tensor> get_io_tensor(
std::string io_name,
LiteTensorPhase phase = LiteTensorPhase::LITE_IO) override;

//! get the input tensor by index in the load_result tensormap
std::shared_ptr<Tensor> get_input_tensor(size_t index) override;

//! get the output tensor by index in the load_result output_var_list
std::shared_ptr<Tensor> get_output_tensor(size_t index) override;

//! get all the input tensor name in the order in load return
std::vector<const char*> get_all_input_name() const override;

//! get all the output tensor name in the order in load return
std::vector<const char*> get_all_output_name() const override;

//! get the input tensor name in the order in load return
const char* get_input_name(size_t index) const override;

//! get the output tensor name in the order in load return
const char* get_output_name(size_t index) const override;

    //! set the callback in async mode
void set_async_callback(const AsyncCallback& callback) override;

//! set the start callback which will execute before network forward
void set_start_callback(const StartCallback& callback) override {
m_start_callback = std::move(callback);
}

//! set the finish callback which will execute after network forward
void set_finish_callback(const FinishCallback& callback) override {
m_finish_callback = std::move(callback);
}
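
    //! A minimal usage sketch of the three callbacks above, written against the
    //! public Network wrapper (the `network` object and the printf output are
    //! only assumptions of this example):
    //!     network->set_start_callback([](const auto& inputs) {
    //!         printf("start forward with %zu inputs\n", inputs.size());
    //!     });
    //!     network->set_finish_callback([](const auto& outputs) {
    //!         printf("finish forward with %zu outputs\n", outputs.size());
    //!     });
    //!     network->set_async_callback([]() { printf("inference done\n"); });
    //!     network->forward();  // the async callback fires when all outputs are ready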

//! load the model and get the m_load_result
void load_model(std::shared_ptr<void> model_mem, size_t size,
std::unordered_map<std::string, LiteAny>
separate_config_map = {}) override;

//! forward the network with filled input data and fill the output data
//! to the output tensor
void forward() override;

    //! in sync mode, wait until the inference finishes
void wait() override;

virtual LiteDeviceType get_device_type() const override {
return m_user_config->device_type;
}

    //! Set cpu inplace mode when device is CPU, on some low-computation
    //! or single-core devices this mode will get better performance
void set_cpu_inplace_mode();
bool is_cpu_inplace_mode() const { return m_is_cpu_inplace_mode; }

    //! When device is CPU, this interface sets the model to be loaded to
    //! run in multi-thread mode with the given thread number.
void set_cpu_threads_number(size_t nr_threads);
size_t get_cpu_threads_number() const { return m_nr_threads; }
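
    //! Illustrative sketch via the public Runtime helper (must be called before
    //! the model is loaded; the default constructor and the model path are only
    //! assumptions of this example):
    //!     auto network = std::make_shared<Network>();
    //!     Runtime::set_cpu_threads_number(network, 4);
    //!     network->load_model("./model.mge");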

//! set device id, default device id = 0
void set_device_id(int device_id) override;
int get_device_id() const override { return m_compnode_locator.device; };

LiteBackend get_backend_type() const override {
return LiteBackend::LITE_DEFAULT;
}
//! set stream id, default stream id = 0
void set_stream_id(int stream_id) override;
int get_stream_id() const override { return m_compnode_locator.stream; };

//! enable tensorrt
void use_tensorrt();

//! enable profile the network, a JSON format file will be generated
void enable_profile_performance(
std::string profile_json_file_path) override;

/********************** mge special function ************************/
//! load a new network which will share weights with src network
void shared_weight_with(const NetworkImplBase* src_network);

    //! share the runtime memory with another network, the weights are not shared
void share_runtime_memory_with(NetworkImplBase* network);
    //! set the thread affinity callback
void set_runtime_thread_affinity(
const ThreadAffinityCallback& thread_affinity_callback);

    //! set the network memory allocator, the allocator is defined by the user
void set_memory_allocator(std::shared_ptr<Allocator> user_allocator);

//! set opr algorithm selection strategy in the network
void set_network_algo_policy(LiteAlgoSelectStrategy strategy,
uint32_t shared_batch_size,
bool binary_equal_between_batch);
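
    //! Illustrative sketch (the strategy values are bit flags, as the
    //! implementation suggests; the numbers below are arbitrary examples):
    //!     set_network_algo_policy(LiteAlgoSelectStrategy::LITE_ALGO_PROFILE,
    //!                             1 /*shared_batch_size*/,
    //!                             false /*binary_equal_between_batch*/);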

    //! set workspace_limit for oprs with multiple algorithms, setting a
    //! workspace limitation can save memory but may influence the performance
void set_network_algo_workspace_limit(size_t workspace_limit);

//! Dump input/output values of all internal variables to output file,
//! in text format
void enable_io_txt_dump(std::string io_txt_out_file);

//! Dump input/output values of all internal variables to output
//! directory, in binary format
void enable_io_bin_dump(std::string io_bin_out_dir);

private:
    //! construct the output spec according to m_network_io, and set the
    //! callback into the output spec
void make_output_spec();

//! modify the execution policy
void modify_exection_policy();

    //! if the input is a dev tensor, the pass will replace the H2D Opr with
    //! the VolatileSharedDeviceTensor Opr
void replace_dev_input_pass();

//! check whether the model is cross compnode
void cross_compnode_model_detect();

    //! when the model has been loaded, update the IO; if NetworkIO is not set,
    //! update it with the IO of the loaded model
void update_io();

void update_input();
void update_output();

    //! when the model info has been loaded, update the config according to the
    //! model info, which is finally used in the compute graph
void application_config();

    //! after the network finishes forwarding, output the plugin results to file
void output_plugin_result() const;

    //! this function will be called when the network finishes forwarding
void finish() const;

    //! this function will be called before the network forwards
void start() const;

//! compile the graph to get the execute function
void compile_graph();

private:
bool m_async = false;
bool m_is_cpu_inplace_mode = false;
int m_nr_device_type = 0;
size_t m_nr_threads = 1;
bool m_compute_configured_output_only = false;
mgb::CompNode::Locator m_compnode_locator;

AsyncCallback m_async_callback = nullptr;
std::unique_ptr<NetworkIOInner> m_network_io;
std::unique_ptr<Config> m_user_config;
std::unique_ptr<mgb::cg::AsyncExecutable> m_execute_func;

//! The model load related data
S m_execution_policy = static_cast<S>(0);
std::unique_ptr<mgb::serialization::InputFile> m_input_file;
mgb::serialization::GraphLoadConfig m_load_config;
mgb::serialization::GraphLoader::LoadResult m_load_result;
mgb::ComputingGraph::OutputSpec m_output_spec;
std::shared_ptr<mgb::serialization::GraphLoader> m_loader;

//! start and finish callback
StartCallback m_start_callback = nullptr;
FinishCallback m_finish_callback = nullptr;

//! profile and io dump related data
#if MGB_ENABLE_JSON
std::unique_ptr<mgb::GraphProfiler> m_profiler;
std::string m_profiler_output_file;
#endif
std::unique_ptr<mgb::OprIODumpBase> m_iodump;
};

} // namespace lite

#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 435
- 0
lite/src/mge/tensor_impl.cpp View File

@@ -0,0 +1,435 @@
/**
 * \file src/mge/tensor_impl.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "lite_build_config.h"

#if LITE_BUILD_WITH_MGE
#include "tensor_impl.h"
#include "common.h"

#include "lite/tensor.h"

#include "megbrain/comp_node.h"
#include "megbrain/tensor.h"

#include <memory>

using namespace lite;

/**********************TensorImpl****************************/

LITE_DYN_TYPE_OBJ_FINAL_IMPL(TensorImplDft);

TensorImplDft::TensorImplDft() {
m_host_tensor =
std::make_shared<mgb::HostTensorND>(mgb::CompNode::default_cpu());
}

TensorImplDft::TensorImplDft(LiteDeviceType device, bool is_pinned_host) {
auto cn = mgb::CompNode::load(to_compnode_locator(device));
if (device == LiteDeviceType::LITE_DEVICE_DEFAULT) {
device = LiteDeviceType::LITE_CPU;
}
if (device == LiteDeviceType::LITE_CPU) {
m_host_tensor = std::make_shared<mgb::HostTensorND>(
mgb::CompNode::default_cpu());
} else if (is_pinned_host) {
m_host_tensor = std::make_shared<mgb::HostTensorND>(cn);
} else {
m_dev_tensor = std::make_shared<mgb::DeviceTensorND>(cn);
}
}

TensorImplDft::TensorImplDft(LiteDeviceType device, const Layout& layout,
bool is_pinned_host) {
auto cn = mgb::CompNode::load(to_compnode_locator(device));
auto mge_layout = to_impl_layout(layout);
if (device == LiteDeviceType::LITE_DEVICE_DEFAULT) {
device = LiteDeviceType::LITE_CPU;
}
if (device == LiteDeviceType::LITE_CPU) {
m_host_tensor = std::make_shared<mgb::HostTensorND>(
mgb::CompNode::default_cpu(), mge_layout);
} else if (is_pinned_host) {
m_host_tensor = std::make_shared<mgb::HostTensorND>(cn, mge_layout);
} else {
m_dev_tensor = std::make_shared<mgb::DeviceTensorND>(cn, mge_layout);
}
}

TensorImplDft::TensorImplDft(int device_id, LiteDeviceType device_type,
const Layout& layout, bool is_pinned_host) {
auto locator = to_compnode_locator(device_type);
locator.device = device_id;
auto cn = mgb::CompNode::load(locator);
if (device_type == LiteDeviceType::LITE_DEVICE_DEFAULT) {
device_type = LiteDeviceType::LITE_CPU;
}
if (layout.ndim) {
auto mge_layout = to_impl_layout(layout);
if (device_type == LiteDeviceType::LITE_CPU) {
m_host_tensor = std::make_shared<mgb::HostTensorND>(
mgb::CompNode::default_cpu(), mge_layout);
} else if (is_pinned_host) {
m_host_tensor = std::make_shared<mgb::HostTensorND>(cn, mge_layout);
} else {
m_dev_tensor =
std::make_shared<mgb::DeviceTensorND>(cn, mge_layout);
}
} else {
if (device_type == LiteDeviceType::LITE_CPU) {
m_host_tensor = std::make_shared<mgb::HostTensorND>(
mgb::CompNode::default_cpu());
} else if (is_pinned_host) {
m_host_tensor = std::make_shared<mgb::HostTensorND>(cn);
} else {
m_dev_tensor = std::make_shared<mgb::DeviceTensorND>(cn);
}
}
}

TensorImplDft::TensorImplDft(int device_id, int stream_id,
LiteDeviceType device_type, bool is_pinned_host) {
auto locator = to_compnode_locator(device_type);
locator.device = device_id;
locator.stream = stream_id;
auto cn = mgb::CompNode::load(locator);
if (get_device_from_locator(locator) == LiteDeviceType::LITE_CPU) {
m_host_tensor = std::make_shared<mgb::HostTensorND>(
mgb::CompNode::default_cpu());
} else if (is_pinned_host) {
m_host_tensor = std::make_shared<mgb::HostTensorND>(cn);
} else {
m_dev_tensor = std::make_shared<mgb::DeviceTensorND>(cn);
}
}

LiteDeviceType TensorImplDft::get_device_type() const {
if (is_host()) {
return LiteDeviceType::LITE_CPU;
} else {
return get_device_from_locator(m_dev_tensor->comp_node().locator());
}
}

int TensorImplDft::get_device_id() const {
if (is_host()) {
return m_host_tensor->comp_node().locator().device;
} else {
return m_dev_tensor->comp_node().locator().device;
}
}

bool TensorImplDft::is_pinned_host() const {
return is_host() &&
get_device_from_locator(m_host_tensor->comp_node().locator()) !=
LiteDeviceType::LITE_CPU;
}

void TensorImplDft::set_mge_tensor_compnode(const mgb::CompNode& comp_node) {
if (is_host()) {
m_host_tensor->comp_node(comp_node, true);
} else {
m_dev_tensor->comp_node(comp_node, true);
}
}

Layout TensorImplDft::get_layout() const {
if (is_host()) {
return to_lite_layout(m_host_tensor->layout());
} else {
return to_lite_layout(m_dev_tensor->layout());
}
}

void* TensorImplDft::get_memory_ptr() const {
if (is_host()) {
return static_cast<void*>(m_host_tensor->raw_ptr());
} else {
return static_cast<void*>(m_dev_tensor->raw_ptr());
}
}

void* TensorImplDft::get_memory_ptr(const std::vector<size_t>& idx) const {
if (is_host()) {
auto elemsize_log = m_host_tensor->layout().dtype.size_log();
switch (elemsize_log) {
case 0:
return static_cast<void*>(
m_host_tensor->ptr<uint8_t>(idx.begin(), idx.end()));
break;
case 1:
return static_cast<void*>(
m_host_tensor->ptr<short>(idx.begin(), idx.end()));
break;
case 2:
return static_cast<void*>(
m_host_tensor->ptr<float>(idx.begin(), idx.end()));
break;
default:
LITE_THROW("not supported data_type.");
}
} else {
auto elemsize_log = m_dev_tensor->layout().dtype.size_log();
switch (elemsize_log) {
case 0:
return static_cast<void*>(
m_dev_tensor->ptr<uint8_t>(idx.begin(), idx.end()));
break;
case 1:
return static_cast<void*>(
m_dev_tensor->ptr<short>(idx.begin(), idx.end()));
break;
case 2:
return static_cast<void*>(
m_dev_tensor->ptr<float>(idx.begin(), idx.end()));
break;
default:
LITE_THROW("not supported data_type.");
}
}
}

std::shared_ptr<Tensor> TensorImplDft::slice(
const std::vector<size_t>& start, const std::vector<size_t>& end,
const std::vector<size_t>& step) {
Layout layout;
mgb::TensorLayout layout_mge;
if (is_host()) {
layout_mge = m_host_tensor->layout();
layout = to_lite_layout(m_host_tensor->layout());
} else {
layout_mge = m_dev_tensor->layout();
layout = to_lite_layout(m_dev_tensor->layout());
}

size_t length = start.size();
    LITE_ASSERT(length == end.size() && length <= layout.ndim,
                "The start and end must have the same size, which must not "
                "exceed the layout ndim.");
std::vector<mgb::Slice> slices;
if (step.size()) {
        LITE_ASSERT(length == step.size(),
                    "The start and step must have the same size.");
for (size_t i = 0; i < length; i++) {
slices.push_back(mgb::Slice{start[i], end[i], step[i]});
}
} else {
for (size_t i = 0; i < length; i++) {
slices.push_back(mgb::Slice{start[i], end[i]});
}
}
auto subspec = mgb::SubTensorSpec::make_from_offset_elem(layout_mge, 0);
size_t axis = 0;
for (auto&& i : slices) {
subspec.merge_with(i.apply(subspec.layout(), axis));
axis++;
}
auto ret = std::make_shared<Tensor>();
auto& impl = TensorHelper::implement(ret)->cast_final_safe<TensorImplDft>();
if (is_host()) {
*impl.m_host_tensor = m_host_tensor->sub(subspec);
} else {
impl.m_dev_tensor = std::make_shared<mgb::DeviceTensorND>(
m_dev_tensor->sub(subspec));
impl.m_host_tensor = nullptr;
}
LITE_ASSERT(is_host() == impl.is_host());
return ret;
}

void TensorImplDft::fill_zero() {
if (is_host()) {
auto mge_layout = m_host_tensor->layout();
if (m_host_tensor->layout().is_physical_contiguous()) {
auto ptr = get_memory_ptr();
std::memset(ptr, 0,
mge_layout.dtype.size(mge_layout.total_nr_elems()));
} else {
TensorImplDft tmp(LiteDeviceType::LITE_CPU,
to_lite_layout(mge_layout), true);
tmp.fill_zero();
this->copy_from(&tmp);
}
} else {
mgb::dev_tensor_memset(*m_dev_tensor, 0);
m_dev_tensor->sync();
}
}

void TensorImplDft::share_memory_with(const TensorImplBase* src_tensor_impl) {
auto src_dft_tensor = static_cast<const TensorImplDft*>(src_tensor_impl);
LITE_ASSERT(is_host() == src_dft_tensor->is_host(),
"share memory must happen in same device");
    //! make sure the src memory is ready
src_tensor_impl->get_memory_ptr();
if (is_host()) {
*m_host_tensor = *src_dft_tensor->m_host_tensor;
} else {
*m_dev_tensor = *src_dft_tensor->m_dev_tensor;
}
}

void TensorImplDft::set_layout(const Layout& layout) {
bool host = is_host();
auto mgb_layout = to_impl_layout(layout);
if (host) {
m_host_tensor->dtype(mgb_layout.dtype);
m_host_tensor->resize(mgb_layout);
} else {
m_dev_tensor->dtype(mgb_layout.dtype);
m_dev_tensor->resize(mgb_layout);
}
}

void TensorImplDft::reshape(const Layout& layout) {
auto mgb_layout = to_impl_layout(layout);
bool host = is_host();
if (host) {
m_host_tensor->resize(mgb_layout);
} else {
m_dev_tensor->resize(mgb_layout);
}
}

void TensorImplDft::reset(void* prepared_data) {
auto raw_ptr = static_cast<mgb::dt_byte*>(prepared_data);
auto raw_storage = std::shared_ptr<mgb::dt_byte>(raw_ptr, [](void*) {});
bool host = is_host();
if (host) {
auto cn = m_host_tensor->comp_node();
auto mge_layout = m_host_tensor->layout();
size_t size = mge_layout.span().dist_byte();
mgb::HostTensorStorage storage;
storage.reset(cn, size, raw_storage);
m_host_tensor->reset(storage, mge_layout);
} else {
auto cn = m_dev_tensor->comp_node();
auto mge_layout = m_dev_tensor->layout();
size_t size = mge_layout.span().dist_byte();
mgb::DeviceTensorStorage storage;
storage.reset(cn, size, raw_storage);
m_dev_tensor->reset(storage, mge_layout);
}
}

void TensorImplDft::reset(void* prepared_data, const Layout& layout) {
set_layout(layout);
reset(prepared_data);
}

bool TensorImplDft::is_continue_memory() const {
if (is_host()) {
return m_host_tensor->layout().is_physical_contiguous();
} else {
return m_dev_tensor->layout().is_physical_contiguous();
}
}

void TensorImplDft::copy_from(const TensorImplBase* src_impl) {
if (is_continue_memory()) {
copy_from_continue(src_impl);
} else {
copy_from_fixlayout(src_impl);
}
}

void TensorImplDft::copy_from_continue(const TensorImplBase* src_impl) {
auto src = static_cast<const TensorImplDft*>(src_impl);
if (is_host()) {
//! host to host
if (src->is_host()) {
m_host_tensor->copy_from(*src->m_host_tensor);
//! device to host
} else {
auto src_cn = src->m_dev_tensor->comp_node();
auto dst_cn = m_host_tensor->comp_node();
if (src_cn != dst_cn && m_host_tensor->layout().ndim > 0) {
                LITE_WARN(
                        "The dst tensor memory is allocated before copying, "
                        "so pinned memory will not be used to optimize the "
                        "copy performance.");
//! When D2H in megbrain and the compnode of src and dst is not
//! equal, there must be one compnode that is cpu-default, so
//! here, we use temp tensor for transition
auto tmp_impl = std::make_shared<TensorImplDft>();
tmp_impl->set_mge_tensor_compnode(src_cn);
tmp_impl->m_host_tensor->copy_from(*src->m_dev_tensor).sync();
m_host_tensor->copy_from(*tmp_impl->m_host_tensor);
} else {
                //! if dst compnode is not valid (memory is not allocated), the
                //! tensor is a pinned host tensor
m_host_tensor->comp_node(src_cn, true);
m_host_tensor->copy_from(*src->m_dev_tensor).sync();
}
}
} else {
//! host to device
if (src->is_host()) {
m_dev_tensor->copy_from(*src->m_host_tensor).sync();
//! device to device
} else {
m_dev_tensor->copy_from(*src->m_dev_tensor).sync();
}
}
}

void TensorImplDft::copy_from_fixlayout(const TensorImplBase* src_impl) {
auto src = static_cast<const TensorImplDft*>(src_impl);
if (is_host()) {
//! host to host
if (src->is_host()) {
m_host_tensor->copy_from_fixlayout(*src->m_host_tensor);
//! device to host
} else {
auto src_cn = src->m_dev_tensor->comp_node();
auto dst_cn = m_host_tensor->comp_node();
if (src_cn != dst_cn && m_host_tensor->layout().ndim > 0) {
                LITE_WARN(
                        "The dst tensor memory is allocated before copying, "
                        "so pinned memory will not be used to optimize the "
                        "copy performance.");
//! When D2H in megbrain and the compnode of src and dst is not
//! equal, there must be one compnode that is cpu-default, so
//! here, we use temp tensor for transition
auto tmp_impl = std::make_shared<TensorImplDft>();
tmp_impl->set_mge_tensor_compnode(src_cn);
tmp_impl->m_host_tensor->copy_from(*src->m_dev_tensor).sync();
m_host_tensor->copy_from_fixlayout(*tmp_impl->m_host_tensor);
} else {
                //! if dst compnode is not valid (memory is not allocated), the
                //! tensor is a pinned host tensor
m_host_tensor->comp_node(src_cn, true);
m_host_tensor->copy_from_fixlayout(*src->m_dev_tensor).sync();
}
}
} else {
//! host to device
if (src->is_host()) {
m_dev_tensor->copy_from_fixlayout(*src->m_host_tensor).sync();
//! device to device
} else {
m_dev_tensor->copy_from_fixlayout(*src->m_dev_tensor).sync();
}
}
}

void TensorImplDft::copy_from_mge_tensor(const mgb::DeviceTensorND& dv) {
if (is_host()) {
auto src_cn = dv.comp_node();
m_host_tensor->comp_node(src_cn, true);
m_host_tensor->copy_from(dv);
} else {
m_dev_tensor->copy_from(dv);
}
}

#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 128
- 0
lite/src/mge/tensor_impl.h View File

@@ -0,0 +1,128 @@
/**
* \file src/mge/tensor_impl.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once

#include "lite_build_config.h"

#if LITE_BUILD_WITH_MGE
#include "lite/tensor.h"
#include "tensor_impl_base.h"

#include "megbrain/tensor.h"

#include <unordered_map>

namespace lite {

/*!
* \brief implement the Tensor in mge
*/
class TensorImplDft final : public Tensor::TensorImplBase {
LITE_DYN_TYPE_OBJ_FINAL_DECL;

public:
TensorImplDft();
TensorImplDft(LiteDeviceType device, bool is_pinned_host = false);
TensorImplDft(LiteDeviceType device, const Layout& layout,
bool is_pinned_host = false);
TensorImplDft(int device_id, LiteDeviceType device,
const Layout& layout = {}, bool is_pinned_host = false);
TensorImplDft(int device_id, int stream_id, LiteDeviceType device,
bool is_pinned_host = false);

virtual ~TensorImplDft() = default;

LiteDeviceType get_device_type() const override;

int get_device_id() const override;

LiteBackend get_backend_type() const override {
return LiteBackend::LITE_DEFAULT;
}
Layout get_layout() const override;

bool is_pinned_host() const override;

//! which will trigger memory alloc in tensor implement
void* get_memory_ptr() const override;

    //! which will trigger memory alloc in the tensor implement if memory is
    //! not allocated, and compute the ptr at the given idx
void* get_memory_ptr(const std::vector<size_t>& idx) const override;

//! set layout will change the layout and reallocate memory of the tensor
void set_layout(const Layout& layout) override;

    //! use the user allocated data to reset the memory of the tensor, the
    //! memory will not be managed by lite, so the user should free it later
void reset(void* prepared_data) override;

    //! use the user allocated data and corresponding layout to reset the data
    //! and layout of the tensor, the memory will not be managed by lite, so
    //! the user should free it later
void reset(void* prepared_data, const Layout& layout) override;
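
    //! Illustrative sketch (user_ptr and new_layout are hypothetical and must
    //! stay valid for as long as the tensor uses them):
    //!     tensor->reset(user_ptr);              // keep the current layout
    //!     tensor->reset(user_ptr, new_layout);  // or supply a new layout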

//! get a new tensor slice from the origin tensor
std::shared_ptr<Tensor> slice(
const std::vector<size_t>& start, const std::vector<size_t>& end,
const std::vector<size_t>& step = {}) override;
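
    //! Illustrative sketch for a 2-D tensor (the ranges appear to be half-open
    //! [start, end) per dimension, with an optional per-dimension step):
    //!     auto sub = tensor->slice({0, 0}, {2, 4});     // rows 0..1, cols 0..3
    //!     auto strided = tensor->slice({0}, {8}, {2});  // elements 0,2,4,6 on dim 0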

//! set the tensor memory with zero
void fill_zero() override;

//! reshape the tensor with new shape, keep the data_type the same
void reshape(const Layout& layout) override;

    //! copy tensor from another tensor
    //! Note: the best way for tensor copy is to just set the dst device and
    //! leave the layout empty, the dst layout will be set the same as the
    //! src layout during copying
void copy_from(const TensorImplBase* src_impl) override;

//! share memory with other tensor
void share_memory_with(const TensorImplBase* src_impl) override;

    //! whether the memory of the tensor is contiguous
bool is_continue_memory() const override;

//! get host tensor
std::shared_ptr<mgb::HostTensorND> host_tensor() const {
return m_host_tensor;
}
//! get device tensor
std::shared_ptr<mgb::DeviceTensorND> dev_tensor() const {
return m_dev_tensor;
}
//! copy from mgb tensor
void copy_from_mge_tensor(const mgb::DeviceTensorND& dv);

public:
friend class NetworkImplDft;

private:
bool is_host() const { return m_host_tensor != nullptr; };

void copy_from_continue(const TensorImplBase* src_impl);

void copy_from_fixlayout(const TensorImplBase* src_impl);

void set_mge_tensor_compnode(const mgb::CompNode& comp_node);

private:
std::shared_ptr<mgb::HostTensorND> m_host_tensor;
std::shared_ptr<mgb::DeviceTensorND> m_dev_tensor;
};

} // namespace lite

#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 154
- 0
lite/src/misc.cpp View File

@@ -0,0 +1,154 @@
/**
 * \file src/misc.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "./misc.h"
#include "lite/global.h"

#include <time.h>
#include <chrono>
#include <cstdarg>

#if LITE_BUILD_WITH_MGE
#include "megbrain/common.h"
#endif

#ifdef __ANDROID__
#include <android/log.h>
#endif

using namespace lite;

namespace lite {
namespace log_detail {

LiteLogLevel current_log_level = LiteLogLevel::ERROR;

template <class T, size_t N>
constexpr size_t countof(T (&)[N]) {
return N;
}
} // namespace log_detail
} // namespace lite

namespace {
std::string svsprintf(const char* fmt, va_list ap_orig) {
int size = 100; /* Guess we need no more than 100 bytes */
char* p;

if ((p = (char*)malloc(size)) == nullptr)
return "svsprintf: malloc failed";

for (;;) {
va_list ap;
va_copy(ap, ap_orig);
int n = vsnprintf(p, size, fmt, ap);
va_end(ap);

if (n < 0)
return "svsprintf: vsnprintf failed";

if (n < size) {
std::string rst(p);
free(p);
return rst;
}

size = n + 1;

char* np = (char*)realloc(p, size);
if (!np) {
free(p);
return "svsprintf: realloc failed";
} else
p = np;
}
}
} // namespace

void lite::set_log_level(LiteLogLevel l) {
log_detail::current_log_level = l;
#if LITE_BUILD_WITH_MGE
mgb::LogLevel lite_log_level = mgb::LogLevel::DEBUG;
switch (l) {
case LiteLogLevel::DEBUG:
lite_log_level = mgb::LogLevel::DEBUG;
break;
case LiteLogLevel::INFO:
lite_log_level = mgb::LogLevel::INFO;
break;
case LiteLogLevel::WARN:
lite_log_level = mgb::LogLevel::WARN;
break;
case LiteLogLevel::ERROR:
lite_log_level = mgb::LogLevel::ERROR;
break;
default:
            LITE_THROW("unknown loglevel");
}
mgb::set_log_level(lite_log_level);
#endif
}

LiteLogLevel lite::get_log_level() {
return log_detail::current_log_level;
}

std::string lite::ssprintf(const char* format, ...) {
va_list ap;
va_start(ap, format);
auto ret = svsprintf(format, ap);
va_end(ap);
return ret;
}

void lite::print_log(LiteLogLevel level, const char* format, ...) {
if (static_cast<uint32_t>(level) < static_cast<uint32_t>(get_log_level())) {
return;
}
using namespace std::chrono;

auto now = system_clock::now();
auto now_time_t = system_clock::to_time_t(now);

tm now_tm;

#if _WIN32
localtime_s(&now_tm, &now_time_t);
#else
localtime_r(&now_time_t, &now_tm);
#endif

auto now_trunc_to_sec = system_clock::from_time_t(mktime(&now_tm));
auto microsec = duration_cast<microseconds>(now - now_trunc_to_sec);

char time_buffer[100];
snprintf(time_buffer, log_detail::countof(time_buffer),
"%02d:%02d:%02d.%06ld ", now_tm.tm_hour, now_tm.tm_min,
now_tm.tm_sec, long(microsec.count()));

const char* prefix[] = {"LITE[DBG] ", "LITE[INF] ", "LITE[WRN] ",
"LITE[ERR] "};
std::string out;
out += prefix[int(level)];
out += time_buffer;

va_list ap;
va_start(ap, format);
auto ret = svsprintf(format, ap);
va_end(ap);
out += ret;

#ifdef __ANDROID__
__android_log_print(ANDROID_LOG_INFO, "lite", "%s", out.c_str());
#else
fprintf(stderr, "%s\n", out.c_str());
#endif
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 254
- 0
lite/src/misc.h View File

@@ -0,0 +1,254 @@
/**
 * \file src/misc.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once
#include "lite_build_config.h"

#include <chrono>
#include <exception>
#include <stdexcept>
#include <string>
#include "lite/common_enum_c.h"
#include "lite/global.h"

namespace lite {
#if LITE_ENABLE_EXCEPTION
/*! \brief The error class in lite.
*
* It can be used to represent both an error caused by the invalid
* input of the caller or an invalid runtime condition.
*
* The necessary presumption should be guaranteed by assertions instead of
* exceptions.
*/
class Error : public std::exception {
public:
Error(const std::string& msg) : m_msg("Error: " + msg) {}
const char* what() const noexcept override { return m_msg.c_str(); }

private:
std::string m_msg;
};
#endif

std::string ssprintf(const char* fmt = 0, ...)
__attribute__((format(printf, 1, 2)));

/*!
* \brief Print a message.
*
 * The message is printed only if the level is above or equal to the current
 * log level.
*/
void print_log(LiteLogLevel level, const char* format = 0, ...)
__attribute__((format(printf, 2, 3)));
} // namespace lite

#if LITE_ENABLE_LOGGING
#define LITE_LOG_(level, msg...) \
do { \
lite::print_log(LiteLogLevel::level, ##msg); \
} while (0)
#else
#define LITE_LOG_(level, msg...) (void)0
#endif

#define LITE_LOG(fmt...) LITE_LOG_(DEBUG, fmt);
#define LITE_DEBUG(fmt...) LITE_LOG_(DEBUG, fmt);
#define LITE_WARN(fmt...) LITE_LOG_(WARN, fmt);
#define LITE_ERROR(fmt...) LITE_LOG_(ERROR, fmt);

#if LITE_ENABLE_EXCEPTION
#define LITE_THROW(msg) throw lite::Error(msg)
#else
#define LITE_THROW(msg) \
do { \
LITE_ERROR(msg); \
__builtin_trap(); \
} while (0)
#endif

#if LITE_ENABLE_EXCEPTION
#define LITE_ERROR_HANDLER_BEGIN try {
#define LITE_ERROR_HANDLER_END \
} \
catch (const ::lite::Error& e) { \
std::string msg = std::string("Lite exception: ") + e.what(); \
LITE_ERROR("%s.", msg.c_str()); \
throw; \
}

#else
#define LITE_ERROR_HANDLER_BEGIN
#define LITE_ERROR_HANDLER_END
#endif

/*! \brief Return an error if the given pointer is null pointer.
*
* The macro is used to ensure the validity of the passing context pointer.
*/
#define LITE_CHECK_NON_NULL_POINTER(ptr) \
LITE_ASSERT(ptr != nullptr, "Input ptr is null.")

//! branch prediction hint: likely to take
#define lite_likely(v) __builtin_expect(static_cast<bool>(v), 1)

//! branch prediction hint: unlikely to take
#define lite_unlikely(v) __builtin_expect(static_cast<bool>(v), 0)

#if LITE_ENABLE_LOGGING
#if LITE_ASSERT_LOC
#define LITE_ASSERT(expr, msg...) \
do { \
if (lite_unlikely(!(expr))) { \
auto info = lite::ssprintf(msg); \
LITE_THROW( \
lite::ssprintf("Assert \' %s \' failed at file : %s \n" \
"line %d : %s,\nextra " \
"message: %s", \
#expr, __FILE__, __LINE__, \
__PRETTY_FUNCTION__, info.c_str())); \
} \
} while (0)
#else
#define LITE_ASSERT(expr, msg...) \
do { \
if (lite_unlikely(!(expr))) { \
auto info = lite::ssprintf(msg); \
LITE_THROW(lite::ssprintf( \
"Assert \' %s \' failed at file : %s \n" \
"line %d : %s,\nextra " \
"message: %s", \
#expr, "about location info, please build with debug", \
__LINE__, __PRETTY_FUNCTION__, info.c_str())); \
} \
} while (0)
#endif
#else
#define LITE_ASSERT(expr, msg...) \
do { \
if (lite_unlikely(!(expr))) { \
auto msg_string = lite::ssprintf(msg); \
LITE_THROW(msg_string.c_str()); \
} \
} while (0)
#endif
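
//! Example usage (illustrative): the trailing arguments form an optional
//! printf-style message attached to the failure report:
//!     LITE_ASSERT(size > 0, "invalid buffer size %zu", size);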

#define LITE_MARK_USED_VAR(var) ((void)var)

namespace lite {
class ScopedTimer {
public:
typedef std::chrono::system_clock Clock;
typedef std::chrono::nanoseconds Nsec;

ScopedTimer(std::string name) : m_name(name) { m_start = Clock::now(); }
~ScopedTimer() {
m_stop = Clock::now();
std::chrono::duration<double> elapsed = m_stop - m_start;
Nsec u = std::chrono::duration_cast<Nsec>(elapsed);
auto msg = ssprintf("%s used time %fms.", m_name.c_str(),
static_cast<double>(u.count()) / 1000000.f);
LITE_LOG("%s", msg.c_str());
}

private:
std::chrono::time_point<std::chrono::system_clock> m_start, m_stop;
const std::string m_name;
};

class Timer {
public:
typedef std::chrono::system_clock Clock;
typedef std::chrono::nanoseconds Nsec;

Timer(std::string name) : m_name(name) { m_start = Clock::now(); }
double get_used_time() {
m_stop = Clock::now();
std::chrono::duration<double> elapsed = m_stop - m_start;
Nsec u = std::chrono::duration_cast<Nsec>(elapsed);
return static_cast<double>(u.count()) / 1000000.0;
}
void print_used_time(int iter) {
m_stop = Clock::now();
std::chrono::duration<double> elapsed = m_stop - m_start;
Nsec u = std::chrono::duration_cast<Nsec>(elapsed);
printf("%s used time %f ms\n", (m_name + std::to_string(iter)).c_str(),
static_cast<double>(u.count()) / 1000000.0);
}
void reset_start() { m_start = Clock::now(); }

private:
std::chrono::time_point<std::chrono::system_clock> m_start, m_stop;
const std::string m_name;
};
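
//! Illustrative sketch of the timing helpers above (run_once() is a
//! hypothetical workload function):
//!     {
//!         lite::ScopedTimer scoped("load");  // logs the elapsed time on scope exit
//!         run_once();
//!     }
//!     lite::Timer timer("forward iter ");
//!     run_once();
//!     timer.print_used_time(0);  // prints "forward iter 0 used time ... ms"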

inline void mark_used_variable() {}
template <typename T, typename... Arg>
inline void mark_used_variable(T firstArg, Arg... args) {
LITE_MARK_USED_VAR(firstArg);
mark_used_variable(args...);
}
} // namespace lite

#if defined(_WIN32)
#include <io.h>
#include <windows.h>
#undef CONST
#define F_OK 0
#define RTLD_LAZY 0
// On the windows platform we use a lib_filename without a full path, so
// the win-api "LoadLibrary" uses a standard search strategy to find the
// lib module. As we cannot access the lib_filename without a full path,
// we should not use "access(a, b)" to verify it.
#define access(a, b) false
static inline void* dlopen(const char* file, int) {
return static_cast<void*>(LoadLibrary(file));
}

static inline char* dlerror() {
    const char* errmsg = "dlerror not available on windows";
return const_cast<char*>(errmsg);
}

static inline void* dlsym(void* handle, const char* name) {
FARPROC symbol = GetProcAddress((HMODULE)handle, name);
return reinterpret_cast<void*>(symbol);
}
#elif __linux__ || __unix__ || __APPLE__
#include <dlfcn.h>
#include <unistd.h>
#endif

#if __DEPLOY_ON_XP_SP2__
//! refer to
//! https://docs.microsoft.com/en-us/cpp/build/configuring-programs-for-windows-xp?view=msvc-160
//! xp sp2 does not fully support the vc runtime, because KERNEL32.dll does not
//! implement some base apis needed by c++ std functions, for example,
//! std::mutex/std::thread/std::condition_variable. As a workaround, we will
//! disable some MegEngine features in the xp sp2 env, for example, multi-thread etc!
#define LITE_MUTEX size_t
#define LITE_RECURSIVE_MUTEX size_t
#define LITE_LOCK_GUARD(mtx) LITE_MARK_USED_VAR(mtx)
#define LITE_LOCK_GUARD_UNIQUE(mtx) LITE_MARK_USED_VAR(mtx)
#define LITE_LOCK_GUARD_SHARED(mtx) LITE_MARK_USED_VAR(mtx)
#else
#define LITE_MUTEX std::mutex
#define LITE_RECURSIVE_MUTEX std::recursive_mutex
#define LITE_LOCK_GUARD(mtx) \
std::lock_guard<decltype(mtx)> LITE_LOCK_GUARD_CTOR(mtx)

#define LITE_LOCK_GUARD_UNIQUE(mtx) \
std::unique_lock<decltype(mtx)> LITE_LOCK_GUARD_CTOR(mtx)

#define LITE_LOCK_GUARD_SHARED(mtx) \
std::shared_lock<decltype(mtx)> LITE_LOCK_GUARD_CTOR(mtx)
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 501
- 0
lite/src/network.cpp View File

@@ -0,0 +1,501 @@
/**
* \file src/network.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "lite/network.h"
#include "function_base.h"
#include "network_impl_base.h"
#include "parse_info/parse_info_base.h"
#include "parse_model/model_parser.h"
#include "type_info.h"
#if LITE_BUILD_WITH_MGE
#include "mge/function_dft.h"
#include "mge/network_impl.h"
#endif

#include <fstream>
#include <memory>

using namespace lite;

/**
 * \brief Construct the network implement
 * the order must be:
 * 1. create the implement
* 2. config and load
* 3. set_io
*/
Network::Network(const Config& config, const NetworkIO& network_io) {
LITE_ERROR_HANDLER_BEGIN
m_config = config;
m_network_io = network_io;
if (config.backend == LiteBackend::LITE_DEFAULT) {
m_impl = call_func<NetworkImplDft,
std::unique_ptr<lite::Network::NetworkImplBase>>(
"create_network");
} else if (config.backend == LiteBackend::LITE_RK_NPU) {
m_impl = call_func<NetworkImplRK,
std::unique_ptr<lite::Network::NetworkImplBase>>(
"create_network");
}
m_impl->set_config(config);
m_impl->set_io(network_io);
LITE_ERROR_HANDLER_END
}

Network::Network(const NetworkIO& network_io, const Config& config) {
LITE_ERROR_HANDLER_BEGIN
m_config = config;
m_network_io = network_io;
if (config.backend == LiteBackend::LITE_DEFAULT) {
m_impl = call_func<NetworkImplDft,
std::unique_ptr<lite::Network::NetworkImplBase>>(
"create_network");
} else if (config.backend == LiteBackend::LITE_RK_NPU) {
m_impl = call_func<NetworkImplRK,
std::unique_ptr<lite::Network::NetworkImplBase>>(
"create_network");
}
m_impl->set_config(config);
m_impl->set_io(network_io);
LITE_ERROR_HANDLER_END
}
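
/*
 * Illustrative usage sketch of the construction order documented above
 * ("./model.mge" is a placeholder path; the Config and NetworkIO defaults are
 * assumptions of this example):
 *     lite::Config config;
 *     config.device_type = LiteDeviceType::LITE_CPU;
 *     auto network = std::make_shared<lite::Network>(config);
 *     network->load_model("./model.mge");
 *     auto input = network->get_io_tensor(network->get_input_name(0),
 *                                         LiteTensorPhase::LITE_INPUT);
 */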

void Network::load_model(void* model_mem, size_t size) {
LITE_ERROR_HANDLER_BEGIN
LITE_CHECK_NON_NULL_POINTER(m_impl);
//! this model_mem is managed by user
std::shared_ptr<void> model{model_mem, [](void*) {}};
prase_model(model, size);
LITE_ERROR_HANDLER_END
}

void Network::load_model(std::string model_path) {
LITE_ERROR_HANDLER_BEGIN
LITE_CHECK_NON_NULL_POINTER(m_impl);
FILE* fin = fopen(model_path.c_str(), "rb");
LITE_ASSERT(fin, "failed to open %s: %s", model_path.c_str(),
strerror(errno));
fseek(fin, 0, SEEK_END);
size_t size = ftell(fin);
fseek(fin, 0, SEEK_SET);
void* ptr = malloc(size);
std::shared_ptr<void> buf{ptr, ::free};
auto nr = fread(buf.get(), 1, size, fin);
LITE_ASSERT(nr == size);
fclose(fin);
prase_model(buf, size);
LITE_ERROR_HANDLER_END
}

void Network::prase_model(std::shared_ptr<void> model_data, size_t size) {
std::unordered_map<std::string, LiteAny> separate_config_map;
ModelParser model_parser(model_data, size);
//! parse the model info
if (model_parser.parse_model_info(m_config, m_network_io,
separate_config_map, m_extra_info)) {
if (m_config.backend == LiteBackend::LITE_DEFAULT &&
m_impl->get_backend_type() != LiteBackend::LITE_DEFAULT) {
m_impl.reset(try_call_func<NetworkImplDft,
lite::Network::NetworkImplBase*>(
"parse_model"));
} else if (m_config.backend == LiteBackend::LITE_RK_NPU &&
m_impl->get_backend_type() != LiteBackend::LITE_RK_NPU) {
m_impl.reset(try_call_func<NetworkImplRK,
lite::Network::NetworkImplBase*>(
"parse_model"));
}
m_impl->set_config(m_config);
m_impl->set_io(m_network_io);
}
    //! decrypt the model
size_t model_length;
auto&& model_shared_ptr = model_parser.parse_model(model_length, m_config);

m_impl->load_model(model_shared_ptr, model_length, separate_config_map);
m_loaded = true;
update_from_implement();
}

Network::~Network() = default;

void Network::update_from_implement() {
m_config.device_type = m_impl->get_device_type();
}

void Network::compute_only_configured_output() {
LITE_ERROR_HANDLER_BEGIN
LITE_ASSERT(!m_loaded,
"compute_only_configured_output should be used before model "
"loaded.");
LITE_CHECK_NON_NULL_POINTER(m_impl);
return m_impl->compute_only_configured_output();
LITE_ERROR_HANDLER_END
}

std::shared_ptr<Tensor> Network::get_io_tensor(std::string name,
LiteTensorPhase phase) {
LITE_ERROR_HANDLER_BEGIN
LITE_ASSERT(m_loaded, "get_io_tensor should be used after model loaded.");
LITE_CHECK_NON_NULL_POINTER(m_impl);
return m_impl->get_io_tensor(name, phase);
LITE_ERROR_HANDLER_END
}

std::shared_ptr<Tensor> Network::get_input_tensor(size_t index) {
LITE_ERROR_HANDLER_BEGIN
LITE_ASSERT(m_loaded,
"get_input_tensor should be used after model loaded.");
LITE_CHECK_NON_NULL_POINTER(m_impl);
return m_impl->get_input_tensor(index);
LITE_ERROR_HANDLER_END
}

std::shared_ptr<Tensor> Network::get_output_tensor(size_t index) {
LITE_ERROR_HANDLER_BEGIN
LITE_ASSERT(m_loaded,
"get_output_tensor should be used after model loaded.");
LITE_CHECK_NON_NULL_POINTER(m_impl);
return m_impl->get_output_tensor(index);
LITE_ERROR_HANDLER_END
}

Network& Network::set_async_callback(const AsyncCallback& callback) {
LITE_ERROR_HANDLER_BEGIN
LITE_CHECK_NON_NULL_POINTER(m_impl);
m_impl->set_async_callback(std::move(callback));
return *this;
LITE_ERROR_HANDLER_END
}

Network& Network::set_start_callback(const StartCallback& callback) {
LITE_ERROR_HANDLER_BEGIN
LITE_CHECK_NON_NULL_POINTER(m_impl);
m_impl->set_start_callback(std::move(callback));
return *this;
LITE_ERROR_HANDLER_END
}

Network& Network::set_finish_callback(const FinishCallback& callback) {
LITE_ERROR_HANDLER_BEGIN
LITE_CHECK_NON_NULL_POINTER(m_impl);
m_impl->set_finish_callback(std::move(callback));
return *this;
LITE_ERROR_HANDLER_END
}

Network& Network::set_device_id(int device_id) {
LITE_ERROR_HANDLER_BEGIN
LITE_ASSERT(!m_loaded, "set_device_id should be used before model loaded.");
LITE_CHECK_NON_NULL_POINTER(m_impl);
m_impl->set_device_id(device_id);
return *this;
LITE_ERROR_HANDLER_END
}

Network& Network::set_stream_id(int stream_id) {
LITE_ERROR_HANDLER_BEGIN
LITE_ASSERT(!m_loaded, "set_stream_id should be used before model loaded.");
LITE_CHECK_NON_NULL_POINTER(m_impl);
m_impl->set_stream_id(stream_id);
return *this;
LITE_ERROR_HANDLER_END
}

void Network::forward() {
LITE_ERROR_HANDLER_BEGIN
LITE_ASSERT(m_loaded, "forward should be used after model loaded.");
LITE_CHECK_NON_NULL_POINTER(m_impl.get());
m_impl->forward();
LITE_ERROR_HANDLER_END
}

void Network::wait() {
LITE_ERROR_HANDLER_BEGIN
LITE_ASSERT(m_loaded, "wait should be used after model loaded.");
LITE_CHECK_NON_NULL_POINTER(m_impl);
m_impl->wait();
LITE_ERROR_HANDLER_END
}

std::string Network::get_input_name(size_t index) const {
LITE_ERROR_HANDLER_BEGIN
LITE_ASSERT(m_loaded, "get_input_name should be used after model loaded.");
LITE_CHECK_NON_NULL_POINTER(m_impl);
return m_impl->get_input_name(index);
LITE_ERROR_HANDLER_END
}

std::string Network::get_output_name(size_t index) const {
LITE_ERROR_HANDLER_BEGIN
LITE_ASSERT(m_loaded, "get_output_name should be used after model loaded.");
LITE_CHECK_NON_NULL_POINTER(m_impl);
return m_impl->get_output_name(index);
LITE_ERROR_HANDLER_END
}

std::vector<std::string> Network::get_all_input_name() const {
LITE_ERROR_HANDLER_BEGIN
LITE_ASSERT(m_loaded,
"get_all_input_name should be used after model loaded.");
LITE_CHECK_NON_NULL_POINTER(m_impl);
auto all_input_name = m_impl->get_all_input_name();
std::vector<std::string> all_names;
for (auto& name : all_input_name) {
all_names.push_back(name);
}
return all_names;
LITE_ERROR_HANDLER_END
}

std::vector<std::string> Network::get_all_output_name() const {
LITE_ERROR_HANDLER_BEGIN
LITE_ASSERT(m_loaded,
"get_all_output_name should be used after model loaded.");
LITE_CHECK_NON_NULL_POINTER(m_impl);
auto all_output_name = m_impl->get_all_output_name();
std::vector<std::string> all_names;
for (auto& name : all_output_name) {
all_names.push_back(name);
}
return all_names;
LITE_ERROR_HANDLER_END
}

int Network::get_device_id() const {
LITE_ERROR_HANDLER_BEGIN
LITE_CHECK_NON_NULL_POINTER(m_impl);
return m_impl->get_device_id();
LITE_ERROR_HANDLER_END
}

int Network::get_stream_id() const {
LITE_ERROR_HANDLER_BEGIN
LITE_CHECK_NON_NULL_POINTER(m_impl);
return m_impl->get_stream_id();
LITE_ERROR_HANDLER_END
}

void Network::enable_profile_performance(std::string profile_file_path) {
LITE_ERROR_HANDLER_BEGIN
m_impl->enable_profile_performance(profile_file_path);
LITE_ERROR_HANDLER_END
}

const std::string& Network::get_model_extra_info() {
LITE_ERROR_HANDLER_BEGIN
return m_extra_info;
LITE_ERROR_HANDLER_END
}

LiteDeviceType Network::get_device_type() const {
LITE_ERROR_HANDLER_BEGIN
return m_impl->get_device_type();
LITE_ERROR_HANDLER_END
}

/*********************** MGE special network function ***************/

void Runtime::set_cpu_threads_number(std::shared_ptr<Network> network,
size_t nr_threads) {
LITE_ERROR_HANDLER_BEGIN
auto network_impl = NetworkHelper::implement(network);
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) {
LITE_ASSERT(
!NetworkHelper::loaded(network),
"set_cpu_threads_number should be used before model loaded.");
call_func<NetworkImplDft, void>("set_cpu_threads_number", network_impl,
nr_threads);
return;
}
LITE_THROW("set_cpu_threads_number is not aviliable in the backend.");
LITE_ERROR_HANDLER_END
}

void Runtime::use_tensorrt(std::shared_ptr<Network> network) {
LITE_ERROR_HANDLER_BEGIN
auto network_impl = NetworkHelper::implement(network);
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) {
LITE_ASSERT(!NetworkHelper::loaded(network),
"use_tensorrt should be used before model loaded.");
call_func<NetworkImplDft, void>("use_tensorrt", network_impl);
return;
}
LITE_THROW("use_tensorrt is not aviliable in the backend.");
LITE_ERROR_HANDLER_END
}

size_t Runtime::get_cpu_threads_number(const std::shared_ptr<Network> network) {
LITE_ERROR_HANDLER_BEGIN
auto network_impl = NetworkHelper::implement(network);
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) {
return call_func<NetworkImplDft, size_t>("get_cpu_threads_number",
network_impl);
}
LITE_THROW("get_cpu_threads_number is not aviliable in the backend.");
LITE_ERROR_HANDLER_END
}

void Runtime::set_runtime_thread_affinity(
std::shared_ptr<Network> network,
const ThreadAffinityCallback& thread_affinity_callback) {
LITE_ERROR_HANDLER_BEGIN
auto network_impl = NetworkHelper::implement(network);
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) {
LITE_ASSERT(NetworkHelper::loaded(network),
"set_runtime_thread_affinity should be used after model "
"loaded.");
call_func<NetworkImplDft, void>("set_runtime_thread_affinity",
network_impl, thread_affinity_callback);

return;
}
LITE_THROW("set_runtime_thread_affinity is not aviliable in the backend.");
LITE_ERROR_HANDLER_END
}

void Runtime::set_cpu_inplace_mode(std::shared_ptr<Network> network) {
LITE_ERROR_HANDLER_BEGIN
auto network_impl = NetworkHelper::implement(network);
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) {
LITE_ASSERT(!NetworkHelper::loaded(network),
"set_cpu_inplace_mode should be used before model loaded.");
call_func<NetworkImplDft, void>("set_cpu_inplace_mode", network_impl);
return;
}
LITE_THROW("set_cpu_inplace_mode is not aviliable in the backend.");
LITE_ERROR_HANDLER_END
}

bool Runtime::is_cpu_inplace_mode(const std::shared_ptr<Network> network) {
LITE_ERROR_HANDLER_BEGIN
auto network_impl = NetworkHelper::implement(network);
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) {
return call_func<NetworkImplDft, bool>("is_cpu_inplace_mode",
network_impl);
}
LITE_THROW("is_cpu_inplace_mode is not aviliable in the backend.");
LITE_ERROR_HANDLER_END
}

//! set opr algorithm selection strategy in the network
void Runtime::set_network_algo_policy(std::shared_ptr<Network> network,
LiteAlgoSelectStrategy strategy,
uint32_t shared_batch_size,
bool binary_equal_between_batch) {
LITE_ERROR_HANDLER_BEGIN
auto network_impl = NetworkHelper::implement(network);
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) {
call_func<NetworkImplDft, void>("set_network_algo_policy", network_impl,
strategy, shared_batch_size,
binary_equal_between_batch);
return;
}
LITE_THROW("set_network_algo_policy is not aviliable in the backend.");
LITE_ERROR_HANDLER_END
}

//! set opr algorithm selection strategy in the network
void Runtime::set_network_algo_workspace_limit(std::shared_ptr<Network> network,
size_t workspace_limit) {
LITE_ERROR_HANDLER_BEGIN
auto network_impl = NetworkHelper::implement(network);
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) {
LITE_ASSERT(NetworkHelper::loaded(network),
"set_network_algo_policy should be used after model "
"loaded.");
call_func<NetworkImplDft, void>("set_network_algo_workspace_limit",
network_impl, workspace_limit);
return;
}
LITE_THROW(
"set_network_algo_workspace_limit is not aviliable in the "
"backend.");
LITE_ERROR_HANDLER_END
}

//! set the network memory allocator; the allocator is defined by the user
void Runtime::set_memory_allocator(std::shared_ptr<Network> network,
std::shared_ptr<Allocator> user_allocator) {
LITE_ERROR_HANDLER_BEGIN
auto network_impl = NetworkHelper::implement(network);
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) {
LITE_ASSERT(!NetworkHelper::loaded(network),
"set_memory_allocator should be used before model loaded.");
call_func<NetworkImplDft, void>("set_memory_allocator", network_impl,
user_allocator);
return;
}
LITE_THROW("set_memory_allocator is not aviliable in the backend.");
LITE_ERROR_HANDLER_END
}

void Runtime::share_runtime_memory_with(std::shared_ptr<Network> dst_network,
std::shared_ptr<Network> src_network) {
LITE_ERROR_HANDLER_BEGIN
auto network_impl_dst = NetworkHelper::implement(dst_network);
if (network_impl_dst->get_backend_type() == LiteBackend::LITE_DEFAULT) {
LITE_ASSERT(!NetworkHelper::loaded(dst_network),
"share_runtime_memory_with should be used before model "
"loaded.");
call_func<NetworkImplDft, void>("share_runtime_memory_with",
network_impl_dst,
NetworkHelper::implement(src_network));
return;
}
LITE_THROW("share_runtime_memory_with is not aviliable in the backend.");
LITE_ERROR_HANDLER_END
}

void Runtime::enable_io_txt_dump(std::shared_ptr<Network> network,
std::string io_txt_out_file) {
LITE_ERROR_HANDLER_BEGIN
auto network_impl = NetworkHelper::implement(network);
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) {
call_func<NetworkImplDft, void>("enable_io_txt_dump", network_impl,
io_txt_out_file);
return;
}
LITE_THROW("enable_io_txt_dump is not aviliable in the backend.");
LITE_ERROR_HANDLER_END
}

void Runtime::enable_io_bin_dump(std::shared_ptr<Network> network,
std::string io_bin_out_dir) {
LITE_ERROR_HANDLER_BEGIN
auto network_impl = NetworkHelper::implement(network);
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) {
call_func<NetworkImplDft, void>("enable_io_bin_dump", network_impl,
io_bin_out_dir);
return;
}
LITE_THROW("enable_io_bin_dump is not aviliable in the backend.");
LITE_ERROR_HANDLER_END
}

void Runtime::shared_weight_with_network(
std::shared_ptr<Network> dst_network,
const std::shared_ptr<Network> src_network) {
LITE_ERROR_HANDLER_BEGIN
auto network_impl_dst = NetworkHelper::implement(dst_network);
if (network_impl_dst->get_backend_type() == LiteBackend::LITE_DEFAULT) {
LITE_ASSERT(NetworkHelper::loaded(src_network),
"shared_weight_with_network should be used after the src "
"network "
"loaded.");
auto src_implement = NetworkHelper::implement(src_network);
call_func<NetworkImplDft, void>("shared_weight_with", network_impl_dst,
src_implement);
NetworkHelper::loaded(dst_network, true);
return;
}
LITE_THROW("shared_weight_with_network is not aviliable in the backend.");
LITE_ERROR_HANDLER_END
}
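//! Usage sketch (editor's illustration; assumes the Network::load_model and
//! ThreadAffinityCallback interfaces declared in lite/network.h; the model
//! path is hypothetical): the Runtime helpers above are typically called
//! around model loading, e.g.
//!     auto network = std::make_shared<Network>();
//!     Runtime::set_cpu_threads_number(network, 4);    // before load
//!     network->load_model("./model.mge");             // hypothetical path
//!     Runtime::set_runtime_thread_affinity(           // after load
//!             network, [](int thread_id) { /* bind thread_id to a core */ });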

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 161
- 0
lite/src/network_impl_base.h View File

@@ -0,0 +1,161 @@
/**
* \file src/network_impl_base.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once

#include "lite/network.h"
#include "misc.h"
#include "tensor_impl_base.h"
#include "type_info.h"

#include <unordered_map>

namespace lite {

/*!
* \brief the inner IO data struct, extending IO with some internal fields
*/
class IOInner : public IO {
public:
//! used to flag whether the corresponding lite_tensor is filled: have_sync
//! is true when the value of lite_tensor has been filled, otherwise false;
//! this is used in async mode
bool have_sync = false;
//! Real input and output data location
std::shared_ptr<Tensor> lite_tensor = nullptr;

IOInner() = default;
IOInner(const IO& io) {
name = io.name;
is_host = io.is_host;
io_type = io.io_type;
config_layout = io.config_layout;
}
};

/*!
* \brief the real network IO info used when the network runs
*/
struct NetworkIOInner {
std::vector<IOInner> inputs;
std::vector<IOInner> outputs;
};

/*!
* \brief implement the Network, contain the mgb related member
*/
class Network::NetworkImplBase : public DynTypeObj {
public:
virtual ~NetworkImplBase() = default;

//! set the config of the network, including:
//! the inference device
//! the other inference options, such as record_level, weight_preprocess...
virtual void set_config(const Config& config) = 0;

//! set the special io information; if not set, the default io tensors will
//! be used. This is intended for the case where the input/output is not a
//! host tensor; by default the input/output tensors are host tensors
virtual void set_io(const NetworkIO& network_io) = 0;

//! only compute the output tensors configured by the user
virtual void compute_only_configured_output() = 0;

//! get the network input or output tensor, the layout of which is
//! synced from the mge tensor
virtual std::shared_ptr<Tensor> get_io_tensor(
std::string io_name,
LiteTensorPhase phase = LiteTensorPhase::LITE_IO) = 0;

//! get the input tensor by index in the load_result tensormap
virtual std::shared_ptr<Tensor> get_input_tensor(size_t index) = 0;

//! get the output tensor by index in the load_result output_var_list
virtual std::shared_ptr<Tensor> get_output_tensor(size_t index) = 0;

//! get all the input tensor names in the order of the load result
virtual std::vector<const char*> get_all_input_name() const = 0;

//! get all the output tensor names in the order of the load result
virtual std::vector<const char*> get_all_output_name() const = 0;

//! get the input tensor name by index, in the order of the load result
virtual const char* get_input_name(size_t index) const = 0;

//! get the output tensor name by index, in the order of the load result
virtual const char* get_output_name(size_t index) const = 0;

//! set the callback in async mode
virtual void set_async_callback(const AsyncCallback& callback) = 0;

//! set the start callback which will execute before network forward
virtual void set_start_callback(const StartCallback& callback) = 0;

//! set the finish callback which will execute after network forward
virtual void set_finish_callback(const FinishCallback& callback) = 0;

//! load the model and get the m_load_result
virtual void load_model(std::shared_ptr<void> model_mem, size_t size,
std::unordered_map<std::string, LiteAny>
separate_config_map = {}) = 0;

//! forward the network with filled input data and fill the output data
//! to the output tensor
virtual void forward() = 0;

//! in sync mode, wait until the inference finishes
virtual void wait() = 0;

//! set device id, default device id = 0
virtual void set_device_id(int device_id) = 0;
virtual int get_device_id() const = 0;
virtual LiteBackend get_backend_type() const = 0;
//! set stream id, default stream id = 0
virtual void set_stream_id(int stream_id) = 0;
virtual int get_stream_id() const = 0;

virtual LiteDeviceType get_device_type() const = 0;

//! enable profiling the network; a profile file will be generated at profile_file_path
virtual void enable_profile_performance(std::string profile_file_path) = 0;
};

/******************************** friend class *****************************/
/*!
* \brief friend class of Network, for conveniently accessing the Network members
*/
class NetworkHelper {
public:
static bool loaded(const std::shared_ptr<Network> network) {
LITE_ASSERT(network);
return network->m_loaded;
}
static void loaded(const std::shared_ptr<Network> network, bool loaded) {
LITE_ASSERT(network);
network->m_loaded = loaded;
}
static Network::NetworkImplBase* implement(const Network* network) {
LITE_ASSERT(network);
return network->m_impl.get();
}
static Network::NetworkImplBase* implement(
const std::shared_ptr<Network> network) {
LITE_ASSERT(network);
return network->m_impl.get();
}
static void implement(const std::shared_ptr<Network> network,
std::unique_ptr<Network::NetworkImplBase> impl) {
LITE_ASSERT(network);
network->m_impl = std::move(impl);
}
};

} // namespace lite

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 246
- 0
lite/src/parse_info/default_parse.h View File

@@ -0,0 +1,246 @@
/**
* \file src/parse_info/default_parse.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once

#include "../misc.h"

#include "lite/global.h"
#include "lite/network.h"
#include "nlohmann/json.hpp"

namespace lite {
//! The LITE_default parse info function
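//! A sketch of the json info accepted here (reconstructed from the parsing
//! below; the model name, version and all field values are illustrative only):
//!     {
//!         "valid": true, "name": "shufflenet", "version": "1.6.0",
//!         "has_compression": false, "backend": "MGE",
//!         "device": {"type": "CPU", "device_id": 0, "number_threads": 4},
//!         "options": {"weight_preprocess": true, "graph_opt_level": 2},
//!         "IO": {"inputs": [{"name": "data", "is_host": true,
//!                            "dtype": "float32",
//!                            "shape": {"dim0": 1, "dim1": 3,
//!                                      "dim2": 224, "dim3": 224}}],
//!                "outputs": []},
//!         "extra_info": {}
//!     }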
bool default_parse_info(
const void* info_ptr, size_t length, const std::string& model_name,
Config& config, NetworkIO& network_io,
std::unordered_map<std::string, LiteAny>& separate_config_map,
std::string& extra_info) {
using json = nlohmann::json;
std::string json_string(static_cast<const char*>(info_ptr), length);
auto info = json::parse(json_string);

if (!info["valid"]) {
return false;
}
auto info_model_name = info["name"];
if (info_model_name != model_name) {
LITE_THROW(
ssprintf("infomation of model name is not match, packed model "
"is %s, but json info get %s.",
model_name.c_str(),
static_cast<std::string>(info_model_name).c_str()));
}
//! check version
std::string model_version = info["version"];
int major = std::stoi(model_version.substr(0, model_version.find(".")));
int start = model_version.find(".") + 1;
int minor = std::stoi(
model_version.substr(start, model_version.find(".", start)));
start = model_version.find(".", start) + 1;
int patch = std::stoi(model_version.substr(start));
int lite_major, lite_minor, lite_patch;
lite::get_version(lite_major, lite_minor, lite_patch);
size_t model_version_sum = (major * 10000 + minor) * 100 + patch;
size_t lite_version_sum =
(lite_major * 10000 + lite_minor) * 100 + lite_patch;
if (model_version_sum > lite_version_sum) {
LITE_WARN("Lite load the future version model !!!!!!!!!!!!!");
}

if (info.contains("has_compression")) {
config.has_compression = info["has_compression"];
}
if (info.contains("backend")) {
if (info["backend"] == "MGE") {
config.backend = LiteBackend::LITE_DEFAULT;
}
if (info["backend"] == "RK") {
config.backend = LiteBackend::LITE_RK_NPU;
}
}

auto get_device_type = [](std::string type) -> LiteDeviceType {
if (type == "CPU")
return LiteDeviceType::LITE_CPU;
if (type == "CUDA")
return LiteDeviceType::LITE_CUDA;
if (type == "OPENCL")
return LiteDeviceType::LITE_OPENCL;
if (type == "ATLAS")
return LiteDeviceType::LITE_ATLAS;
if (type == "NPU")
return LiteDeviceType::LITE_NPU;
else {
LITE_THROW(ssprintf("LITE not support device type of %s.",
type.c_str()));
}
};
if (info.contains("device")) {
auto device_json = info["device"];
config.device_type = get_device_type(device_json["type"]);
if (device_json.contains("device_id")) {
separate_config_map["device_id"] =
static_cast<int>(device_json["device_id"]);
}
if (device_json.contains("number_threads")) {
separate_config_map["number_threads"] =
static_cast<size_t>(device_json["number_threads"]);
}
if (device_json.contains("enable_inplace_model")) {
separate_config_map["enable_inplace_model"] =
static_cast<bool>(device_json["enable_inplace_model"]);
}
if (device_json.contains("use_tensorrt")) {
separate_config_map["use_tensorrt"] =
static_cast<bool>(device_json["use_tensorrt"]);
}
}
//! options
if (info.contains("options")) {
auto options = info["options"];
if (options.contains("weight_preprocess"))
config.options.weight_preprocess = options["weight_preprocess"];
if (options.contains("fuse_preprocess"))
config.options.fuse_preprocess = options["fuse_preprocess"];
if (options.contains("fake_next_exec"))
config.options.fake_next_exec = options["fake_next_exec"];
if (options.contains("var_sanity_check_first_run"))
config.options.var_sanity_check_first_run =
options["var_sanity_check_first_run"];
if (options.contains("const_shape"))
config.options.const_shape = options["const_shape"];
if (options.contains("force_dynamic_alloc"))
config.options.force_dynamic_alloc = options["force_dynamic_alloc"];
if (options.contains("force_output_dynamic_alloc"))
config.options.force_output_dynamic_alloc =
options["force_output_dynamic_alloc"];
if (options.contains("no_profiling_on_shape_change"))
config.options.no_profiling_on_shape_change =
options["no_profiling_on_shape_change"];
if (options.contains("jit_level"))
config.options.jit_level = options["jit_level"];
if (options.contains("comp_node_seq_record_level"))
config.options.comp_node_seq_record_level =
options["comp_node_seq_record_level"];
if (options.contains("graph_opt_level"))
config.options.graph_opt_level = options["graph_opt_level"];
if (options.contains("async_exec_level"))
config.options.async_exec_level = options["async_exec_level"];
}
//! IO
auto get_io_type = [](std::string type) -> LiteIOType {
if (type == "value")
return LiteIOType::LITE_IO_VALUE;
if (type == "shape")
return LiteIOType::LITE_IO_SHAPE;
else {
LITE_THROW(
ssprintf("LITE not support IO type of %s.", type.c_str()));
}
};
auto get_data_type = [](std::string type) -> LiteDataType {
if (type == "float32")
return LiteDataType::LITE_FLOAT;
if (type == "float16")
return LiteDataType::LITE_HALF;
if (type == "int32")
return LiteDataType::LITE_INT;
if (type == "int16")
return LiteDataType::LITE_INT16;
if (type == "int8")
return LiteDataType::LITE_INT8;
if (type == "uint8")
return LiteDataType::LITE_UINT8;
else {
LITE_THROW(ssprintf("LITE not support data type of %s.",
type.c_str()));
}
};
#define SET_SHAPE(shape_json_, config_) \
do { \
int ndim = 0; \
for (int i = 0; i < 4; i++) { \
if (shape_json_.contains(shape_name[i])) { \
ndim++; \
config_.config_layout.shapes[i] = shape_json_[shape_name[i]]; \
} else { \
break; \
} \
} \
config_.config_layout.ndim = ndim; \
} while (0)

#define Config_IO(io_json_, io_config_) \
if (io_json_.contains("is_host")) \
io_config_.is_host = io_json_["is_host"]; \
if (io_json_.contains("io_type")) \
io_config_.io_type = get_io_type(io_json_["io_type"]); \
if (io_json_.contains("dtype")) \
io_config_.config_layout.data_type = get_data_type(io_json_["dtype"]); \
if (io_json_.contains("shape")) { \
auto shape_json = io_json_["shape"]; \
SET_SHAPE(shape_json, io_config_); \
}

const std::string shape_name[] = {"dim0", "dim1", "dim2", "dim3"};
if (info.contains("IO")) {
auto IOs = info["IO"];
if (IOs.contains("inputs")) {
auto inputs = IOs["inputs"];
for (size_t i = 0; i < inputs.size(); i++) {
auto input_json = inputs[i];
bool found = false;
for (auto&& io_config : network_io.inputs) {
if (io_config.name == input_json["name"]) {
found = true;
Config_IO(input_json, io_config);
}
}
if (!found) {
IO input;
input.name = input_json["name"];
Config_IO(input_json, input);
network_io.inputs.push_back(input);
}
}
}
if (IOs.contains("outputs")) {
auto outputs = IOs["outputs"];
for (size_t i = 0; i < outputs.size(); i++) {
auto output_json = outputs[i];
bool found = false;
for (auto&& io_config : network_io.outputs) {
if (io_config.name == output_json["name"]) {
found = true;
Config_IO(output_json, io_config);
}
}
if (!found) {
IO output;
output.name = output_json["name"];
Config_IO(output_json, output);
network_io.outputs.push_back(output);
}
}
}
}
//! extra_info
if (info.contains("extra_info")) {
extra_info = info["extra_info"].dump();
}
return true;
#undef SET_SHAPE
#undef Config_IO
}

} // namespace lite

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 40
- 0
lite/src/parse_info/parse_info_base.h View File

@@ -0,0 +1,40 @@
/**
* \file src/parse_info/parse_info_base.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once
#include "lite/global.h"
#include "mutex"

namespace lite {

struct ParseInfoStaticData {
std::unordered_map<std::string, ParseInfoFunc> parse_info_methods;
LITE_MUTEX map_mutex;
};

ParseInfoStaticData& parse_info_static_data();

template <int count>
struct ParseInfoRegister;
} // namespace lite

#define REGIST_PARSE_INFO_FUNCTION(name_, func_) \
REGIST_PARSE_INFO_FUNCTION_WITH_NUM(__COUNTER__, name_, func_)

#define REGIST_PARSE_INFO_FUNCTION_WITH_NUM(number_, name_, func_) \
template <> \
struct ParseInfoRegister<number_> { \
ParseInfoRegister() { register_parse_info_func(name_, func_); } \
}; \
namespace { \
ParseInfoRegister<number_> parse_info_##number_; \
}
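//! Example (editor's sketch): registering a hypothetical parse function
//! "my_info_parser", following the signature used by default_parse_info:
//!     bool my_parse_info(const void* info, size_t length,
//!                        const std::string& model_name, lite::Config& config,
//!                        lite::NetworkIO& io,
//!                        std::unordered_map<std::string, lite::LiteAny>& cfg,
//!                        std::string& extra_info);
//!     REGIST_PARSE_INFO_FUNCTION("my_info_parser", my_parse_info);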

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 134
- 0
lite/src/parse_model/model_parser.cpp View File

@@ -0,0 +1,134 @@
/**
* \file src/parse_model/model_parser.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "model_parser.h"
#include "decryption/decrypt_base.h"
#include "parse_info/parse_info_base.h"

using namespace lite;
using namespace model_parse;

std::string ModelParser::sm_model_tag = "packed_model";

void ModelParser::parse_header() {
size_t tag_length = sm_model_tag.size();

//! parse model tag
const char* ptr = static_cast<char*>(m_model.get());
std::string tag(static_cast<const char*>(ptr), tag_length);
if (sm_model_tag == tag) {
m_is_bare_model = false;
} else {
//! if no tag, the model is bare model, return
m_is_bare_model = true;
return;
}

uint8_t* buffer = static_cast<uint8_t*>(m_model.get()) + tag_length;
auto packed_model = GetPackModel(buffer);
auto models = packed_model->models();
LITE_ASSERT(models->size() == 1, "Now only support one model");
auto model = models->Get(0);
m_model_name = model->header()->name()->c_str();
m_model_decryption_name =
model->header()->model_decryption_method()->c_str();
m_info_decryption_name = model->header()->info_decryption_method()->c_str();
m_info_parse_func_name = model->header()->info_parse_method()->c_str();

m_info = model->info();
m_model_data = model->data();
}

bool ModelParser::parse_model_info(
Config& network_config, NetworkIO& network_io,
std::unordered_map<std::string, LiteAny>& isolated_config_map,
std::string& extra_info) const {
//! no model info, no parse, direct return
if (m_is_bare_model || !m_info) {
return false;
}
size_t info_length = m_info->data()->size();
const uint8_t* info_data = m_info->data()->Data();
//! decryption the info
auto info_ptr = decrypt_memory(info_data, info_length,
m_info_decryption_name, info_length);
//! parse the info
LITE_LOCK_GUARD(parse_info_static_data().map_mutex);
auto it_parse = parse_info_static_data().parse_info_methods.find(
m_info_parse_func_name);
if (it_parse == parse_info_static_data().parse_info_methods.end()) {
LITE_THROW(ssprintf("can't find model info parse function %s.",
m_info_parse_func_name.c_str()));
}
auto model_info_parse_func =
parse_info_static_data().parse_info_methods[m_info_parse_func_name];
//! call the registered parse function to fill network_config and network_io
if (model_info_parse_func) {
model_info_parse_func(info_ptr.get(), info_length, m_model_name,
network_config, network_io, isolated_config_map,
extra_info);
} else {
LITE_THROW(ssprintf("model info parse function of %s is empty",
m_info_parse_func_name.c_str()));
}
return true;
}

std::shared_ptr<void> ModelParser::parse_model(size_t& model_length,
const Config& config) const {
if (m_is_bare_model) {
if (config.bare_model_cryption_name.size() == 0) {
model_length = m_total_length;
return m_model;
} else {
return decrypt_memory(
static_cast<uint8_t*>(m_model.get()), m_total_length,
config.bare_model_cryption_name, model_length);
}
}
LITE_ASSERT(m_model_data, "packed model parse error!");
model_length = m_model_data->data()->size();
const uint8_t* model_data = m_model_data->data()->Data();
LITE_ASSERT(model_length > 0, "The loaded model is of zero length.");
return decrypt_memory(model_data, model_length, m_model_decryption_name,
model_length);
}

std::shared_ptr<void> ModelParser::decrypt_memory(
const uint8_t* data, size_t length, const std::string decryption_name,
size_t& result_length) const {
const uint8_t* memory_ptr = data;
if (decryption_name == "NONE") {
result_length = length;
return std::shared_ptr<void>(const_cast<uint8_t*>(memory_ptr),
[](void*) {});
}
LITE_LOCK_GUARD(decryption_static_data().map_mutex);
auto it = decryption_static_data().decryption_methods.find(decryption_name);
if (it == decryption_static_data().decryption_methods.end()) {
LITE_THROW(ssprintf("The decryption method %s is not registed yet.",
decryption_name.c_str()));
}
auto&& func = it->second.first;
auto&& key = it->second.second;
if (func) {
auto model_vector = func(memory_ptr, length, *key);
result_length = model_vector.size();
auto tmp_model_vector =
new std::vector<uint8_t>(std::move(model_vector));
return std::shared_ptr<void>(
tmp_model_vector->data(),
[tmp_model_vector](void*) { delete tmp_model_vector; });
} else {
LITE_THROW(ssprintf("No decryption function in %s method.",
decryption_name.c_str()));
}
}
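//! Editor's note: the decryption methods looked up above are registered into
//! decryption_static_data() elsewhere (the built-in ones live under
//! lite/src/decryption; custom ones are expected to be registered through the
//! public API in lite/global.h). A registered function follows the shape used
//! here: it receives the encrypted memory, its length and the key, and
//! returns the decrypted bytes as a std::vector<uint8_t>.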

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 75
- 0
lite/src/parse_model/model_parser.h View File

@@ -0,0 +1,75 @@
/**
* \file src/parse_model/model_parser.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once
#include "lite/global.h"
#include "../network_impl_base.h"

#include "pack_model_generated.h"
#include <flatbuffers/flatbuffers.h>

#include <unordered_map>

namespace lite {

/*!
* \brief parse the model and decrypt it
*/
class ModelParser {
public:
ModelParser(std::shared_ptr<void> model_ptr, size_t model_length)
: m_model(model_ptr), m_total_length(model_length) {
//! parse the header
parse_header();
}

//! parse the Info part of the model, update the network_config and
//! network_io
bool parse_model_info(
Config& network_config, NetworkIO& network_io,
std::unordered_map<std::string, LiteAny>& isolated_config_map,
std::string& extra_info) const;

//! parse the model and decrypt the model
std::shared_ptr<void> parse_model(size_t& model_length,
const Config& config) const;
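//! Usage sketch (editor's illustration; variable names are illustrative):
//!     ModelParser parser(model_mem, model_length);
//!     parser.parse_model_info(config, network_io, config_map, extra_info);
//!     size_t real_length = 0;
//!     auto decrypted_model = parser.parse_model(real_length, config);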

private:
//! parse the header of the model and store the model related information
//! to the member data
void parse_header();

//! decrypt a block of memory of the given length with the decryption
//! method named decryption_name
std::shared_ptr<void> decrypt_memory(const uint8_t* data, size_t length,
const std::string decryption_name,
size_t& result_length) const;

private:
std::string m_model_name;
//! the info and model decryption method name, the
//! decryption func can be found through this name
std::string m_info_decryption_name;
std::string m_model_decryption_name;
//! the function name to parse the model info
std::string m_info_parse_func_name;
//! if a model has no json info attached and is not encrypted, the
//! model is a bare model
bool m_is_bare_model = true;

const model_parse::ModelInfo* m_info = nullptr;
const model_parse::ModelData* m_model_data = nullptr;

std::shared_ptr<void> m_model;
size_t m_total_length;

static std::string sm_model_tag;
};
} // namespace lite
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 28
- 0
lite/src/parse_model/pack_model.fbs View File

@@ -0,0 +1,28 @@
namespace model_parse;

table ModelHeader {
name:string;
info_decryption_method:string;
info_parse_method:string;
model_decryption_method:string;
}

table ModelInfo {
data:[ubyte];
}

table ModelData {
data:[ubyte];
}

table Model {
header:ModelHeader;
info:ModelInfo;
data:ModelData;
}

table PackModel {
models:[Model];
}

root_type PackModel;
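// Editor's note: as read by ModelParser::parse_header, a packed model file
// starts with the ascii tag "packed_model", immediately followed by a
// flatbuffer whose root is PackModel; currently the models vector is expected
// to contain exactly one Model.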

+ 339
- 0
lite/src/tensor.cpp View File

@@ -0,0 +1,339 @@
/**
* \file src/tensor.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "lite/tensor.h"
#include "function_base.h"
#include "tensor_impl_base.h"
#if LITE_BUILD_WITH_MGE
#include "megbrain/comp_node.h"
#include "megbrain/tensor.h"
#include "mge/function_dft.h"
#include "mge/tensor_impl.h"
#endif

#include <memory>

using namespace lite;

size_t Layout::get_elem_size() const {
size_t elesize = 1;
switch (data_type) {
case LiteDataType::LITE_INT64:
elesize = 8;
break;
case LiteDataType::LITE_FLOAT:
case LiteDataType::LITE_INT:
case LiteDataType::LITE_UINT:
elesize = 4;
break;
case LiteDataType::LITE_HALF:
case LiteDataType::LITE_INT16:
case LiteDataType::LITE_UINT16:
elesize = 2;
break;
case LiteDataType::LITE_INT8:
case LiteDataType::LITE_UINT8:
elesize = 1;
break;
default:
LITE_THROW("not support data type.");
}
return elesize;
}

bool Layout::operator==(const Layout& other) const {
bool equal = true;
equal &= (ndim == other.ndim);
equal &= (data_type == other.data_type);
for (size_t i = 0; i < ndim; i++) {
equal &= (shapes[i] == other.shapes[i]);
}
return equal;
}

Tensor::~Tensor() = default;

Tensor::Tensor() {
LITE_ERROR_HANDLER_BEGIN
m_tensor_impl = call_func<TensorImplDft,
std::shared_ptr<lite::Tensor::TensorImplBase>>(
"create_tensor");
LITE_ERROR_HANDLER_END
}

Tensor::Tensor(LiteDeviceType device_type, bool is_pinned_host)
: m_is_pinned_host(is_pinned_host), m_device_type(device_type) {
LITE_ERROR_HANDLER_BEGIN
m_tensor_impl = call_func<TensorImplDft,
std::shared_ptr<lite::Tensor::TensorImplBase>>(
"create_tensor", device_type, is_pinned_host);
LITE_ERROR_HANDLER_END
}

Tensor::Tensor(LiteDeviceType device_type, const Layout& layout,
bool is_pinned_host)
: m_is_pinned_host(is_pinned_host),
m_layout(layout),
m_device_type(device_type) {
LITE_ERROR_HANDLER_BEGIN
m_tensor_impl = call_func<TensorImplDft,
std::shared_ptr<lite::Tensor::TensorImplBase>>(
"create_tensor", device_type, layout, is_pinned_host);
LITE_ERROR_HANDLER_END
}

Tensor::Tensor(int device_id, LiteDeviceType device_type, const Layout& layout,
bool is_pinned_host)
: m_is_pinned_host(is_pinned_host),
m_device_id(device_id),
m_layout(layout),
m_device_type(device_type) {
LITE_ERROR_HANDLER_BEGIN
m_tensor_impl = call_func<TensorImplDft,
std::shared_ptr<lite::Tensor::TensorImplBase>>(
"create_tensor", device_id, device_type, layout, is_pinned_host);
LITE_ERROR_HANDLER_END
}

Tensor::Tensor(int device_id, int stream_id, LiteDeviceType device_type,
bool is_pinned_host)
: m_is_pinned_host(is_pinned_host),
m_device_id(device_id),
m_device_type(device_type) {
LITE_ERROR_HANDLER_BEGIN
m_tensor_impl = call_func<TensorImplDft,
std::shared_ptr<lite::Tensor::TensorImplBase>>(
"create_tensor", device_id, stream_id, device_type, is_pinned_host);
LITE_ERROR_HANDLER_END
}

Tensor::Tensor(LiteBackend backend, LiteDeviceType device_type, int device_id,
const Layout& layout, bool is_pinned_host) {
if (backend == LiteBackend::LITE_DEFAULT) {
m_tensor_impl =
call_func<TensorImplDft,
std::shared_ptr<lite::Tensor::TensorImplBase>>(
"create_tensor", device_id, device_type, layout,
is_pinned_host);
} else {
LITE_MARK_USED_VAR(device_type);
LITE_MARK_USED_VAR(is_pinned_host);
LITE_MARK_USED_VAR(layout);
LITE_MARK_USED_VAR(device_id);
LITE_THROW("unknow backend, enum id is : %d.");
}
}

void Tensor::reshape(const std::vector<int>& shape) {
LITE_ASSERT(m_layout.ndim > 0, "The tensor to be reshaped is empty.");
uint32_t length = shape.size();
LITE_ASSERT(length < Layout::MAXDIM,
"The ndim of reshape input is too large.");
Layout new_layout = m_layout;
new_layout.ndim = length;
size_t total_length =
get_tensor_total_size_in_byte() / m_layout.get_elem_size();
uint32_t unfixed_number = 0;
uint32_t unfixed_index = 0;
for (uint32_t i = 0; i < length; i++) {
if (shape[i] == -1) {
unfixed_number += 1;
unfixed_index = i;
} else {
LITE_ASSERT(shape[i] > 0, "The reshape inputs invalid.");
new_layout.shapes[i] = shape[i];
}
}
LITE_ASSERT(unfixed_number <= 1, "The reshape inputs invalid.");
if (unfixed_number) {
size_t left = total_length;
for (uint32_t i = 0; i < length; i++) {
if (i == unfixed_index) {
continue;
} else {
LITE_ASSERT(left > 0 && (left % new_layout.shapes[i] == 0),
"The reshape inputs invalid.");
left = left / new_layout.shapes[i];
}
}
LITE_ASSERT(left > 0, "The reshape inputs invalid.");
new_layout.shapes[unfixed_index] = left;
}
size_t new_total = 1;
for (uint32_t i = 0; i < length; i++) {
new_total *= new_layout.shapes[i];
}
LITE_ASSERT(new_total == total_length, "The reshape inputs invalid.");
m_layout = new_layout;
m_tensor_impl->reshape(m_layout);
}
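//! Example (sketch): a tensor with layout {2, 3, 4} may be reshaped with at
//! most one unfixed dimension, e.g. reshape({6, -1}) infers the layout {6, 4}
//! (the total number of elements must stay unchanged).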

size_t Tensor::get_tensor_total_size_in_byte() const {
LITE_ERROR_HANDLER_BEGIN
size_t elemsize = m_layout.get_elem_size();
size_t total = m_layout.ndim == 0 ? 0 : 1;
for (size_t i = 0; i < m_layout.ndim; i++) {
total *= m_layout.shapes[i];
}
return total * elemsize;
LITE_ERROR_HANDLER_END
}
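//! Example (sketch): a LITE_FLOAT tensor of layout {2, 3} occupies
//! 2 * 3 * sizeof(float) = 24 bytes in total.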

void* Tensor::get_memory_ptr() const {
LITE_ERROR_HANDLER_BEGIN
LITE_ASSERT(m_layout.ndim != 0,
"Tensor layout is not valid when get memory ptr.");
return m_tensor_impl->get_memory_ptr();
LITE_ERROR_HANDLER_END
}

void* Tensor::get_memory_ptr(const std::vector<size_t>& idx) const {
LITE_ERROR_HANDLER_BEGIN
return m_tensor_impl->get_memory_ptr(idx);
LITE_ERROR_HANDLER_END
}

std::shared_ptr<Tensor> Tensor::slice(const std::vector<size_t>& start,
const std::vector<size_t>& end,
const std::vector<size_t>& step) {
LITE_ERROR_HANDLER_BEGIN
auto ret = m_tensor_impl->slice(start, end, step);
ret->update_from_implement();
return ret;
LITE_ERROR_HANDLER_END
}
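//! Example (sketch): slicing a {2, 3} tensor with start {0, 0} and end {1, 3}
//! yields a {1, 3} tensor referring to the first row of the original memory;
//! the result is not necessarily contiguous (see is_continue_memory below).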

void Tensor::fill_zero() {
LITE_ERROR_HANDLER_BEGIN
LITE_ASSERT(m_layout.ndim > 0,
"fill_zero can't apply on a tensor with empty layout.");
m_tensor_impl->fill_zero();
LITE_ERROR_HANDLER_END
}

void Tensor::share_memory_with(const Tensor& src_tensor) {
LITE_ERROR_HANDLER_BEGIN
LITE_ASSERT(src_tensor.m_layout.ndim > 0,
"To be shared tensor with empty layout.");
m_tensor_impl->share_memory_with(src_tensor.m_tensor_impl.get());
update_from_implement();
LITE_ERROR_HANDLER_END
}

void Tensor::set_layout(const Layout& layout) {
LITE_ERROR_HANDLER_BEGIN
m_layout = layout;
m_tensor_impl->set_layout(layout);
LITE_ERROR_HANDLER_END
}

void Tensor::reset(void* prepared_data, size_t data_length_in_byte) {
LITE_ERROR_HANDLER_BEGIN
LITE_ASSERT(m_layout.ndim,
"Tensor layout is empty, please reset with layout");
LITE_ASSERT(data_length_in_byte >= get_tensor_total_size_in_byte(),
"the memory reset to the tensor is too small.");
m_tensor_impl->reset(prepared_data);
LITE_ERROR_HANDLER_END
}

void Tensor::reset(void* prepared_data, const Layout& layout) {
LITE_ERROR_HANDLER_BEGIN
m_layout = layout;
m_tensor_impl->reset(prepared_data, layout);
LITE_ERROR_HANDLER_END
}

bool Tensor::is_continue_memory() const {
LITE_ERROR_HANDLER_BEGIN
return m_tensor_impl->is_continue_memory();
LITE_ERROR_HANDLER_END
}

void Tensor::copy_from(const Tensor& src) {
LITE_ERROR_HANDLER_BEGIN
LITE_ASSERT(src.get_layout().ndim != 0,
"when tensor copy, the src tensor layout is empty.");
m_tensor_impl->copy_from(src.m_tensor_impl.get());
update_from_implement();
LITE_ERROR_HANDLER_END
}

void Tensor::update_from_implement() {
LITE_ERROR_HANDLER_BEGIN
m_layout = m_tensor_impl->get_layout();
m_device_type = m_tensor_impl->get_device_type();
m_device_id = m_tensor_impl->get_device_id();
m_is_pinned_host = m_tensor_impl->is_pinned_host();
LITE_ERROR_HANDLER_END
}

void LiteAny::type_missmatch(size_t expect, size_t get) const {
LITE_THROW(ssprintf(
"The type store in LiteAny is not match the visit type, type of "
"storage length is %zu, type of visit length is %zu.",
expect, get));
}

std::shared_ptr<Tensor> TensorUtils::concat(const std::vector<Tensor>& tensors,
int dim, LiteDeviceType dst_device,
int dst_device_id) {
if (tensors.size() <= 0) {
return std::make_shared<Tensor>();
}
if (dst_device == LiteDeviceType::LITE_DEVICE_DEFAULT) {
dst_device = tensors.front().get_device_type();
}
if (dst_device_id == -1) {
dst_device_id = tensors.front().get_device_id();
}
bool is_pinned_host = tensors.front().is_pinned_host();
auto layout = tensors.front().get_layout();
LITE_ASSERT(static_cast<int>(layout.ndim) > dim,
"the dim in concat is error.");
size_t sum_in_dim = layout.shapes[dim];
for (size_t i = 1; i < tensors.size(); ++i) {
auto other_layout = tensors[i].get_layout();
LITE_ASSERT(other_layout.ndim == layout.ndim,
"the ndim of the tensors is not the same!");
LITE_ASSERT(other_layout.data_type == layout.data_type,
"the dtype of the tensors is not the same!");
for (size_t j = 0; j < other_layout.ndim; ++j) {
if (dim == static_cast<int>(j)) {
sum_in_dim += other_layout.shapes[j];
continue;
}
LITE_ASSERT(other_layout.shapes[j] == layout.shapes[j],
"the shape of the tensors is not the same!");
}
}
layout.shapes[dim] = sum_in_dim;
auto result = std::make_shared<Tensor>(dst_device_id, dst_device, layout,
is_pinned_host);
size_t index = 0;
std::vector<size_t> start(dim + 1, 0);
std::vector<size_t> end(dim + 1, 0);
for (int i = 0; i < dim; i++) {
end[i] = layout.shapes[i];
}
for (size_t i = 0; i < tensors.size(); ++i) {
auto&& tensor = tensors[i];
auto layout = tensor.get_layout();
if (layout.shapes[dim] == 0)
continue;
start[dim] = index;
end[dim] = index + layout.shapes[dim];
auto&& sub_dst = result->slice(start, end);
sub_dst->copy_from(tensor);
index += layout.shapes[dim];
}
return result;
}
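//! Example (sketch): concatenating two tensors of layout {1, 3, 224, 224}
//! along dim 0 yields a tensor of layout {2, 3, 224, 224}; the other
//! dimensions and the data type of all inputs must match.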

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 101
- 0
lite/src/tensor_impl_base.h View File

@@ -0,0 +1,101 @@
/**
* \file src/tensor_impl_base.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once

#include "lite/tensor.h"
#include "misc.h"
#include "type_info.h"

#include <unordered_map>

namespace lite {

/*!
* \brief implement the Tensor
*/
class Tensor::TensorImplBase : public DynTypeObj {
public:
virtual ~TensorImplBase() = default;

virtual LiteDeviceType get_device_type() const = 0;

virtual int get_device_id() const = 0;

virtual LiteBackend get_backend_type() const = 0;

virtual Layout get_layout() const = 0;

virtual bool is_pinned_host() const = 0;

virtual void* get_memory_ptr() const = 0;

virtual void* get_memory_ptr(const std::vector<size_t>& idx) const = 0;

virtual void set_layout(const Layout& layout) = 0;

//! use the user-allocated data to reset the memory of the tensor; the
//! memory will not be managed by lite, so the user should free it
//! later.
virtual void reset(void* prepared_data) = 0;

//! use the user-allocated data and the corresponding layout to reset the
//! data and layout of the tensor; the memory will not be managed by lite,
//! so the user should free it later.
virtual void reset(void* prepared_data, const Layout& layout) = 0;

//! reshape the tensor with new shape, keep the data_type the same
virtual void reshape(const Layout& layout) = 0;

//! get a new tensor slice from the origin tensor
virtual std::shared_ptr<Tensor> slice(
const std::vector<size_t>& start, const std::vector<size_t>& end,
const std::vector<size_t>& step = {}) = 0;

//! set the tensor memory with zero
virtual void fill_zero() = 0;

//! copy tensor from another tensor
//! Note: the best way to copy a tensor is to just set the dst device and
//! leave the layout empty; when copying, the dst layout will be set to the
//! same as the src
virtual void copy_from(const TensorImplBase* src_impl) = 0;

//! share memory with other tensor
virtual void share_memory_with(const TensorImplBase* src_impl) = 0;

//! whether the memory of the tensor is contiguous
virtual bool is_continue_memory() const = 0;
};

/*!
* \brief friend class of Tensor, for conveniently accessing the Tensor members
*/
class TensorHelper {
public:
static inline std::shared_ptr<Tensor::TensorImplBase> implement(
const std::shared_ptr<Tensor> tensor) {
LITE_ASSERT(tensor);
return tensor->m_tensor_impl;
}
static inline std::shared_ptr<Tensor::TensorImplBase> implement(
const Tensor* tensor) {
LITE_ASSERT(tensor);
return tensor->m_tensor_impl;
}
static inline void implement(const std::shared_ptr<Tensor> tensor,
std::shared_ptr<Tensor::TensorImplBase> impl) {
LITE_ASSERT(tensor);
tensor->m_tensor_impl = impl;
}
};

} // namespace lite

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 97
- 0
lite/src/type_info.h View File

@@ -0,0 +1,97 @@
/**
* \file src/type_info.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once

#include "misc.h"

namespace lite {
/*!
* \brief an object to represent a type
*
* LITE has a lightweight RTTI system. Each type is represented by the
* address of a Typeinfo object, which is stored in the .bss segment.
*
* LITE_DYN_TYPE_OBJ_FINAL_DECL should be placed into the definition of classes
* that need compile-time type support.
*
* For classes that need RTTI, they should be derived from DynTypeObj
*/
struct Typeinfo {
//! name of the corresponding type; nullptr if LITE_ENABLE_LOGGING==0
const char* const name;

/*!
* \brief whether this is the type of given object
* \tparam T a class with static typeinfo() method
*/
template <typename T>
bool is() const {
return T::typeinfo() == this;
}
};

/*!
* \brief base class to emulate RTTI without compiler support
*/
class DynTypeObj {
public:
virtual Typeinfo* dyn_typeinfo() const = 0;

//! cast this to a final object with type check
template <class T>
T& cast_final_safe() {
LITE_ASSERT(T::typeinfo() == dyn_typeinfo(),
"can not convert type %s to %s", dyn_typeinfo()->name,
T::typeinfo()->name);
return *static_cast<T*>(this);
}

template <class T>
const T& cast_final_safe() const {
return const_cast<DynTypeObj*>(this)->cast_final_safe<T>();
}

//! check whether this is same to given type
template <class T>
bool same_type() const {
return dyn_typeinfo() == T::typeinfo();
}

protected:
~DynTypeObj() = default;
};

//! put in the declaration of a final class inherited from DynTypeObj
#define LITE_DYN_TYPE_OBJ_FINAL_DECL \
public: \
::lite::Typeinfo* dyn_typeinfo() const override final; \
static inline ::lite::Typeinfo* typeinfo() { return &sm_typeinfo; } \
\
private: \
static ::lite::Typeinfo sm_typeinfo

#if LITE_ENABLE_LOGGING
//! get class name from class object
#define _LITE_TYPEINFO_CLASS_NAME(_cls) #_cls
#else
#define _LITE_TYPEINFO_CLASS_NAME(_cls) nullptr
#endif

//! put in the impl file of a class that needs static typeinfo()
#define LITE_TYPEINFO_OBJ_IMPL(_cls) \
::lite::Typeinfo _cls::sm_typeinfo { _LITE_TYPEINFO_CLASS_NAME(_cls) }

//! put in the impl file of a final class inherited from DynTypeObj
#define LITE_DYN_TYPE_OBJ_FINAL_IMPL(_cls) \
::lite::Typeinfo* _cls::dyn_typeinfo() const { return &sm_typeinfo; } \
LITE_TYPEINFO_OBJ_IMPL(_cls)
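//! Example (editor's sketch): a hypothetical final class using the
//! lightweight RTTI support would be declared and implemented as
//!     class MyTensorImpl final : public DynTypeObj {
//!         LITE_DYN_TYPE_OBJ_FINAL_DECL;
//!     };
//!     LITE_DYN_TYPE_OBJ_FINAL_IMPL(MyTensorImpl);   // in the .cpp file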

} // namespace lite
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 10
- 0
lite/src/version_lite.ld View File

@@ -0,0 +1,10 @@
{
global:
extern "C++" {lite::*;};
Lite*;
LITE*;
default_config;
default_network_io;

local: *;
};

+ 23
- 0
lite/test/CMakeLists.txt View File

@@ -0,0 +1,23 @@
if (MGE_WITH_TEST)
file (GLOB_RECURSE SOURCES ./*.cpp main.cpp)
add_executable (lite_test ${SOURCES})

target_link_libraries(lite_test gtest)
target_link_libraries(lite_test lite_static)
if(LITE_BUILD_WITH_MGE)
# lite_test will depend on the megbrain interface
target_link_libraries(lite_test megbrain)
endif()

if(UNIX)
if(APPLE OR ANDROID)
target_link_libraries(lite_test dl)
else()
target_link_libraries(lite_test dl rt)
endif()
endif()

install (TARGETS lite_test
EXPORT ${LITE_EXPORT_TARGETS}
RUNTIME DESTINATION lite/bin)
endif()

+ 33
- 0
lite/test/main.cpp View File

@@ -0,0 +1,33 @@
/**
* \file test/main.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include <gtest/gtest.h>
#include "../src/misc.h"
#include "lite/global.h"

namespace {

class ResetSeedListener : public ::testing::EmptyTestEventListener {
void OnTestStart(const ::testing::TestInfo&) override {}
};

} // namespace

int main(int argc, char** argv) {
ResetSeedListener listener;
auto&& listeners = ::testing::UnitTest::GetInstance()->listeners();
::testing::InitGoogleTest(&argc, argv);
listeners.Append(&listener);
lite::set_log_level(LiteLogLevel::WARN);
auto ret = RUN_ALL_TESTS();
listeners.Release(&listener);
return ret;
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 638
- 0
lite/test/npy.h View File

@@ -0,0 +1,638 @@
/*
Copyright 2017 Leon Merten Lohse

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#ifndef NPY_H
#define NPY_H

#include <algorithm>
#include <complex>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

namespace npy {

/* Compile-time test for byte order.
If your compiler does not define these per default, you may want to define
one of these constants manually.
Defaults to little endian order. */
#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || \
defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \
defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || \
defined(__MIBSEB) || defined(__MIBSEB__)
const bool big_endian = true;
#else
const bool big_endian = false;
#endif

const char magic_string[] = "\x93NUMPY";
const size_t magic_string_length = 6;

const char little_endian_char = '<';
const char big_endian_char = '>';
const char no_endian_char = '|';

constexpr char host_endian_char =
(big_endian ? big_endian_char : little_endian_char);

/* npy array length */
typedef unsigned long int ndarray_len_t;

inline void write_magic(std::ostream& ostream, unsigned char v_major = 1,
unsigned char v_minor = 0) {
ostream.write(magic_string, magic_string_length);
ostream.put(v_major);
ostream.put(v_minor);
}

inline void read_magic(std::istream& istream, unsigned char& v_major,
unsigned char& v_minor) {
char buf[magic_string_length + 2];
istream.read(buf, magic_string_length + 2);

if (!istream) {
fprintf(stderr, "io error: failed reading file");
}

if (0 != std::memcmp(buf, magic_string, magic_string_length)) {
fprintf(stderr, "this file does not have a valid npy format.");
}

v_major = buf[magic_string_length];
v_minor = buf[magic_string_length + 1];
}

// typestring magic
struct Typestring {
private:
char c_endian;
char c_type;
int len;

public:
inline std::string str() {
const size_t max_buflen = 16;
char buf[max_buflen];
std::sprintf(buf, "%c%c%u", c_endian, c_type, len);
return std::string(buf);
}

Typestring(const std::vector<float>&)
: c_endian{host_endian_char}, c_type{'f'}, len{sizeof(float)} {}
Typestring(const std::vector<double>&)
: c_endian{host_endian_char}, c_type{'f'}, len{sizeof(double)} {}
Typestring(const std::vector<long double>&)
: c_endian{host_endian_char},
c_type{'f'},
len{sizeof(long double)} {}

Typestring(const std::vector<char>&)
: c_endian{no_endian_char}, c_type{'i'}, len{sizeof(char)} {}
Typestring(const std::vector<short>&)
: c_endian{host_endian_char}, c_type{'i'}, len{sizeof(short)} {}
Typestring(const std::vector<int>&)
: c_endian{host_endian_char}, c_type{'i'}, len{sizeof(int)} {}
Typestring(const std::vector<long>&)
: c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long)} {}
Typestring(const std::vector<long long>&)
: c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long long)} {}

Typestring(const std::vector<unsigned char>&)
: c_endian{no_endian_char},
c_type{'u'},
len{sizeof(unsigned char)} {}
Typestring(const std::vector<unsigned short>&)
: c_endian{host_endian_char},
c_type{'u'},
len{sizeof(unsigned short)} {}
Typestring(const std::vector<unsigned int>&)
: c_endian{host_endian_char},
c_type{'u'},
len{sizeof(unsigned int)} {}
Typestring(const std::vector<unsigned long>&)
: c_endian{host_endian_char},
c_type{'u'},
len{sizeof(unsigned long)} {}
Typestring(const std::vector<unsigned long long>&)
: c_endian{host_endian_char},
c_type{'u'},
len{sizeof(unsigned long long)} {}

Typestring(const std::vector<std::complex<float>>&)
: c_endian{host_endian_char},
c_type{'c'},
len{sizeof(std::complex<float>)} {}
Typestring(const std::vector<std::complex<double>>&)
: c_endian{host_endian_char},
c_type{'c'},
len{sizeof(std::complex<double>)} {}
Typestring(const std::vector<std::complex<long double>>&)
: c_endian{host_endian_char},
c_type{'c'},
len{sizeof(std::complex<long double>)} {}
};

inline void parse_typestring(std::string typestring) {
std::regex re("'([<>|])([ifuc])(\\d+)'");
std::smatch sm;

std::regex_match(typestring, sm, re);

if (sm.size() != 4) {
fprintf(stderr, "invalid typestring");
}
}

namespace pyparse {

/**
Removes leading and trailing whitespaces
*/
inline std::string trim(const std::string& str) {
const std::string whitespace = " \t";
auto begin = str.find_first_not_of(whitespace);

if (begin == std::string::npos)
return "";

auto end = str.find_last_not_of(whitespace);

return str.substr(begin, end - begin + 1);
}

inline std::string get_value_from_map(const std::string& mapstr) {
size_t sep_pos = mapstr.find_first_of(":");
if (sep_pos == std::string::npos)
return "";

std::string tmp = mapstr.substr(sep_pos + 1);
return trim(tmp);
}

/**
Parses the string representation of a Python dict

The keys need to be known and may not appear anywhere else in the data.
*/
inline std::unordered_map<std::string, std::string> parse_dict(
std::string in, std::vector<std::string>& keys) {
std::unordered_map<std::string, std::string> map;

if (keys.size() == 0)
return map;

in = trim(in);

// unwrap dictionary
if ((in.front() == '{') && (in.back() == '}'))
in = in.substr(1, in.length() - 2);
else {
fprintf(stderr, "Not a Python dictionary.");
}

std::vector<std::pair<size_t, std::string>> positions;

for (auto const& value : keys) {
size_t pos = in.find("'" + value + "'");

if (pos == std::string::npos) {
fprintf(stderr, "Missing %s key.", value.c_str());
}

std::pair<size_t, std::string> position_pair{pos, value};
positions.push_back(position_pair);
}

// sort by position in dict
std::sort(positions.begin(), positions.end());

for (size_t i = 0; i < positions.size(); ++i) {
std::string raw_value;
size_t begin{positions[i].first};
size_t end{std::string::npos};

std::string key = positions[i].second;

if (i + 1 < positions.size())
end = positions[i + 1].first;

raw_value = in.substr(begin, end - begin);

raw_value = trim(raw_value);

if (raw_value.back() == ',')
raw_value.pop_back();

map[key] = get_value_from_map(raw_value);
}

return map;
}

/**
Parses the string representation of a Python boolean
*/
inline bool parse_bool(const std::string& in) {
if (in == "True")
return true;
if (in == "False")
return false;

fprintf(stderr, "Invalid python boolan.");
return false;
}

/**
Parses the string representation of a Python str
*/
inline std::string parse_str(const std::string& in) {
if ((in.front() == '\'') && (in.back() == '\''))
return in.substr(1, in.length() - 2);

fprintf(stderr, "Invalid python string.");
return "";
}

/**
Parses the string representation of a Python tuple into a vector of its items
*/
inline std::vector<std::string> parse_tuple(std::string in) {
std::vector<std::string> v;
const char separator = ',';

in = trim(in);

if ((in.front() == '(') && (in.back() == ')'))
in = in.substr(1, in.length() - 2);
else {
fprintf(stderr, "Invalid Python tuple.");
}

std::istringstream iss(in);

for (std::string token; std::getline(iss, token, separator);) {
v.push_back(token);
}

return v;
}

template <typename T>
inline std::string write_tuple(const std::vector<T>& v) {
if (v.size() == 0)
return "";

std::ostringstream ss;

if (v.size() == 1) {
ss << "(" << v.front() << ",)";
} else {
const std::string delimiter = ", ";
// v.size() > 1
ss << "(";
std::copy(v.begin(), v.end() - 1,
std::ostream_iterator<T>(ss, delimiter.c_str()));
ss << v.back();
ss << ")";
}

return ss.str();
}

inline std::string write_boolean(bool b) {
if (b)
return "True";
else
return "False";
}

} // namespace pyparse

inline void parse_header(std::string header, std::string& descr) {
/*
The first 6 bytes are a magic string: exactly "\x93NUMPY".
The next 1 byte is an unsigned byte: the major version number of the file
format, e.g. \x01. The next 1 byte is an unsigned byte: the minor version
number of the file format, e.g. \x00. Note: the version of the file format
is not tied to the version of the numpy package. The next 2 bytes form a
little-endian unsigned short int: the length of the header data
HEADER_LEN. The next HEADER_LEN bytes form the header data describing the
array's format. It is an ASCII string which contains a Python literal
expression of a dictionary. It is terminated by a newline ('\n') and
padded with spaces
('\x20') to make the total length of the magic string + 4 + HEADER_LEN be
evenly divisible by 16 for alignment purposes. The dictionary contains
three keys:

"descr" : dtype.descr
An object that can be passed as an argument to the numpy.dtype()
constructor to create the array's dtype. For repeatability and
readability, this dictionary is formatted using pprint.pformat() so the
keys are in alphabetic order.
*/

// remove trailing newline
if (header.back() != '\n')
fprintf(stderr, "invalid header");
header.pop_back();

// parse the dictionary
std::vector<std::string> keys{"descr"};
auto dict_map = npy::pyparse::parse_dict(header, keys);

if (dict_map.size() == 0)
fprintf(stderr, "invalid dictionary in header");

std::string descr_s = dict_map["descr"];
parse_typestring(descr_s);
// remove
descr = npy::pyparse::parse_str(descr_s);
return;
}

inline void parse_header(std::string header, std::string& descr,
bool& fortran_order,
std::vector<ndarray_len_t>& shape) {
/*
The first 6 bytes are a magic string: exactly "\x93NUMPY".
The next 1 byte is an unsigned byte: the major version number of the file
format, e.g. \x01. The next 1 byte is an unsigned byte: the minor version
number of the file format, e.g. \x00. Note: the version of the file format
is not tied to the version of the numpy package. The next 2 bytes form a
little-endian unsigned short int: the length of the header data
HEADER_LEN. The next HEADER_LEN bytes form the header data describing the
array's format. It is an ASCII string which contains a Python literal
expression of a dictionary. It is terminated by a newline ('\n') and
padded with spaces
('\x20') to make the total length of the magic string + 4 + HEADER_LEN be
evenly divisible by 16 for alignment purposes. The dictionary contains
three keys:

"descr" : dtype.descr
An object that can be passed as an argument to the numpy.dtype()
constructor to create the array's dtype. "fortran_order" : bool Whether
the array data is Fortran-contiguous or not. Since Fortran-contiguous
arrays are a common form of non-C-contiguity, we allow them to be written
directly to disk for efficiency. "shape" : tuple of int The shape of the
array. For repeatability and readability, this dictionary is formatted
using pprint.pformat() so the keys are in alphabetic order.
*/

// remove trailing newline
if (header.back() != '\n')
fprintf(stderr, "invalid header");
header.pop_back();

// parse the dictionary
std::vector<std::string> keys{"descr", "fortran_order", "shape"};
auto dict_map = npy::pyparse::parse_dict(header, keys);

if (dict_map.size() == 0)
fprintf(stderr, "invalid dictionary in header");

std::string descr_s = dict_map["descr"];
std::string fortran_s = dict_map["fortran_order"];
std::string shape_s = dict_map["shape"];

// TODO: extract info from typestring
parse_typestring(descr_s);
// strip the quotes around the descr value
descr = npy::pyparse::parse_str(descr_s);

// convert literal Python bool to C++ bool
fortran_order = npy::pyparse::parse_bool(fortran_s);

// parse the shape tuple
auto shape_v = npy::pyparse::parse_tuple(shape_s);
if (shape_v.size() == 0)
fprintf(stderr, "invalid shape tuple in header");

for (auto item : shape_v) {
ndarray_len_t dim = static_cast<ndarray_len_t>(std::stoul(item));
shape.push_back(dim);
}
}

inline std::string write_header_dict(const std::string& descr,
bool fortran_order,
const std::vector<ndarray_len_t>& shape) {
std::string s_fortran_order = npy::pyparse::write_boolean(fortran_order);
std::string shape_s = npy::pyparse::write_tuple(shape);

return "{'descr': '" + descr + "', 'fortran_order': " + s_fortran_order +
", 'shape': " + shape_s + ", }";
}
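
// For instance (illustrative), write_header_dict("<f4", false, {3, 4})
// returns "{'descr': '<f4', 'fortran_order': False, 'shape': (3, 4), }".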

inline void write_header(std::ostream& out, const std::string& descr,
bool fortran_order,
const std::vector<ndarray_len_t>& shape_v) {
std::string header_dict = write_header_dict(descr, fortran_order, shape_v);

size_t length = magic_string_length + 2 + 2 + header_dict.length() + 1;

unsigned char version[2] = {1, 0};
if (length >= 255 * 255) {
length = magic_string_length + 2 + 4 + header_dict.length() + 1;
version[0] = 2;
version[1] = 0;
}
size_t padding_len = 16 - length % 16;
std::string padding(padding_len, ' ');
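// Worked example (assuming magic_string_length == 6): a 60-character header
// dict gives length = 6 + 2 + 2 + 60 + 1 = 71, so padding_len = 16 - 71 % 16
// = 9, HEADER_LEN becomes 60 + 9 + 1 = 70, and the total written is
// 6 + 2 + 2 + 70 = 80 bytes, which is divisible by 16.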

// write magic
write_magic(out, version[0], version[1]);

// write header length
if (version[0] == 1 && version[1] == 0) {
char header_len_le16[2];
uint16_t header_len = static_cast<uint16_t>(header_dict.length() +
padding.length() + 1);

header_len_le16[0] = (header_len >> 0) & 0xff;
header_len_le16[1] = (header_len >> 8) & 0xff;
out.write(reinterpret_cast<char*>(header_len_le16), 2);
} else {
char header_len_le32[4];
uint32_t header_len = static_cast<uint32_t>(header_dict.length() +
padding.length() + 1);

header_len_le32[0] = (header_len >> 0) & 0xff;
header_len_le32[1] = (header_len >> 8) & 0xff;
header_len_le32[2] = (header_len >> 16) & 0xff;
header_len_le32[3] = (header_len >> 24) & 0xff;
out.write(reinterpret_cast<char*>(header_len_le32), 4);
}

out << header_dict << padding << '\n';
}

inline std::string read_header(std::istream& istream) {
// check magic bytes and version number
unsigned char v_major, v_minor;
read_magic(istream, v_major, v_minor);

uint32_t header_length = 0;
if (v_major == 1 && v_minor == 0) {
// read as unsigned bytes to avoid sign extension when assembling the length
unsigned char header_len_le16[2];
istream.read(reinterpret_cast<char*>(header_len_le16), 2);
header_length = (header_len_le16[0] << 0) | (header_len_le16[1] << 8);

if ((magic_string_length + 2 + 2 + header_length) % 16 != 0) {
// TODO: display warning
}
} else if (v_major == 2 && v_minor == 0) {
// read as unsigned bytes to avoid sign extension when assembling the length
unsigned char header_len_le32[4];
istream.read(reinterpret_cast<char*>(header_len_le32), 4);

header_length = (header_len_le32[0] << 0) | (header_len_le32[1] << 8) |
(header_len_le32[2] << 16) | (header_len_le32[3] << 24);

if ((magic_string_length + 2 + 4 + header_length) % 16 != 0) {
// TODO: display warning
}
} else {
fprintf(stderr, "unsupported file format version");
}

// size the buffer up front; reserve() alone would leave read() writing
// past the vector's size
auto buf_v = std::vector<char>(header_length);
istream.read(buf_v.data(), header_length);
std::string header(buf_v.data(), header_length);

return header;
}

inline ndarray_len_t comp_size(const std::vector<ndarray_len_t>& shape) {
ndarray_len_t size = 1;
for (ndarray_len_t i : shape)
size *= i;

return size;
}

template <typename Scalar>
inline void SaveArrayAsNumpy(const std::string& filename, bool fortran_order,
unsigned int n_dims, const unsigned long shape[],
const std::vector<Scalar>& data) {
Typestring typestring_o(data);
std::string typestring = typestring_o.str();

std::ofstream stream(filename, std::ofstream::binary);
if (!stream) {
fprintf(stderr, "io error: failed to open a file.");
}

std::vector<ndarray_len_t> shape_v(shape, shape + n_dims);
write_header(stream, typestring, fortran_order, shape_v);

auto size = static_cast<size_t>(comp_size(shape_v));

stream.write(reinterpret_cast<const char*>(data.data()),
sizeof(Scalar) * size);
}
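
// Usage sketch (illustrative): save a 3x4 float32 array in C order.
//   std::vector<float> data(12, 0.f);
//   const unsigned long shape[] = {3, 4};
//   npy::SaveArrayAsNumpy("out.npy", false, 2, shape, data);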

template <typename Scalar>
inline void LoadArrayFromNumpy(const std::string& filename,
std::vector<unsigned long>& shape,
std::vector<Scalar>& data) {
bool fortran_order;
LoadArrayFromNumpy<Scalar>(filename, shape, fortran_order, data);
}

template <typename Scalar>
inline void LoadArrayFromNumpy(const std::string& filename,
std::vector<unsigned long>& shape,
bool& fortran_order, std::vector<Scalar>& data) {
std::ifstream stream(filename, std::ifstream::binary);
if (!stream) {
fprintf(stderr, "io error: failed to open a file.");
}

std::string header = read_header(stream);

// parse header
std::string typestr;

parse_header(header, typestr, fortran_order, shape);

// check if the typestring matches the given one
Typestring typestring_o{data};
std::string expect_typestr = typestring_o.str();
if (typestr != expect_typestr) {
fprintf(stderr, "formatting error: typestrings not matching");
}

// compute the data size based on the shape
auto size = static_cast<size_t>(comp_size(shape));
data.resize(size);

// read the data
stream.read(reinterpret_cast<char*>(data.data()), sizeof(Scalar) * size);
}
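
// Usage sketch (illustrative): load the file written above back into memory.
//   std::vector<unsigned long> shape;
//   std::vector<float> data;
//   npy::LoadArrayFromNumpy("out.npy", shape, data);
//   // afterwards shape == {3, 4} and data.size() == 12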

inline void LoadArrayFromNumpy(const std::string& filename,
std::string& type_str,
std::vector<ndarray_len_t>& shape,
std::vector<int8_t>& data) {
std::ifstream stream(filename, std::ifstream::binary);
if (!stream) {
fprintf(stderr, "io error: failed to open a file.");
}

std::string header = read_header(stream);
bool fortran_order;
// parse header
parse_header(header, type_str, fortran_order, shape);

// extract the element size in bytes from the last character of the typestring
std::string size_str = type_str.substr(type_str.size() - 1);
size_t elem_size = atoi(size_str.c_str());

// compute the data size based on the shape
auto byte_size = elem_size * static_cast<size_t>(comp_size(shape));
data.resize(byte_size);

// read the data
stream.read(reinterpret_cast<char*>(data.data()), byte_size);
}

} // namespace npy

#endif // NPY_H

+ 184
- 0
lite/test/test_common.h View File

@@ -0,0 +1,184 @@
/**
* \file test/test_common.h
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#pragma once

#include "lite_build_config.h"

#if LITE_BUILD_WITH_MGE
#include "../src/misc.h"
#include "../src/mge/network_impl.h"
#include "../src/mge/common.h"
#include "lite/network.h"
#include "lite/tensor.h"
#include "megbrain/tensor.h"
#include "megbrain/graph/bases.h"
#include "megbrain/plugin/opr_io_dump.h"
#include "megbrain/plugin/profiler.h"
#include "megbrain/serialization/extern_c_opr.h"
#include "megbrain/serialization/file.h"
#include "megbrain/serialization/load_dump_config.h"
#include "megbrain/serialization/serializer.h"
#include "megbrain/utils/thin/hash_table.h"
#include "npy.h"

#include <gtest/gtest.h>

#include <string.h>
#include <chrono>
#include <memory>
#include <random>

namespace lite {

template <typename T>
static ::testing::AssertionResult compare_memory(const void* memory0,
const void* memory1,
size_t length,
float maxerr = 1e-3) {
const T* data_ptr0 = static_cast<const T*>(memory0);
const T* data_ptr1 = static_cast<const T*>(memory1);
for (size_t i = 0; i < length; i++) {
auto diff = std::abs(data_ptr0[i] - data_ptr1[i]);
if (diff > maxerr) {
return ::testing::AssertionFailure()
<< "Unequal value:\n"
<< "value 0 = " << data_ptr0[i] << "\n"
<< "value 1 = " << data_ptr1[i] << "\n"
<< "At index: " << i << "\n";
}
}
return ::testing::AssertionSuccess();
}

template <typename T>
void compare_lite_tensor(std::shared_ptr<Tensor> tensor0,
std::shared_ptr<Tensor> tensor1, float maxerr = 1e-3) {
size_t elemsize = tensor0->get_layout().get_elem_size();
T* data_ptr0 = static_cast<T*>(tensor0->get_memory_ptr());
T* data_ptr1 = static_cast<T*>(tensor1->get_memory_ptr());
size_t length = tensor0->get_tensor_total_size_in_byte() / elemsize;
EXPECT_TRUE(compare_memory<T>(data_ptr0, data_ptr1, length, maxerr));
}

__attribute__((unused)) static std::shared_ptr<Tensor> get_input_data(
std::string path) {
std::string type_str;
std::vector<npy::ndarray_len_t> stl_shape;
std::vector<int8_t> raw;
npy::LoadArrayFromNumpy(path, type_str, stl_shape, raw);

auto lite_tensor = std::make_shared<Tensor>(LiteDeviceType::LITE_CPU);
Layout layout;
layout.ndim = stl_shape.size();
const std::map<std::string, LiteDataType> type_map = {
{"f4", LiteDataType::LITE_FLOAT},
{"i4", LiteDataType::LITE_INT},
{"i1", LiteDataType::LITE_INT8},
{"u1", LiteDataType::LITE_UINT8}};
layout.shapes[0] = 1;
for (size_t i = 0; i < stl_shape.size(); i++) {
layout.shapes[i] = static_cast<size_t>(stl_shape[i]);
}
for (auto& item : type_map) {
if (type_str.find(item.first) != std::string::npos) {
layout.data_type = item.second;
break;
}
}
lite_tensor->set_layout(layout);
size_t length = lite_tensor->get_tensor_total_size_in_byte();
void* dest = lite_tensor->get_memory_ptr();
memcpy(dest, raw.data(), length);
return lite_tensor;
}

__attribute__((unused)) static std::shared_ptr<Tensor> mgelite_lar(
std::string model_path, const Config& config, std::string,
std::shared_ptr<Tensor> input) {
std::unique_ptr<Network> network = std::make_unique<Network>(config);

network->load_model(model_path);

std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);

auto src_ptr = input->get_memory_ptr();
auto src_layout = input->get_layout();
input_tensor->reset(src_ptr, src_layout);

network->forward();
network->wait();

std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
Layout out_layout = output_tensor->get_layout();
auto ret = std::make_shared<Tensor>(LiteDeviceType::LITE_CPU, out_layout);
void* out_data = output_tensor->get_memory_ptr();
void* dst_data = ret->get_memory_ptr();
memcpy(dst_data, out_data, ret->get_tensor_total_size_in_byte());
return ret;
}

__attribute__((unused)) static std::shared_ptr<Tensor> mgb_lar(
std::string model_path, const Config& config, std::string input_name,
std::shared_ptr<Tensor> input) {
LITE_ASSERT(config.bare_model_cryption_name.size() == 0);
using namespace mgb;
serialization::GraphLoader::LoadConfig mgb_config;
mgb_config.comp_node_mapper = [config](CompNode::Locator& loc) {
loc = to_compnode_locator(config.device_type);
};
mgb_config.comp_graph = ComputingGraph::make();
auto&& graph_opt = mgb_config.comp_graph->options();
if (config.options.weight_preprocess) {
graph_opt.graph_opt.enable_weight_preprocess();
}
graph_opt.comp_node_seq_record_level =
config.options.comp_node_seq_record_level;

auto inp_file = mgb::serialization::InputFile::make_fs(model_path.c_str());
auto format =
serialization::GraphLoader::identify_graph_dump_format(*inp_file);
mgb_assert(format.valid(),
"invalid model: unknown model format, please make sure input "
"file is generated by GraphDumper");
auto loader =
serialization::GraphLoader::make(std::move(inp_file), format.val());
auto load_ret = loader->load(mgb_config, false);

ComputingGraph::OutputSpec out_spec;
std::vector<HostTensorND> output_tensors(load_ret.output_var_list.size());
for (size_t i = 0; i < load_ret.output_var_list.size(); i++) {
auto cb = [&output_tensors, i](const DeviceTensorND& dv) mutable {
output_tensors[i].copy_from(dv);
};
out_spec.emplace_back(load_ret.output_var_list[i], std::move(cb));
}
auto func = load_ret.graph_compile(out_spec);

auto& in = load_ret.tensor_map.find(input_name)->second;
in->copy_from(*TensorHelper::implement(input)
->cast_final_safe<TensorImplDft>()
.host_tensor());
func->execute();
func->wait();

std::shared_ptr<Tensor> ret = std::make_shared<Tensor>(
LiteDeviceType::LITE_CPU,
to_lite_layout(output_tensors[0].layout()));
auto mge_tensor = TensorHelper::implement(ret)
->cast_final_safe<TensorImplDft>()
.host_tensor();
mge_tensor->copy_from(output_tensors[0]);
return ret;
}
} // namespace lite

#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 115
- 0
lite/test/test_misc.cpp View File

@@ -0,0 +1,115 @@
/**
* \file test/test_misc.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "lite_build_config.h"

#if LITE_BUILD_WITH_MGE
#include "test_common.h"
#include "../src/decryption/decrypt_base.h"
#include "../src/network_impl_base.h"

#include "megbrain/opr/io.h"
#include "megbrain/tensor.h"
#include "megbrain/utils/metahelper.h"

#include <gtest/gtest.h>

#include <string.h>
#include <chrono>
#include <memory>
#include <random>

using namespace lite;

TEST(TestMisc, DecryptionRegister) {
size_t number = decryption_static_data().decryption_methods.size();
//! At least one method is registered by lite
ASSERT_GE(number, 1);
DecryptionFunc func;
register_decryption_and_key("AllForTest0", func, {});

ASSERT_EQ(number + 1, decryption_static_data().decryption_methods.size());
}

TEST(TestMisc, DecryptionUpdate) {
DecryptionFunc func;
register_decryption_and_key("AllForTest1", func, {});
func = [](const void*, size_t,
const std::vector<uint8_t>&) -> std::vector<uint8_t> {
return {};
};
update_decryption_or_key("AllForTest1", func, {});
ASSERT_NE(decryption_static_data().decryption_methods["AllForTest1"].first,
nullptr);
ASSERT_EQ(decryption_static_data()
.decryption_methods["AllForTest1"]
.second->size(),
0);
update_decryption_or_key("AllForTest1", {}, {1, 2, 3});
ASSERT_EQ(decryption_static_data()
.decryption_methods["AllForTest1"]
.second->size(),
3);
}

TEST(TestMisc, SharedSameDeviceTensor) {
using namespace mgb;
serialization::GraphLoader::LoadConfig mgb_config;
mgb_config.comp_node_mapper = [](CompNode::Locator& loc) {
loc = to_compnode_locator(LiteDeviceType::LITE_CPU);
};
mgb_config.comp_graph = ComputingGraph::make();
std::string model_path = "./shufflenet.mge";

auto inp_file = mgb::serialization::InputFile::make_fs(model_path.c_str());
auto format =
serialization::GraphLoader::identify_graph_dump_format(*inp_file);
mgb_assert(format.valid(),
"invalid model: unknown model format, please make sure input "
"file is generated by GraphDumper");
auto loader =
serialization::GraphLoader::make(std::move(inp_file), format.val());
auto load_ret_1 = loader->load(mgb_config, true);
auto load_ret_2 = loader->load(mgb_config, true);
ASSERT_EQ(load_ret_1.output_var_list.size(),
load_ret_2.output_var_list.size());

ComputingGraph::OutputSpec out_spec_1, out_spec_2;
for (size_t i = 0; i < load_ret_1.output_var_list.size(); i++) {
out_spec_1.emplace_back(load_ret_1.output_var_list[i], nullptr);
out_spec_2.emplace_back(load_ret_2.output_var_list[i], nullptr);
}
auto func_1 = load_ret_1.graph_compile(out_spec_1);
auto func_2 = load_ret_2.graph_compile(out_spec_2);
std::vector<cg::OperatorNodeBase*> oprs_1, oprs_2;
func_1->iter_opr_seq([&oprs_1](cg::OperatorNodeBase* opr) -> bool {
if (opr->try_cast_final<opr::ImmutableTensor>()) {
oprs_1.push_back(opr);
}
return true;
});
func_2->iter_opr_seq([&oprs_2](cg::OperatorNodeBase* opr) -> bool {
if (opr->try_cast_final<opr::ImmutableTensor>()) {
oprs_2.push_back(opr);
}
return true;
});
ASSERT_EQ(oprs_1.size(), oprs_2.size());
for (size_t i = 0; i < oprs_1.size(); i++) {
auto tensor_1 =
oprs_1[i]->try_cast_final<opr::ImmutableTensor>()->value();
auto tensor_2 =
oprs_2[i]->try_cast_final<opr::ImmutableTensor>()->value();
ASSERT_EQ(tensor_1.raw_ptr(), tensor_2.raw_ptr());
}
}

#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 1007
- 0
lite/test/test_network.cpp
File diff suppressed because it is too large
View File


+ 895
- 0
lite/test/test_network_c.cpp View File

@@ -0,0 +1,895 @@
/**
* \file test/test_network_c.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "../src/misc.h"

#if LITE_BUILD_WITH_MGE
#include "../src/common.h"
#include "../src/mge/network_impl.h"

#include "../lite-c/src/common.h"
#include "lite-c/global_c.h"
#include "lite-c/network_c.h"
#include "lite-c/tensor_c.h"

#include "./test_common.h"
#include "megbrain/tensor.h"

#include <string.h>
#include <chrono>
#include <memory>
#include <random>
#include <unordered_map>

namespace {

int affinity_set = false;
int single_thread_affinity(int) {
affinity_set = true;
return 0;
}

std::atomic_size_t m_nr_left{0};
std::atomic_size_t m_nr_allocated{0};

void* allocate(LiteDeviceType device, int, size_t size, size_t align) {
LITE_ASSERT(device == LiteDeviceType::LITE_CPU);
m_nr_left++;
m_nr_allocated++;
#ifdef WIN32
return _aligned_malloc(size, align);
#elif defined(__ANDROID__) || defined(ANDROID)
return memalign(align, size);
#else
void* ptr = nullptr;
auto err = posix_memalign(&ptr, align, size);
mgb_assert(!err, "failed to malloc %zu bytes with align %zu", size, align);
return ptr;
#endif
}

void free(LiteDeviceType device, int, void* ptr) {
m_nr_left--;
LITE_ASSERT(device == LiteDeviceType::LITE_CPU);
#ifdef WIN32
_aligned_free(ptr);
#else
::free(ptr);
#endif
};

#define NUMBER_THREDS (4)
std::vector<std::thread::id> thread_ids(NUMBER_THREDS);
int multi_thread_affinity(int id) {
thread_ids[id] = std::this_thread::get_id();
return 0;
};

volatile bool finished = false;
int finish_callback() {
finished = true;
return 0;
}

volatile bool start_checked = false;
int start_callback(const LiteIO* inputs, const LiteTensor* input_tensors,
size_t size) {
start_checked = true;
auto check_func = [&]() {
ASSERT_EQ(size, 1);
ASSERT_EQ(std::string(inputs->name), "data");
LiteLayout layout;
LITE_get_tensor_layout(*input_tensors, &layout);
ASSERT_EQ(layout.ndim, 4);
ASSERT_EQ(layout.shapes[1], 3);
ASSERT_EQ(layout.shapes[2], 224);
ASSERT_EQ(layout.shapes[3], 224);
};
check_func();
return 0;
}

volatile bool finish_checked = false;
int finish_callback(const LiteIO* outputs, const LiteTensor* output_tensors,
size_t size) {
finish_checked = true;
auto check_func = [&]() {
ASSERT_EQ(size, 1);
ASSERT_EQ(std::string(outputs->name),
"TRUE_DIV(EXP[12065],reduce0[12067])[12077]");
LiteLayout layout;
LITE_get_tensor_layout(*output_tensors, &layout);
ASSERT_EQ(layout.shapes[1], 1000);
};
check_func();
return 0;
}

} // namespace

#define LITE_CAPI_CHECK(_expr) \
do { \
int _ret = (_expr); \
if (_ret) { \
LITE_THROW(LITE_get_last_error()); \
} \
} while (0)

#define ForwardMgb \
lite::Config config; \
auto lite_tensor = lite::get_input_data("./input_data.npy"); \
size_t data_length_in_byte = lite_tensor->get_tensor_total_size_in_byte(); \
std::string model_path = "./shufflenet.mge"; \
auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor)

#define MakeNetwork \
LiteNetwork c_network; \
LITE_CAPI_CHECK(LITE_make_network(&c_network, *default_config(), \
*default_network_io()))

#define LoadNetwork \
LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, model_path.c_str()))

#define SetInput \
LiteTensor c_input_tensor, c_output_tensor; \
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, "data", LITE_INPUT, \
&c_input_tensor)); \
LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, \
lite_tensor->get_memory_ptr(), \
data_length_in_byte))

#define ForwardNetwork \
LITE_CAPI_CHECK(LITE_forward(c_network)); \
LITE_CAPI_CHECK(LITE_wait(c_network))

#define GetOutput \
const char* output_name; \
LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); \
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_OUTPUT, \
&c_output_tensor)); \
void* output_ptr; \
LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr))

#define CompareResult \
EXPECT_TRUE(lite::compare_memory<float>( \
output_ptr, result_mgb->get_memory_ptr(), \
result_mgb->get_tensor_total_size_in_byte() / sizeof(float)))

TEST(TestCapiNetWork, BasicResetInput) {
ForwardMgb;
LiteNetwork c_network;
LITE_CAPI_CHECK(LITE_make_default_network(&c_network));
LoadNetwork;
SetInput;
ForwardNetwork;
GetOutput;
CompareResult;
LITE_destroy_network(c_network);
}

TEST(TestCapiNetWork, GetAllName) {
std::string model_path = "./shufflenet.mge";
LiteNetwork c_network;
LITE_CAPI_CHECK(LITE_make_default_network(&c_network));
LoadNetwork;
size_t input_size, output_size;
LITE_get_all_input_name(c_network, &input_size, nullptr);
LITE_get_all_output_name(c_network, &output_size, nullptr);

std::vector<const char*> input_names(input_size);
LITE_get_all_input_name(c_network, nullptr, input_names.data());
ASSERT_EQ(input_names.size(), 1);
ASSERT_TRUE(std::string(input_names[0]) == "data");

std::vector<const char*> output_names(output_size);
LITE_get_all_output_name(c_network, nullptr, output_names.data());
ASSERT_TRUE(std::string(output_names[0]) ==
"TRUE_DIV(EXP[12065],reduce0[12067])[12077]");
ASSERT_EQ(output_names.size(), 1);
LITE_destroy_network(c_network);
}

#if LITE_BUILD_WITH_RKNPU

static int GetTop(float* pfProb, float* pfMaxProb, uint32_t* pMaxClass,
uint32_t outputCount, uint32_t topNum) {
uint32_t i, j;

#define MAX_TOP_NUM 20
if (topNum > MAX_TOP_NUM)
return 0;

memset(pfMaxProb, 0, sizeof(float) * topNum);
memset(pMaxClass, 0xff, sizeof(uint32_t) * topNum);

for (j = 0; j < topNum; j++) {
for (i = 0; i < outputCount; i++) {
if ((i == *(pMaxClass + 0)) || (i == *(pMaxClass + 1)) ||
(i == *(pMaxClass + 2)) || (i == *(pMaxClass + 3)) ||
(i == *(pMaxClass + 4))) {
continue;
}

if (pfProb[i] > *(pfMaxProb + j)) {
*(pfMaxProb + j) = pfProb[i];
*(pMaxClass + j) = i;
}
}
}

return 1;
}

TEST(TestCapiNetWork, rknntest_set_info) {
#define SET_INFO_SIZE 2
#define TENSOR_TYPE_UINT8 3
#define TENSOR_FORMAT_NHWC 1
LiteConfig config;
config.backend = LiteBackend::LITE_RK_NPU;
config.device_type = LiteDeviceType::LITE_NPU;
config.bare_model_cryption_name = nullptr;
auto lite_tensor = lite::get_input_data("./model/cat_224x224.npy");
auto true_tensor = lite::get_input_data("./output_data.npy");
auto rknn_model = "./model/mobilenet_v1.rknn";
LiteNetwork c_network;
LITE_CAPI_CHECK(LITE_make_network_config(&c_network, config));
LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, rknn_model));

size_t input_size, output_size;
LITE_get_all_input_name(c_network, &input_size, nullptr);
LITE_get_all_output_name(c_network, &output_size, nullptr);

std::vector<const char*> input_names(input_size);
std::vector<const char*> output_names(output_size);
LiteTensor c_input_tensor, c_output_tensor;

LITE_get_all_input_name(c_network, nullptr, input_names.data());
LITE_get_all_output_name(c_network, nullptr, output_names.data());
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, input_names[0], LITE_IO,
&c_input_tensor));

size_t input_length = 0;
LITE_get_tensor_total_size_in_byte(c_input_tensor, &input_length);

size_t data_length_in_byte = lite_tensor->get_tensor_total_size_in_byte();
{
LiteLayout input_layout;
LITE_get_tensor_layout(c_input_tensor, &input_layout);
ASSERT_TRUE(input_layout.data_type == LITE_INT8);
std::vector<int> input_shape = {1, 224, 224, 3};
for (size_t i = 0; i < input_layout.ndim; i++) {
ASSERT_TRUE(input_layout.shapes[i] == static_cast<size_t>(input_shape[i]));
}
}

{
int size_attr = 0;
LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_input_tensor, nullptr, nullptr,
&size_attr));
ASSERT_TRUE(size_attr > 0);
const char* keys[size_attr];
void* values[size_attr];
LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_input_tensor, keys, values,
&size_attr));
ASSERT_TRUE(size_attr > 5);
std::unordered_map<std::string, uint32_t> result_map = {
{"zp", 0},
{"index", 0},
{"size_with_stride", 150528},
{"stride", 224},
{"n_size", 150528},
{"n_elems", 150528},
{"qnt_type", 2},
{"n_dims", 4},
{"type", 2},
{"fmt", 1},
{"dims0", 1},
{"dims1", 224},
{"dims2", 224},
{"dims3", 3},
};
for (int i = 0; i < size_attr; i++) {
std::string key(keys[i]);
if (key == "names") {
ASSERT_TRUE(std::string("input") ==
std::string(static_cast<const char*>(values[i])));
} else if (key == "scale") {
float scale = *static_cast<float*>(values[i]);
ASSERT_TRUE(std::fabs(scale - 0.007812) < 0.00001);
} else if (key == "fl" || key == "pass_through") {
uint8_t val = *static_cast<uint8_t*>(values[i]);
if (key == "fl") {
ASSERT_TRUE(val == 0);
} else {
ASSERT_TRUE(val == 1);
}
} else {
uint32_t val = *static_cast<uint32_t*>(values[i]);
ASSERT_TRUE(result_map[std::string(keys[i])] == val);
}
}
}
const char* keys[] = {"type", "fmt"};
int info_size = SET_INFO_SIZE;
int type = TENSOR_TYPE_UINT8;
int fmt = TENSOR_FORMAT_NHWC;
void* values[] = {static_cast<void*>(&type), static_cast<void*>(&fmt)};
LITE_CAPI_CHECK(LITE_set_tensor_information(c_input_tensor, keys, values,
info_size));
ASSERT_TRUE(std::string(output_names[0]) ==
std::string("MobilenetV1/Predictions/Reshape_1"));
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO,
&c_output_tensor));

LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor,
lite_tensor->get_memory_ptr(),
data_length_in_byte));

LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO,
&c_output_tensor));
//LiteLayout tmp_output_layout;
//LITE_get_tensor_layout(c_output_tensor, &tmp_output_layout);
//tmp_output_layout.data_type = LiteDataType::LITE_FLOAT;

//LITE_set_tensor_layout(c_output_tensor, tmp_output_layout);
{
const char* keys[] = {"want_float"};
uint8_t want_float = 1;
void* values[] = {static_cast<void*>(&want_float)};
LITE_CAPI_CHECK(
LITE_set_tensor_information(c_output_tensor, keys, values, 1));
}

LITE_CAPI_CHECK(LITE_forward(c_network));
LITE_CAPI_CHECK(LITE_wait(c_network));

ASSERT_TRUE(std::string(output_names[0]) == "MobilenetV1/Predictions/Reshape_1");
ASSERT_EQ(output_names.size(), 1);
{
LiteLayout output_layout;
LITE_get_tensor_layout(c_output_tensor, &output_layout);
ASSERT_TRUE(output_layout.data_type == LITE_FLOAT);
int size_attr = 0;

LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_output_tensor, nullptr, nullptr,
&size_attr));
ASSERT_TRUE(size_attr > 0);
const char* keys[size_attr];
void* values[size_attr];
LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_output_tensor, keys, values,
&size_attr));
ASSERT_TRUE(size_attr > 5);
std::unordered_map<std::string, uint32_t> result_map = {
{"zp", 0},
{"index", 0},
{"size_with_stride", 2002},
{"stride", 0},
{"n_size", 2002},
{"n_elems", 1001},
{"qnt_type", 2},
{"n_dims", 2},
{"type", 0},
{"fmt", 2},
{"dims0", 1},
{"dims1", 1001},
};
for (int i = 0; i < size_attr; i++) {
std::string key(keys[i]);
if (key == "names") {
ASSERT_TRUE("MobilenetV1/Predictions/Reshape_1" ==
std::string(static_cast<const char*>(values[i])));

} else if (key == "scale") {
float scale = *static_cast<float*>(values[i]);
ASSERT_TRUE(std::fabs(scale - 1.0) < 0.00001);
} else if (key == "fl" || key == "pass_through") {
uint8_t val = *static_cast<uint8_t*>(values[i]);
ASSERT_TRUE(val == 0);
} else {
uint32_t val = *static_cast<uint32_t*>(values[i]);
ASSERT_TRUE(result_map[std::string(keys[i])] == val);
}
}
}
{
uint32_t MaxClass[5];
float fMaxProb[5];
void* output_ptr;
LITE_get_tensor_memory(c_output_tensor, &output_ptr);
float* buffer = (float*)output_ptr;
uint32_t sz = true_tensor->get_tensor_total_size_in_byte() / sizeof(float);

GetTop(buffer, fMaxProb, MaxClass, sz, 5);

std::vector<uint32_t> result_class = {
286, 464, 282, 357, 285,
};
std::vector<float> result_prob = {
0.407227, 0.365723, 0.090454, 0.018051, 0.013069,
};

for (int i = 0; i < 5; i++) {
ASSERT_TRUE(result_class[i] == MaxClass[i]);
ASSERT_TRUE(std::fabs(result_prob[i] - fMaxProb[i]) < 0.0001);
}
}

{
float* true_data = static_cast<float*>(true_tensor->get_memory_ptr());
void* output_ptr;
LITE_get_tensor_memory(c_output_tensor, &output_ptr);
float* data1 = static_cast<float*>(output_ptr);
size_t length =
true_tensor->get_tensor_total_size_in_byte() / sizeof(float);
for (size_t i = 0; i < length; i++) {
ASSERT_LT(std::abs(data1[i] - true_data[i]), 1e-3);
}
}
LITE_destroy_network(c_network);
#undef SET_INFO_SIZE
#undef TENSOR_FORMAT_NHWC
#undef TENSOR_TYPE_UINT8
}

TEST(TestCapiNetWork, rknntest_set_info_two_input) {
#define SET_INFO_SIZE 2
#define TENSOR_TYPE_UINT8 3
#define TENSOR_FORMAT_NHWC 1
LiteConfig config;
config.backend = LiteBackend::LITE_RK_NPU;
config.device_type = LiteDeviceType::LITE_NPU;
config.bare_model_cryption_name = nullptr;
auto lite_tensor = lite::get_input_data("./model/cat_224x224.npy");
auto lite_tensor_dog = lite::get_input_data("./model/dog_224x224.npy");
auto true_tensor = lite::get_input_data("./output_data.npy");
auto rknn_model = "./model/mobilenet_v1.rknn";

LiteNetwork c_network;
LITE_CAPI_CHECK(LITE_make_network_config(&c_network, config));
LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, rknn_model));

size_t input_size, output_size;
LITE_get_all_input_name(c_network, &input_size, nullptr);
LITE_get_all_output_name(c_network, &output_size, nullptr);

std::vector<const char*> input_names(input_size);
std::vector<const char*> output_names(output_size);
LiteTensor c_input_tensor, c_output_tensor;

LITE_get_all_input_name(c_network, nullptr, input_names.data());
LITE_get_all_output_name(c_network, nullptr, output_names.data());
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, input_names[0], LITE_IO,
&c_input_tensor));

size_t input_length = 0;
LITE_get_tensor_total_size_in_byte(c_input_tensor, &input_length);

size_t data_length_in_byte = lite_tensor->get_tensor_total_size_in_byte();
{
LiteLayout input_layout;
LITE_get_tensor_layout(c_input_tensor, &input_layout);
ASSERT_TRUE(input_layout.data_type == LITE_INT8);
std::vector<int> input_shape = {1, 224, 224, 3};
for (size_t i = 0; i < input_layout.ndim; i++) {
ASSERT_TRUE(input_layout.shapes[i] == static_cast<size_t>(input_shape[i]));
}
}

const char* keys[] = {"type", "fmt"};
int info_size = SET_INFO_SIZE;
int type = TENSOR_TYPE_UINT8;
int fmt = TENSOR_FORMAT_NHWC;
void* values[] = {static_cast<void*>(&type), static_cast<void*>(&fmt)};
LITE_CAPI_CHECK(LITE_set_tensor_information(c_input_tensor, keys, values,
info_size));
ASSERT_TRUE(std::string(output_names[0]) ==
std::string("MobilenetV1/Predictions/Reshape_1"));
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO,
&c_output_tensor));

LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor,
lite_tensor->get_memory_ptr(),
data_length_in_byte));

LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO,
&c_output_tensor));
{
const char* keys[] = {"want_float"};
uint8_t want_float = 1;
void* values[] = {static_cast<void*>(&want_float)};
LITE_CAPI_CHECK(
LITE_set_tensor_information(c_output_tensor, keys, values, 1));
}

LITE_CAPI_CHECK(LITE_forward(c_network));
LITE_CAPI_CHECK(LITE_wait(c_network));

ASSERT_TRUE(std::string(output_names[0]) ==
"MobilenetV1/Predictions/Reshape_1");
ASSERT_EQ(output_names.size(), 1);
{
uint32_t MaxClass[5];
float fMaxProb[5];
void* output_ptr;
LITE_get_tensor_memory(c_output_tensor, &output_ptr);
float* buffer = (float*)output_ptr;
uint32_t sz =
true_tensor->get_tensor_total_size_in_byte() / sizeof(float);

GetTop(buffer, fMaxProb, MaxClass, sz, 5);

std::vector<uint32_t> result_class = {
286, 464, 282, 357, 285,
};
std::vector<float> result_prob = {
0.407227, 0.365723, 0.090454, 0.018051, 0.013069,
};

for (int i = 0; i < 5; i++) {
ASSERT_TRUE(result_class[i] == MaxClass[i]);
ASSERT_TRUE(std::fabs(result_prob[i] - fMaxProb[i]) < 0.0001);
}
}

{
float* true_data = static_cast<float*>(true_tensor->get_memory_ptr());
void* output_ptr;
LITE_get_tensor_memory(c_output_tensor, &output_ptr);
float* data1 = static_cast<float*>(output_ptr);
size_t length =
true_tensor->get_tensor_total_size_in_byte() / sizeof(float);
for (size_t i = 0; i < length; i++) {
ASSERT_LT(std::abs(data1[i] - true_data[i]), 1e-3);
}
}

LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor,
lite_tensor_dog->get_memory_ptr(),
data_length_in_byte));
LITE_CAPI_CHECK(LITE_forward(c_network));
LITE_CAPI_CHECK(LITE_wait(c_network));
ASSERT_TRUE(std::string(output_names[0]) ==
"MobilenetV1/Predictions/Reshape_1");
ASSERT_EQ(output_names.size(), 1);
{
uint32_t MaxClass[5];
float fMaxProb[5];
void* output_ptr;
LITE_get_tensor_memory(c_output_tensor, &output_ptr);
float* buffer = (float*)output_ptr;
uint32_t sz =
true_tensor->get_tensor_total_size_in_byte() / sizeof(float);

GetTop(buffer, fMaxProb, MaxClass, sz, 5);

std::vector<float> result_prob = {
0.407227, 0.365723, 0.090454, 0.018051, 0.013069,
};

for (int i = 0; i < 5; i++) {
ASSERT_FALSE(std::fabs(result_prob[i] - fMaxProb[i]) < 0.0001);
}
}

LITE_destroy_network(c_network);
#undef SET_INFO_SIZE
#undef TENSOR_FORMAT_NHWC
#undef TENSOR_TYPE_UINT8
}
#endif

TEST(TestCapiNetWork, BasicResetOutput) {
ForwardMgb;
LiteNetwork c_network;
LITE_CAPI_CHECK(LITE_make_default_network(&c_network));
LoadNetwork;
SetInput;
LiteLayout output_layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT};
std::shared_ptr<float> ptr(new float[1000],
[](float* ptr) { delete[] ptr; });
const char* output_name;
LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name));
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO,
&c_output_tensor));
LITE_CAPI_CHECK(
LITE_reset_tensor(c_output_tensor, output_layout, ptr.get()));

ForwardNetwork;

EXPECT_TRUE(lite::compare_memory<float>(
ptr.get(), result_mgb->get_memory_ptr(),
result_mgb->get_tensor_total_size_in_byte() / sizeof(float)));
LITE_CAPI_CHECK(LITE_destroy_network(c_network));
}

TEST(TestCapiNetWork, BasicInplaceAndSingleThreadAffinity) {
ForwardMgb;
MakeNetwork;
//! config the network with cpu inplace mode
LITE_CAPI_CHECK(LITE_set_cpu_inplace_mode(c_network));
LoadNetwork;
//! set single thread affinity callback
LITE_CAPI_CHECK(LITE_set_runtime_thread_affinity(c_network,
single_thread_affinity));
SetInput;
ForwardNetwork;
ASSERT_EQ(affinity_set, true);
affinity_set = false;
GetOutput;
CompareResult;
LITE_destroy_network(c_network);
}

TEST(TestCapiNetWork, UserAllocator) {
ForwardMgb;
MakeNetwork;
LITE_CAPI_CHECK(LITE_set_memory_allocator(c_network, allocate, free));
LoadNetwork;
SetInput;
ForwardNetwork;

ASSERT_GE(m_nr_allocated, 1);
GetOutput;
CompareResult;
LITE_CAPI_CHECK(LITE_destroy_network(c_network));
ASSERT_EQ(m_nr_left, 0);
}

TEST(TestCapiNetWork, BasicMultiThread) {
ForwardMgb;
MakeNetwork;
LITE_CAPI_CHECK(LITE_set_cpu_threads_number(c_network, NUMBER_THREDS));
LoadNetwork;
LITE_CAPI_CHECK(
LITE_set_runtime_thread_affinity(c_network, multi_thread_affinity));
SetInput;
ForwardNetwork;
for (size_t i = 0; i < NUMBER_THREDS; i++) {
for (size_t j = i + 1; j < NUMBER_THREDS; j++) {
ASSERT_NE(thread_ids[i], thread_ids[j]);
}
}
for (size_t i = 0; i < NUMBER_THREDS; i++) {
thread_ids[i] = std::thread::id();
}
GetOutput;
CompareResult;
LITE_CAPI_CHECK(LITE_destroy_network(c_network));
}

TEST(TestCapiNetWork, DeviceIO) {
ForwardMgb;
LiteNetwork c_network;
LiteIO input_io = default_io;
input_io.is_host = true;
input_io.name = "data";
LiteNetworkIO network_io = *default_network_io();
network_io.inputs = &input_io;
network_io.input_size = 1;
LITE_CAPI_CHECK(LITE_make_network(&c_network, *default_config(), network_io));
LoadNetwork;
SetInput;
ForwardNetwork;
GetOutput;
CompareResult;
LITE_CAPI_CHECK(LITE_destroy_network(c_network));
}

TEST(TestCapiNetWork, StartCallBack) {
ForwardMgb;
MakeNetwork;
LoadNetwork;
LITE_CAPI_CHECK(LITE_set_start_callback(c_network, start_callback));
SetInput;
ForwardNetwork;
GetOutput;
CompareResult;
ASSERT_TRUE(start_checked);
LITE_CAPI_CHECK(LITE_destroy_network(c_network));
}

TEST(TestCapiNetWork, FinishCallBack) {
ForwardMgb;
MakeNetwork;
LoadNetwork;
LITE_CAPI_CHECK(LITE_set_finish_callback(c_network, finish_callback));
SetInput;
ForwardNetwork;
GetOutput;
CompareResult;
ASSERT_TRUE(finish_checked);
LITE_CAPI_CHECK(LITE_destroy_network(c_network));
}

TEST(TestCapiNetWork, BasicCryptAes) {
ForwardMgb;

LiteConfig c_config = *default_config();
c_config.bare_model_cryption_name = "AES_default";
LiteNetwork c_network;
LITE_CAPI_CHECK(
LITE_make_network(&c_network, c_config, *default_network_io()));
std::string model_crypt_path = "./shufflenet_crypt_aes.mge";

LITE_CAPI_CHECK(
LITE_load_model_from_path(c_network, model_crypt_path.c_str()));

SetInput;
ForwardNetwork;
GetOutput;
CompareResult;
LITE_CAPI_CHECK(LITE_destroy_network(c_network));
}

TEST(TestCapiNetWork, PackedCryptRc4) {
ForwardMgb;
MakeNetwork;

std::string model_crypt_path = "./test_packed_model_rc4.lite";
LITE_CAPI_CHECK(
LITE_load_model_from_path(c_network, model_crypt_path.c_str()));

SetInput;
ForwardNetwork;
GetOutput;
CompareResult;
LITE_CAPI_CHECK(LITE_destroy_network(c_network));
}

TEST(TestCapiNetWork, AsyncExec) {
finished = false;
ForwardMgb;
LiteNetwork c_network;
LiteConfig c_config = *default_config();
c_config.options.var_sanity_check_first_run = false;
LITE_CAPI_CHECK(
LITE_make_network(&c_network, c_config, *default_network_io()));
LITE_CAPI_CHECK(LITE_set_async_callback(c_network, finish_callback));
LoadNetwork;
SetInput;

LITE_forward(c_network);
size_t count = 0;
while (finished == false) {
count++;
}
ASSERT_GT(count, 0);
finished = false;

GetOutput;
CompareResult;
LITE_CAPI_CHECK(LITE_destroy_network(c_network));
}

TEST(TestCapiNetWork, OutputShapeOnly) {
ForwardMgb;
LiteNetwork c_network;
LiteNetworkIO c_network_io = *default_network_io();
LiteIO io_output = default_io;
io_output.io_type = LiteIOType::LITE_IO_SHAPE;
io_output.name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]";
c_network_io.outputs = &io_output;
c_network_io.output_size = 1;
LITE_CAPI_CHECK(
LITE_make_network(&c_network, *default_config(), c_network_io));
LoadNetwork;
SetInput;
ForwardNetwork;
GetOutput;
size_t length = 0;
LITE_CAPI_CHECK(
LITE_get_tensor_total_size_in_byte(c_output_tensor, &length));
ASSERT_EQ(length / sizeof(float), 1000);
LITE_CAPI_CHECK(LITE_destroy_network(c_network));
}

TEST(TestCapiNetWork, ProfileIOdump) {
ForwardMgb;
MakeNetwork;
LITE_CAPI_CHECK(
LITE_enable_profile_performance(c_network, "./profile.json"));
LoadNetwork;
SetInput;
ForwardNetwork;
ASSERT_TRUE(fopen("./profile.json", "r"));

LITE_CAPI_CHECK(LITE_enable_io_txt_dump(c_network, "./io_txt_dump.txt"));
ForwardNetwork;
ASSERT_TRUE(fopen("./io_txt_dump.txt", "r"));

GetOutput;
CompareResult;
LITE_CAPI_CHECK(LITE_destroy_network(c_network));
}

TEST(TestCapiNetWork, GetDeviceType) {
lite::Config config;
auto lite_tensor = lite::get_input_data("./input_data.npy");
std::string model_path = "./shufflenet.mge";
MakeNetwork;
LoadNetwork;
LiteDeviceType devicetype;
LITE_CAPI_CHECK(LITE_get_device_type(c_network, &devicetype));
ASSERT_TRUE(devicetype == LiteDeviceType::LITE_CPU);
LITE_CAPI_CHECK(LITE_destroy_network(c_network));
}

TEST(TestCapiNetWork, GetModelExtraInfo) {
lite::Config config;
std::string model_path = "./track_640_320_pack_model_rc4_with_info.lite";
MakeNetwork;
LITE_load_model_from_path(c_network, model_path.c_str());
const char* info = nullptr;
int info_size = 0;
LITE_CAPI_CHECK(LITE_get_model_extra_info(c_network, &info, &info_size));
ASSERT_TRUE(info_size > 0);
printf("info %s \n", info);
LITE_CAPI_CHECK(LITE_destroy_network(c_network));
}

TEST(TestCapiNetWork, TestWorkSpaceLimit) {
lite::Config config;
auto lite_tensor = lite::get_input_data("./input_data.npy");
size_t data_length_in_byte = lite_tensor->get_tensor_total_size_in_byte();
std::string model_path = "./shufflenet.mge";
MakeNetwork;
LoadNetwork;
printf("go to config workspace limit\n");
LITE_CAPI_CHECK(LITE_set_network_algo_workspace_limit(c_network, 1000));
SetInput;
ForwardNetwork;

GetOutput;
LITE_CAPI_CHECK(LITE_destroy_network(c_network));
}

TEST(TestCapiNetWork, TestShareWeights) {
ForwardMgb;
MakeNetwork;
LoadNetwork;
SetInput;
ForwardNetwork;

GetOutput;
CompareResult;

LiteNetwork c_network2;
LITE_CAPI_CHECK(
LITE_make_network(&c_network2, *default_config(), *default_network_io()));
LITE_CAPI_CHECK(LITE_set_cpu_inplace_mode(c_network2));
LITE_CAPI_CHECK(LITE_shared_weight_with_network(c_network2, c_network));
int is_cpu_inplace_mode = false;
LITE_CAPI_CHECK(LITE_is_cpu_inplace_mode(c_network2, &is_cpu_inplace_mode));
ASSERT_EQ(is_cpu_inplace_mode, true);

LiteTensor c_input_tensor2, c_output_tensor2;
LITE_CAPI_CHECK(
LITE_get_io_tensor(c_network2, "data", LITE_IO, &c_input_tensor2));
LITE_CAPI_CHECK(LITE_reset_tensor_memory(
c_input_tensor2, lite_tensor->get_memory_ptr(),
lite_tensor->get_tensor_total_size_in_byte()));
LITE_CAPI_CHECK(LITE_forward(c_network2));
LITE_CAPI_CHECK(LITE_wait(c_network2));
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network2, output_name, LITE_IO,
&c_output_tensor2));
void* output_ptr2;
LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor2, &output_ptr2));

EXPECT_TRUE(lite::compare_memory<float>(
output_ptr2, result_mgb->get_memory_ptr(),
result_mgb->get_tensor_total_size_in_byte() / sizeof(float)));

LITE_CAPI_CHECK(LITE_destroy_network(c_network));
LITE_CAPI_CHECK(LITE_destroy_network(c_network2));
}

#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 351
- 0
lite/test/test_network_options.cpp View File

@@ -0,0 +1,351 @@
/**
* \file test/test_network_options.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "lite_build_config.h"

#if LITE_BUILD_WITH_MGE
#include "../src/common.h"
#include "../src/misc.h"
#include "../src/mge/network_impl.h"
#include "lite/global.h"

#include "megbrain/tensor.h"
#include "test_common.h"

#include <string.h>
#include <chrono>
#include <memory>
#include <random>

using namespace lite;

TEST(TestNetWorkOptions, no_var_sanity_check_and_record) {
Config config;
auto tensor = get_input_data("./input_data.npy");
std::string model_path = "./shufflenet.mge";
std::string input_name = "data";
auto result_mgb = mgb_lar(model_path, config, input_name, tensor);

config.options.var_sanity_check_first_run = false;
config.options.comp_node_seq_record_level = 1;

std::shared_ptr<Network> network = std::make_shared<Network>(config);
network->load_model(model_path);
std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name);

auto src_ptr = tensor->get_memory_ptr();
auto src_layout = tensor->get_layout();
input_tensor->reset(src_ptr, src_layout);
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
auto result_tensor = std::make_shared<Tensor>(
LiteDeviceType::LITE_CPU,
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});

void* out_data = result_tensor->get_memory_ptr();
output_tensor->reset(out_data, result_tensor->get_layout());

network->forward();
network->wait();

compare_lite_tensor<float>(output_tensor, result_mgb);
}

TEST(TestNetWorkOptions, const_shape) {
Config config;
auto tensor = get_input_data("./input_data.npy");
std::string model_path = "./shufflenet.mge";
std::string input_name = "data";
auto result_mgb = mgb_lar(model_path, config, input_name, tensor);

config.options.var_sanity_check_first_run = false;
config.options.const_shape = true;
std::shared_ptr<Network> network = std::make_shared<Network>(config);

network->load_model(model_path);

std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name);

auto src_ptr = tensor->get_memory_ptr();
auto src_layout = tensor->get_layout();
input_tensor->reset(src_ptr, src_layout);

std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
auto result_tensor = std::make_shared<Tensor>(
LiteDeviceType::LITE_CPU,
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});

void* out_data = result_tensor->get_memory_ptr();
output_tensor->reset(out_data, result_tensor->get_layout());

network->forward();
network->wait();

compare_lite_tensor<float>(output_tensor, result_mgb);
}

TEST(TestNetWorkOptions, NCHW44) {
Config config;
auto tensor = get_input_data("./input_data.npy");
std::string model_path = "./shufflenet.mge";
std::string input_name = "data";
auto result_mgb = mgb_lar(model_path, config, input_name, tensor);

config.options.var_sanity_check_first_run = false;
config.options.enable_nchw44 = true;
std::shared_ptr<Network> network = std::make_shared<Network>(config);

Runtime::set_network_algo_policy(
network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE |
LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE);

network->load_model(model_path);

std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name);

auto src_ptr = tensor->get_memory_ptr();
auto src_layout = tensor->get_layout();
input_tensor->reset(src_ptr, src_layout);

std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
auto result_tensor = std::make_shared<Tensor>(
LiteDeviceType::LITE_CPU,
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});

void* out_data = result_tensor->get_memory_ptr();
output_tensor->reset(out_data, result_tensor->get_layout());

network->forward();
network->wait();

compare_lite_tensor<float>(output_tensor, result_mgb);
}

TEST(TestNetWorkOptions, test_cache) {
Config config;
auto tensor = get_input_data("./input_data.npy");
std::string model_path = "./shufflenet.mge";
std::string input_name = "data";
auto result_mgb = mgb_lar(model_path, config, input_name, tensor);

std::shared_ptr<Network> network = std::make_shared<Network>(config);

set_persistent_cache("./algo_cache.txt", true);
network->load_model(model_path);
Runtime::set_network_algo_policy(
network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE |
LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE);

std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name);

auto src_ptr = tensor->get_memory_ptr();
auto src_layout = tensor->get_layout();
input_tensor->reset(src_ptr, src_layout);

std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
auto result_tensor = std::make_shared<Tensor>(
LiteDeviceType::LITE_CPU,
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});

void* out_data = result_tensor->get_memory_ptr();
output_tensor->reset(out_data, result_tensor->get_layout());

network->forward();
network->wait();

compare_lite_tensor<float>(output_tensor, result_mgb);

dump_persistent_cache("./algo_cache.txt");
ASSERT_TRUE(fopen("./algo_cache.txt", "r"));

set_persistent_cache("./algo_cache.txt");
network->forward();
network->wait();
compare_lite_tensor<float>(output_tensor, result_mgb);
}

TEST(TestNetWorkOptions, FastRunIgnorBatch) {
Config config;
auto tensor = get_input_data("./input_data.npy");
std::string model_path = "./shufflenet.mge";
std::string input_name = "data";
auto result_mgb = mgb_lar(model_path, config, input_name, tensor);

std::shared_ptr<Network> network = std::make_shared<Network>(config);

set_persistent_cache("./algo_cache.txt");
network->load_model(model_path);
Runtime::set_network_algo_policy(
network,
LiteAlgoSelectStrategy::LITE_ALGO_PROFILE |
LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE,
1, true);

std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name);

auto src_ptr = tensor->get_memory_ptr();
auto src_layout = tensor->get_layout();
input_tensor->reset(src_ptr, src_layout);

std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
auto result_tensor = std::make_shared<Tensor>(
LiteDeviceType::LITE_CPU,
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});

void* out_data = result_tensor->get_memory_ptr();
output_tensor->reset(out_data, result_tensor->get_layout());

network->forward();
network->wait();

compare_lite_tensor<float>(output_tensor, result_mgb);

dump_persistent_cache("./algo_cache.txt");
ASSERT_TRUE(fopen("./algo_cache.txt", "r"));
}

#if LITE_WITH_CUDA
TEST(TestNetWorkOptions, NCHW4) {
Config config;
config.device_type = LiteDeviceType::LITE_CUDA;
auto tensor = get_input_data("./input_data.npy");
std::string model_path = "./shufflenet.mge";
std::string input_name = "data";
auto result_mgb = mgb_lar(model_path, config, input_name, tensor);

config.options.enable_nchw4 = 1;
std::shared_ptr<Network> network = std::make_shared<Network>(config);

network->load_model(model_path);

std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name);

auto src_ptr = tensor->get_memory_ptr();
auto src_layout = tensor->get_layout();
input_tensor->reset(src_ptr, src_layout);

std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
auto result_tensor = std::make_shared<Tensor>(
LiteDeviceType::LITE_CPU,
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});

void* out_data = result_tensor->get_memory_ptr();
output_tensor->reset(out_data, result_tensor->get_layout());

network->forward();
network->wait();

compare_lite_tensor<float>(output_tensor, result_mgb);
}

TEST(TestNetWorkOptions, NCHW32) {
Config config;
config.device_type = LiteDeviceType::LITE_CUDA;
auto tensor = get_input_data("./input_data.npy");
std::string model_path = "./shufflenet.mge";
std::string input_name = "data";
auto result_mgb = mgb_lar(model_path, config, input_name, tensor);

config.options.enable_nchw32 = 1;
std::shared_ptr<Network> network = std::make_shared<Network>(config);
Runtime::set_network_algo_policy(
network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE |
LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE);
network->load_model(model_path);

std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name);

auto src_ptr = tensor->get_memory_ptr();
auto src_layout = tensor->get_layout();
input_tensor->reset(src_ptr, src_layout);

std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
auto result_tensor = std::make_shared<Tensor>(
LiteDeviceType::LITE_CPU,
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});

void* out_data = result_tensor->get_memory_ptr();
output_tensor->reset(out_data, result_tensor->get_layout());

network->forward();
network->wait();
compare_lite_tensor<float>(output_tensor, result_mgb);
}

TEST(TestNetWorkOptions, jit_level) {
Config config;
config.device_type = LiteDeviceType::LITE_CUDA;
auto tensor = get_input_data("./input_data.npy");
std::string model_path = "./shufflenet.mge";
std::string input_name = "data";
auto result_mgb = mgb_lar(model_path, config, input_name, tensor);

config.options.jit_level = 1;
std::shared_ptr<Network> network = std::make_shared<Network>(config);

network->load_model(model_path);

std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name);

auto src_ptr = tensor->get_memory_ptr();
auto src_layout = tensor->get_layout();
input_tensor->reset(src_ptr, src_layout);

std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
auto result_tensor = std::make_shared<Tensor>(
LiteDeviceType::LITE_CPU,
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});

void* out_data = result_tensor->get_memory_ptr();
output_tensor->reset(out_data, result_tensor->get_layout());

network->forward();
network->wait();

compare_lite_tensor<float>(output_tensor, result_mgb);
}
#endif

#if MGB_ENABLE_TENSOR_RT && LITE_WITH_CUDA
TEST(TestNetWorkOptions, TensorRT) {
Config config;
config.device_type = LiteDeviceType::LITE_CUDA;
auto tensor = get_input_data("./input_data.npy");
std::string model_path = "./shufflenet.mge";
std::string input_name = "data";
auto result_mgb = mgb_lar(model_path, config, input_name, tensor);

std::shared_ptr<Network> network = std::make_shared<Network>(config);
Runtime::use_tensorrt(network);

set_tensor_rt_cache("./tensorrt_cache.txt");
network->load_model(model_path);

std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name);

auto src_ptr = tensor->get_memory_ptr();
auto src_layout = tensor->get_layout();
input_tensor->reset(src_ptr, src_layout);

std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
auto result_tensor = std::make_shared<Tensor>(
LiteDeviceType::LITE_CPU,
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});

void* out_data = result_tensor->get_memory_ptr();
output_tensor->reset(out_data, result_tensor->get_layout());

network->forward();
network->wait();
dump_tensor_rt_cache();
ASSERT_TRUE(fopen("./tensorrt_cache.txt", "r"));
compare_lite_tensor<float>(output_tensor, result_mgb);
}
#endif
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 589
- 0
lite/test/test_tensor.cpp View File

@@ -0,0 +1,589 @@
/**
* \file test/test_tensor.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "lite_build_config.h"

#if LITE_BUILD_WITH_MGE
#include "../src/misc.h"
#include "../src/mge/common.h"
#include "../src/mge/network_impl.h"
#include "lite/tensor.h"

#include <gtest/gtest.h>

#include <string.h>
#include <memory>

using namespace lite;

TEST(TestTensor, Basic) {
Layout layout{{1, 3, 224, 224}, 4};
Tensor tensor1(LiteDeviceType::LITE_CPU);
Tensor tensor2(LiteDeviceType::LITE_CPU, layout);
Tensor tensor3(LiteDeviceType::LITE_CPU, layout);
//! mge tensor has been created
ASSERT_TRUE(TensorHelper::implement(&tensor1));
ASSERT_TRUE(TensorHelper::implement(&tensor2));
ASSERT_TRUE(TensorHelper::implement(&tensor3));
//! check member
ASSERT_EQ(tensor2.get_device_type(), LiteDeviceType::LITE_CPU);
ASSERT_EQ(tensor2.get_layout(), layout);
ASSERT_EQ(tensor3.get_layout(), layout);
//! check the real tensor
ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4);
ASSERT_EQ(tensor3.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4);

ASSERT_TRUE(TensorHelper::implement(&tensor1)
->cast_final_safe<TensorImplDft>()
.host_tensor());

ASSERT_FALSE(TensorHelper::implement(&tensor1)
->cast_final_safe<TensorImplDft>()
.dev_tensor());
ASSERT_FALSE(TensorHelper::implement(&tensor1)
->cast_final_safe<TensorImplDft>()
.dev_tensor());
ASSERT_TRUE(TensorHelper::implement(&tensor1)
->cast_final_safe<TensorImplDft>()
.host_tensor());
}

TEST(TestTensor, SetLayoutReAlloc) {
Layout layout{{1, 3, 224, 224}, 4};
Tensor tensor1;
Tensor tensor2(LiteDeviceType::LITE_CPU, layout);
Tensor tensor3(LiteDeviceType::LITE_CPU, layout);
auto old_ptr2 = tensor2.get_memory_ptr();
auto old_ptr3 = tensor3.get_memory_ptr();

//! set a new, smaller layout through set_layout
Layout layout1{{1, 3, 100, 100}, 4, LiteDataType::LITE_INT8};
tensor1.set_layout(layout1);
tensor2.set_layout(layout1);
tensor3.set_layout(layout1);
ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 100 * 100);
ASSERT_EQ(tensor3.get_tensor_total_size_in_byte(), 1 * 3 * 100 * 100);
auto layout2 = TensorHelper::implement(&tensor2)
->cast_final_safe<TensorImplDft>()
.host_tensor()
->layout();
auto layout3 = TensorHelper::implement(&tensor3)
->cast_final_safe<TensorImplDft>()
.host_tensor()
->layout();
ASSERT_EQ(to_lite_layout(layout2), layout1);
ASSERT_EQ(to_lite_layout(layout3), layout1);

auto new_ptr2 = tensor2.get_memory_ptr();
auto new_ptr3 = tensor3.get_memory_ptr();

ASSERT_EQ(old_ptr2, new_ptr2);
ASSERT_EQ(old_ptr3, new_ptr3);
}

TEST(TestTensor, Reset) {
Layout layout{{3, 20}, 2, LiteDataType::LITE_FLOAT};
Tensor tensor1;
Tensor tensor2(LiteDeviceType::LITE_CPU, layout);
Tensor tensor3(LiteDeviceType::LITE_CPU, layout);

auto old_ptr2 = tensor2.get_memory_ptr();
auto old_ptr3 = tensor3.get_memory_ptr();
//! make sure memory is allocated
ASSERT_NO_THROW(memcpy(old_ptr2, old_ptr3, 3 * 20 * 2));

std::shared_ptr<float> new_ptr2(new float[3 * 20],
[](float* ptr) { delete[] ptr; });
std::shared_ptr<float> new_ptr3(new float[3 * 20],
[](float* ptr) { delete[] ptr; });
tensor1.reset(new_ptr2.get(), layout);
tensor2.reset(new_ptr2.get(), 3 * 20 * 4);
tensor3.reset(new_ptr3.get(), 3 * 20 * 4);
//! After reset the original mem is freed
/*ASSERT_EXIT((memcpy(old_ptr2, old_ptr3, 3 * 20 * 2), exit(0)),
::testing::KilledBySignal(SIGSEGV), ".*");*/

ASSERT_EQ(tensor2.get_memory_ptr(), new_ptr2.get());
ASSERT_EQ(tensor3.get_memory_ptr(), new_ptr3.get());

ASSERT_NO_THROW(memcpy(new_ptr2.get(), new_ptr3.get(), 3 * 20 * 2));

Layout layout1{{6, 20}, 2, LiteDataType::LITE_FLOAT};
std::shared_ptr<float> ptr2(new float[6 * 20],
[](float* ptr) { delete[] ptr; });
std::shared_ptr<float> ptr3(new float[6 * 20],
[](float* ptr) { delete[] ptr; });
tensor2.reset(ptr2.get(), layout1);
tensor3.reset(ptr3.get(), layout1);

//! memory is not freed by Tensor reset
ASSERT_NO_THROW(memcpy(new_ptr2.get(), new_ptr3.get(), 3 * 20 * 2));
auto host_layout2 = TensorHelper::implement(&tensor2)
->cast_final_safe<TensorImplDft>()
.host_tensor()
->layout();
auto host_layout3 = TensorHelper::implement(&tensor3)
->cast_final_safe<TensorImplDft>()
.host_tensor()
->layout();

ASSERT_EQ(to_lite_layout(host_layout2), layout1);
ASSERT_EQ(to_lite_layout(host_layout3), layout1);
}

TEST(TestTensor, CrossCNCopy) {
Layout layout{{1, 3, 224, 224}, 4};
Tensor tensor1(LiteDeviceType::LITE_CPU);
Tensor tensor2(LiteDeviceType::LITE_CPU, layout);
Tensor tensor3(LiteDeviceType::LITE_CPU, layout);
tensor2.copy_from(tensor3);
tensor3.copy_from(tensor2);
auto old_ptr2 = tensor2.get_memory_ptr();
auto old_ptr3 = tensor3.get_memory_ptr();

//! copying from an empty source tensor should throw
ASSERT_THROW(tensor2.copy_from(tensor1), std::exception);
tensor1.copy_from(tensor2);
tensor2.copy_from(tensor3);
tensor3.copy_from(tensor2);

ASSERT_EQ(tensor2.get_memory_ptr(), old_ptr2);
ASSERT_EQ(tensor3.get_memory_ptr(), old_ptr3);
}

TEST(TestTensor, SharedTensorMemory) {
Layout layout{{1, 3, 224, 224}, 4};
Tensor tensor1(LiteDeviceType::LITE_CPU);
{
Tensor tensor2(LiteDeviceType::LITE_CPU, layout);
tensor1.share_memory_with(tensor2);
auto ptr1 = tensor1.get_memory_ptr();
auto ptr2 = tensor2.get_memory_ptr();
ASSERT_EQ(ptr1, ptr2);
}
// check that after tensor2 is destroyed, tensor1 can still access the memory
auto ptr1 = static_cast<float*>(tensor1.get_memory_ptr());
size_t length = tensor1.get_tensor_total_size_in_byte() /
tensor1.get_layout().get_elem_size();
for (size_t i = 0; i < length; i++) {
ptr1[i] = i;
}
}

TEST(TestTensor, Reshape) {
Layout layout{{1, 3, 224, 224}, 4};
Tensor tensor2(LiteDeviceType::LITE_CPU, layout);
auto ptr = tensor2.get_memory_ptr();

//! test invalid reshape shapes
ASSERT_THROW(tensor2.reshape({-1, -1, 3 * 224 * 224}), std::exception);
ASSERT_THROW(tensor2.reshape({-1, 3, 3 * 224 * 224}), std::exception);
ASSERT_THROW(tensor2.reshape({1, 3, 3 * 224 * 224}), std::exception);
ASSERT_THROW(tensor2.reshape({3, 3, 3 * 224 * 224}), std::exception);

tensor2.reshape({3 * 224 * 224});
ASSERT_EQ(tensor2.get_layout().ndim, 1);
ASSERT_EQ(tensor2.get_layout().data_type, LiteDataType::LITE_FLOAT);
ASSERT_EQ(tensor2.get_layout().shapes[0], 3 * 224 * 224);
tensor2.reshape({-1, 224, 224});
ASSERT_EQ(tensor2.get_layout().ndim, 3);
ASSERT_EQ(tensor2.get_layout().shapes[0], 3);
ASSERT_EQ(tensor2.get_layout().shapes[1], 224);

ASSERT_EQ(tensor2.get_memory_ptr(), ptr);
}

TEST(TestTensor, Slice) {
Layout layout{{20, 20}, 2};
Tensor tensor2(LiteDeviceType::LITE_CPU, layout);
auto ptr = tensor2.get_memory_ptr();

//! test invalid slice arguments
ASSERT_THROW(tensor2.slice({5, 10, 10}, {10, 15}), std::exception);
ASSERT_THROW(tensor2.slice({5, 10}, {10, 15}, {5}), std::exception);
ASSERT_THROW(tensor2.slice({5, 10}, {10, 15, 10}), std::exception);
for (int i = 0; i < 20 * 20; i++) {
*(static_cast<float*>(ptr) + i) = i;
}
auto check = [&](size_t start, size_t end, size_t step) {
Tensor tensor3;
tensor3.copy_from(
*tensor2.slice({start, start}, {end, end}, {step, step}));
float* new_ptr = static_cast<float*>(tensor3.get_memory_ptr());
for (size_t i = start; i < end; i += step) {
for (size_t j = start; j < end; j += step) {
ASSERT_EQ(float(i * 20 + j), *new_ptr);
++new_ptr;
}
}
};
check(5, 10, 1);
check(5, 11, 2);
check(2, 18, 4);

Tensor tensor3;
tensor3.copy_from(*tensor2.slice({3}, {9}, {2}));
float* new_ptr = static_cast<float*>(tensor3.get_memory_ptr());
for (size_t i = 3; i < 9; i += 2) {
for (size_t j = 0; j < 20; j++) {
ASSERT_EQ(float(i * 20 + j), *new_ptr);
++new_ptr;
}
}
}

TEST(TestTensor, SliceCopy) {
Layout layout{{20, 20}, 2};
Tensor tensor(LiteDeviceType::LITE_CPU, layout);
//! alloc memory
auto ptr = static_cast<float*>(tensor.get_memory_ptr());

Layout layout_slice{{20, 10}, 2};
Tensor tensor0(LiteDeviceType::LITE_CPU, layout_slice);
auto ptr0 = tensor0.get_memory_ptr();
for (int i = 0; i < 10 * 20; i++) {
*(static_cast<float*>(ptr0) + i) = i;
}
Tensor tensor1(LiteDeviceType::LITE_CPU, layout_slice);
auto ptr1 = tensor1.get_memory_ptr();
for (int i = 0; i < 10 * 20; i++) {
*(static_cast<float*>(ptr1) + i) = i + 200;
}

auto slice0 = tensor.slice({0, 0}, {20, 10});
auto slice1 = tensor.slice({0, 10}, {20, 20});

slice0->copy_from(tensor0);
slice1->copy_from(tensor1);

ASSERT_FALSE(slice0->is_continue_memory());
ASSERT_FALSE(slice1->is_continue_memory());

for (size_t i = 0; i < 20; i++) {
for (size_t j = 0; j < 10; j++) {
ASSERT_EQ(float(i * 10 + j), *ptr);
++ptr;
}
for (size_t j = 0; j < 10; j++) {
ASSERT_EQ(float(i * 10 + j + 200), *ptr);
++ptr;
}
}
slice0->fill_zero();
Tensor tmp;
tmp.copy_from(*slice0);
float* tmp_ptr = static_cast<float*>(tmp.get_memory_ptr());
for (size_t i = 0; i < 20; i++) {
for (size_t j = 0; j < 10; j++) {
ASSERT_EQ(float(0), *tmp_ptr);
++tmp_ptr;
}
}
}

TEST(TestTensor, GetPtrOffset) {
Layout layout{{20, 20}, 2};
Tensor tensor(LiteDeviceType::LITE_CPU, layout);
//! alloc memory
auto ptr = static_cast<float*>(tensor.get_memory_ptr());

auto ptr_offset = tensor.get_memory_ptr({10, 10});
ASSERT_EQ(ptr_offset, ptr + 10 * 20 + 10);

auto slice0 = tensor.slice({0, 0}, {20, 10});
auto slice1 = tensor.slice({0, 10}, {20, 20});

ASSERT_FALSE(slice0->is_continue_memory());
ASSERT_FALSE(slice1->is_continue_memory());

auto ptr_offset_slice0 = slice0->get_memory_ptr({6, 5});
auto ptr_offset_slice1 = slice1->get_memory_ptr({2, 5});

ASSERT_EQ(ptr_offset_slice0, ptr + 6 * 20 + 5);
ASSERT_EQ(ptr_offset_slice1, ptr + 2 * 20 + 10 + 5);
}

TEST(TestTensor, Concat) {
Layout layout{{5, 5, 5}, 3};
std::vector<Tensor> tensors;
for (int i = 0; i < 4; i++) {
Tensor tensor(LiteDeviceType::LITE_CPU, layout);
auto ptr = static_cast<float*>(tensor.get_memory_ptr());
for (int n = 0; n < 5 * 5 * 5; n++) {
ptr[n] = i;
}
tensors.push_back(tensor);
}
auto check = [&](int dim) {
auto new_tensor = TensorUtils::concat(tensors, dim);
auto ptr = static_cast<float*>(new_tensor->get_memory_ptr());
size_t stride = std::pow(5, (3 - dim));
for (int i = 0; i < 4; i++) {
for (size_t j = 0; j < stride; j++) {
ASSERT_EQ(ptr[i * stride + j], i);
}
}
};
check(0);
check(1);
check(2);
}

#if LITE_WITH_CUDA
TEST(TestTensor, BasicDevice) {
Layout layout{{1, 3, 224, 224}, 4};
Tensor tensor1(LiteDeviceType::LITE_CUDA, layout);
Tensor tensor2(LiteDeviceType::LITE_CPU, layout);
//! the underlying mge tensor has been created
ASSERT_TRUE(TensorHelper::implement(&tensor1));
ASSERT_TRUE(TensorHelper::implement(&tensor2));

//! check member
ASSERT_EQ(tensor1.get_device_type(), LiteDeviceType::LITE_CUDA);
ASSERT_EQ(tensor2.get_device_type(), LiteDeviceType::LITE_CPU);
ASSERT_EQ(tensor2.get_layout(), layout);
//! check the real tensor
ASSERT_EQ(tensor1.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4);
ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4);

ASSERT_TRUE(TensorHelper::implement(&tensor2)
->cast_final_safe<TensorImplDft>()
.host_tensor());

ASSERT_FALSE(TensorHelper::implement(&tensor2)
->cast_final_safe<TensorImplDft>()
.dev_tensor());
ASSERT_TRUE(TensorHelper::implement(&tensor1)
->cast_final_safe<TensorImplDft>()
.dev_tensor());
ASSERT_FALSE(TensorHelper::implement(&tensor1)
->cast_final_safe<TensorImplDft>()
.host_tensor());
}

TEST(TestTensor, SetLayoutReAllocDevice) {
Layout layout{{1, 3, 224, 224}, 4};
Tensor tensor2(LiteDeviceType::LITE_CUDA, layout);
auto old_ptr2 = tensor2.get_memory_ptr();

//! set a new, smaller layout; the underlying storage should be reused
Layout layout1{{1, 3, 100, 100}, 4, LiteDataType::LITE_INT8};
tensor2.set_layout(layout1);
ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 100 * 100);
auto layout2 = TensorHelper::implement(&tensor2)
->cast_final_safe<TensorImplDft>()
.dev_tensor()
->layout();
ASSERT_EQ(to_lite_layout(layout2), layout1);

auto new_ptr2 = tensor2.get_memory_ptr();

ASSERT_EQ(old_ptr2, new_ptr2);
}

TEST(TestTensor, CrossCNCopyDevice) {
Layout layout{{1, 3, 224, 224}, 4};
Tensor tensor0;
Tensor tensor1(LiteDeviceType::LITE_CPU);
Tensor tensor2(LiteDeviceType::LITE_CPU, layout);
Tensor tensor3(LiteDeviceType::LITE_CUDA, layout);

tensor2.copy_from(tensor3);
tensor3.copy_from(tensor2);

auto old_ptr2 = tensor2.get_memory_ptr();
auto old_ptr3 = tensor3.get_memory_ptr();
ASSERT_THROW(tensor3.copy_from(tensor1), std::exception);

tensor1.copy_from(tensor3);
tensor0.copy_from(tensor3);

tensor2.copy_from(tensor3);
tensor3.copy_from(tensor2);

ASSERT_EQ(tensor2.get_memory_ptr(), old_ptr2);
ASSERT_EQ(tensor3.get_memory_ptr(), old_ptr3);
}

TEST(TestTensor, PinnedHostMem) {
Layout layout{{1, 3, 224, 224}, 4};
Tensor tensor1(LiteDeviceType::LITE_CPU);
bool is_pinned_host = true;
Tensor tensor2(LiteDeviceType::LITE_CUDA, layout, is_pinned_host);
Tensor tensor3(LiteDeviceType::LITE_CUDA, layout);
tensor2.copy_from(tensor3);
tensor3.copy_from(tensor2);

ASSERT_EQ(tensor2.is_pinned_host(), true);
ASSERT_EQ(tensor3.is_pinned_host(), false);

auto old_ptr2 = tensor2.get_memory_ptr();
auto old_ptr3 = tensor3.get_memory_ptr();

//! test that the source tensor is empty
ASSERT_THROW(tensor2.copy_from(tensor1), std::exception);
tensor1.copy_from(tensor2);
tensor2.copy_from(tensor3);
tensor3.copy_from(tensor2);

ASSERT_EQ(tensor2.get_memory_ptr(), old_ptr2);
ASSERT_EQ(tensor3.get_memory_ptr(), old_ptr3);
}

TEST(TestTensor, DeviceId) {
if(get_device_count(LITE_CUDA) <= 1)
return;
Layout layout{{1, 3, 224, 224}, 4};
Tensor tensor2(0, LiteDeviceType::LITE_CUDA, layout);
Tensor tensor3(1, LiteDeviceType::LITE_CUDA, layout);

tensor2.copy_from(tensor3);
tensor3.copy_from(tensor2);

Tensor tensor1;
tensor1.copy_from(tensor2);
tensor1.copy_from(tensor3);
}

TEST(TestTensor, SliceDevice) {
Layout layout{{20, 20}, 2};
Tensor host_tensor0;
Tensor dev_tensor0(LiteDeviceType::LITE_CUDA, layout);
host_tensor0.copy_from(dev_tensor0);
auto ptr = host_tensor0.get_memory_ptr();

for (int i = 0; i < 20 * 20; i++) {
*(static_cast<float*>(ptr) + i) = i;
}
dev_tensor0.copy_from(host_tensor0);

auto check = [&](size_t start, size_t end, size_t step) {
Tensor host_tensor;
host_tensor.copy_from(
*dev_tensor0.slice({start, start}, {end, end}, {step, step}));
float* new_ptr = static_cast<float*>(host_tensor.get_memory_ptr());
for (size_t i = start; i < end; i += step) {
for (size_t j = start; j < end; j += step) {
ASSERT_EQ(float(i * 20 + j), *new_ptr);
++new_ptr;
}
}
};
check(5, 10, 1);
check(5, 11, 2);
check(2, 18, 4);
}

TEST(TestTensor, MemSetDevice) {
Layout layout{{20, 20}, 2, LiteDataType::LITE_INT8};
Tensor host_tensor0(LiteDeviceType::LITE_CPU, layout);
Tensor dev_tensor0(LiteDeviceType::LITE_CUDA, layout);
auto check = [&](uint8_t val, const Tensor& tensor) {
auto ptr = static_cast<uint8_t*>(tensor.get_memory_ptr());
for (int i = 0; i < 20 * 20; i++) {
ASSERT_EQ(val, *(ptr + i));
}
};
host_tensor0.fill_zero();
check(0, host_tensor0);

Tensor host_tensor1;
dev_tensor0.fill_zero();
host_tensor1.copy_from(dev_tensor0);
check(0, host_tensor1);
}

TEST(TestTensor, DeviceSliceCopy) {
Layout layout{{20, 20}, 2};
Tensor tensor(LiteDeviceType::LITE_CUDA, layout);
//! alloc memory
tensor.get_memory_ptr();

Layout layout_slice{{20, 10}, 2};
Tensor tensor0(LiteDeviceType::LITE_CPU, layout_slice);
auto ptr0 = tensor0.get_memory_ptr();
for (int i = 0; i < 10 * 20; i++) {
*(static_cast<float*>(ptr0) + i) = i;
}
Tensor tensor1(LiteDeviceType::LITE_CPU, layout_slice);
auto ptr1 = tensor1.get_memory_ptr();
for (int i = 0; i < 10 * 20; i++) {
*(static_cast<float*>(ptr1) + i) = i + 200;
}

auto slice0 = tensor.slice({0, 0}, {20, 10});
auto slice1 = tensor.slice({0, 10}, {20, 20});

slice0->copy_from(tensor0);
slice1->copy_from(tensor1);

ASSERT_FALSE(slice0->is_continue_memory());
ASSERT_FALSE(slice1->is_continue_memory());

Tensor host_tensor;
host_tensor.copy_from(tensor);
auto ptr = static_cast<float*>(host_tensor.get_memory_ptr());

for (size_t i = 0; i < 20; i++) {
for (size_t j = 0; j < 10; j++) {
ASSERT_EQ(float(i * 10 + j), *ptr);
++ptr;
}
for (size_t j = 0; j < 10; j++) {
ASSERT_EQ(float(i * 10 + j + 200), *ptr);
++ptr;
}
}
slice0->fill_zero();
Tensor tmp;
tmp.copy_from(*slice0);
float* tmp_ptr = static_cast<float*>(tmp.get_memory_ptr());
for (size_t i = 0; i < 20; i++) {
for (size_t j = 0; j < 10; j++) {
ASSERT_EQ(float(0), *tmp_ptr);
++tmp_ptr;
}
}
}

TEST(TestTensor, ConcatDevice) {
Layout layout{{5, 5, 5}, 3};
std::vector<Tensor> tensors;
for (int i = 0; i < 4; i++) {
Tensor tensor(LiteDeviceType::LITE_CPU, layout);
auto ptr = static_cast<float*>(tensor.get_memory_ptr());
for (int n = 0; n < 5 * 5 * 5; n++) {
ptr[n] = i;
}
tensors.push_back(tensor);
}
auto check = [&](int dim) {
auto new_tensor =
TensorUtils::concat(tensors, dim, LiteDeviceType::LITE_CUDA, 0);

Tensor tensor(LiteDeviceType::LITE_CPU);
tensor.copy_from(*new_tensor);
auto ptr = static_cast<float*>(tensor.get_memory_ptr());
size_t stride = std::pow(5, (3 - dim));
for (int i = 0; i < 4; i++) {
for (size_t j = 0; j < stride; j++) {
ASSERT_EQ(ptr[i * stride + j], i);
}
}
ASSERT_EQ(new_tensor->get_device_type(), LiteDeviceType::LITE_CUDA);
ASSERT_EQ(new_tensor->get_device_id(), 0);
};
check(0);
check(1);
check(2);
}
#endif
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 316
- 0
lite/test/test_tensor_c.cpp View File

@@ -0,0 +1,316 @@
/**
* \file test/test_tensor_c.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include "lite_build_config.h"

#if LITE_BUILD_WITH_MGE
#include "../src/misc.h"
#include "lite-c/global_c.h"
#include "lite-c/tensor_c.h"

#include <gtest/gtest.h>
#include <memory>

TEST(TestCapiTensor, Basic) {
LiteTensor c_tensor0, c_tensor1;
LiteTensorDesc description = default_desc;
LITE_make_tensor(description, &c_tensor0);
int is_pinned_host = false;
LITE_is_pinned_host(c_tensor0, &is_pinned_host);
ASSERT_FALSE(is_pinned_host);
LiteDeviceType device_type;
LITE_get_tensor_device_type(c_tensor0, &device_type);
ASSERT_EQ(device_type, LiteDeviceType::LITE_CPU);
size_t length = 0;
LITE_get_tensor_total_size_in_byte(c_tensor0, &length);
ASSERT_EQ(length, 0);

LiteLayout layout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT};
description.device_type = LiteDeviceType::LITE_CPU;
description.layout = layout;
description.is_pinned_host = true;
LITE_make_tensor(description, &c_tensor1);
LITE_is_pinned_host(c_tensor1, &is_pinned_host);
ASSERT_TRUE(is_pinned_host);
LITE_get_tensor_total_size_in_byte(c_tensor1, &length);
ASSERT_EQ(length, 1 * 3 * 224 * 224 * 4);

LiteLayout get_layout;
LITE_get_tensor_layout(c_tensor1, &get_layout);
ASSERT_EQ(get_layout.ndim, layout.ndim);
ASSERT_EQ(get_layout.data_type, layout.data_type);
ASSERT_EQ(get_layout.shapes[0], layout.shapes[0]);
ASSERT_EQ(get_layout.shapes[1], layout.shapes[1]);
ASSERT_EQ(get_layout.shapes[2], layout.shapes[2]);
ASSERT_EQ(get_layout.shapes[3], layout.shapes[3]);

//! test error
ASSERT_EQ(LITE_is_pinned_host(c_tensor0, nullptr), -1);
ASSERT_NE(strlen(LITE_get_last_error()), 0);
printf("The last error is: %s\n", LITE_get_last_error());

LITE_destroy_tensor(c_tensor0);
LITE_destroy_tensor(c_tensor1);
}

TEST(TestCapiTensor, SetLayoutReAlloc) {
LiteTensor c_tensor0;
LiteTensorDesc description = default_desc;
description.layout =
LiteLayout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT};
LITE_make_tensor(description, &c_tensor0);
void *old_ptr, *new_ptr;
LITE_get_tensor_memory(c_tensor0, &old_ptr);

LiteLayout new_layout =
LiteLayout{{1, 3, 100, 100}, 4, LiteDataType::LITE_INT8};
LITE_set_tensor_layout(c_tensor0, new_layout);
LITE_get_tensor_memory(c_tensor0, &new_ptr);

size_t length = 0;
LITE_get_tensor_total_size_in_byte(c_tensor0, &length);

ASSERT_EQ(length, 1 * 3 * 100 * 100);
ASSERT_EQ(old_ptr, new_ptr);
LITE_destroy_tensor(c_tensor0);
}

TEST(TestCapiTensor, Reset) {
LiteTensor c_tensor0, c_tensor1;
LiteTensorDesc description = default_desc;
description.layout = LiteLayout{{3, 20}, 2, LiteDataType::LITE_FLOAT};
LITE_make_tensor(description, &c_tensor0);
LITE_make_tensor(description, &c_tensor1);
void *old_ptr0, *old_ptr1;
LITE_get_tensor_memory(c_tensor0, &old_ptr0);
LITE_get_tensor_memory(c_tensor1, &old_ptr1);
//! make sure memory is allocated
ASSERT_NO_THROW(memcpy(old_ptr0, old_ptr1, 3 * 20 * 4));

std::shared_ptr<float> new_ptr0(new float[3 * 20],
[](float* ptr) { delete[] ptr; });
std::shared_ptr<float> new_ptr1(new float[3 * 20],
[](float* ptr) { delete[] ptr; });
LITE_reset_tensor_memory(c_tensor0, new_ptr0.get(), 3 * 20 * 4);
LITE_reset_tensor_memory(c_tensor1, new_ptr1.get(), 3 * 20 * 4);
void *tmp_ptr0, *tmp_ptr1;
LITE_get_tensor_memory(c_tensor0, &tmp_ptr0);
LITE_get_tensor_memory(c_tensor1, &tmp_ptr1);
ASSERT_EQ(tmp_ptr0, new_ptr0.get());
ASSERT_EQ(tmp_ptr1, new_ptr1.get());

ASSERT_NO_THROW(memcpy(new_ptr0.get(), new_ptr1.get(), 3 * 20 * 4));

LiteLayout layout1{{6, 20}, 2, LiteDataType::LITE_FLOAT};
std::shared_ptr<float> ptr2(new float[6 * 20],
[](float* ptr) { delete[] ptr; });
std::shared_ptr<float> ptr3(new float[6 * 20],
[](float* ptr) { delete[] ptr; });
LITE_reset_tensor(c_tensor0, layout1, ptr2.get());
LITE_reset_tensor(c_tensor1, layout1, ptr3.get());

//! memory is not freed by Tensor reset
ASSERT_NO_THROW(memcpy(new_ptr0.get(), new_ptr1.get(), 3 * 20 * 4));

LiteLayout tmp_layout0, tmp_layout1;
LITE_get_tensor_layout(c_tensor0, &tmp_layout0);
LITE_get_tensor_layout(c_tensor1, &tmp_layout1);
ASSERT_EQ(tmp_layout0.ndim, tmp_layout1.ndim);
ASSERT_EQ(tmp_layout0.data_type, tmp_layout1.data_type);
ASSERT_EQ(tmp_layout0.shapes[0], tmp_layout1.shapes[0]);
ASSERT_EQ(tmp_layout0.shapes[1], tmp_layout1.shapes[1]);

LITE_destroy_tensor(c_tensor0);
LITE_destroy_tensor(c_tensor1);
}

TEST(TestCapiTensor, CrossCNCopy) {
LiteTensor c_tensor0, c_tensor1, c_tensor2;
LiteTensorDesc description = default_desc;
LITE_make_tensor(description, &c_tensor0);

description.layout =
LiteLayout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT};
LITE_make_tensor(description, &c_tensor1);
LITE_make_tensor(description, &c_tensor2);

LITE_tensor_copy(c_tensor1, c_tensor2);
LITE_tensor_copy(c_tensor2, c_tensor1);
void *old_ptr1, *old_ptr2, *new_ptr1, *new_ptr2;
LITE_get_tensor_memory(c_tensor1, &old_ptr1);
LITE_get_tensor_memory(c_tensor2, &old_ptr2);

//! test that the source tensor is empty
ASSERT_EQ(LITE_tensor_copy(c_tensor1, c_tensor0), -1);
ASSERT_NE(strlen(LITE_get_last_error()), 0);
printf("The last error is: %s\n", LITE_get_last_error());

LITE_tensor_copy(c_tensor0, c_tensor1);
LITE_tensor_copy(c_tensor1, c_tensor2);
LITE_tensor_copy(c_tensor2, c_tensor0);

LITE_get_tensor_memory(c_tensor1, &new_ptr1);
LITE_get_tensor_memory(c_tensor2, &new_ptr2);

ASSERT_EQ(old_ptr1, new_ptr1);
ASSERT_EQ(old_ptr2, new_ptr2);

LITE_destroy_tensor(c_tensor0);
LITE_destroy_tensor(c_tensor1);
LITE_destroy_tensor(c_tensor2);
}

TEST(TestCapiTensor, ShareMemoryWith) {
LiteTensor c_tensor0, c_tensor1;
LiteTensorDesc description = default_desc;
LITE_make_tensor(description, &c_tensor0);

description.layout =
LiteLayout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT};
LITE_make_tensor(description, &c_tensor1);

ASSERT_EQ(LITE_tensor_share_memory_with(c_tensor1, c_tensor0), -1);
LITE_tensor_share_memory_with(c_tensor0, c_tensor1);
void *ptr0, *ptr1;
LITE_get_tensor_memory(c_tensor0, &ptr0);
LITE_get_tensor_memory(c_tensor1, &ptr1);

ASSERT_EQ(ptr0, ptr1);

LITE_destroy_tensor(c_tensor0);
LITE_destroy_tensor(c_tensor1);
}

TEST(TestCapiTensor, Reshape) {
LiteTensor c_tensor0;
LiteTensorDesc description = default_desc;
description.layout =
LiteLayout{{8, 8, 100, 100}, 4, LiteDataType::LITE_FLOAT};
LITE_make_tensor(description, &c_tensor0);
void* old_ptr;
LITE_get_tensor_memory(c_tensor0, &old_ptr);

auto check = [&](std::vector<size_t> expect, const LiteTensor& tensor) {
LiteLayout get_layout;
LITE_get_tensor_layout(tensor, &get_layout);
ASSERT_EQ(get_layout.ndim, expect.size());
for (size_t i = 0; i < expect.size(); i++) {
ASSERT_EQ(get_layout.shapes[i], expect[i]);
}
void* new_ptr;
LITE_get_tensor_memory(tensor, &new_ptr);
ASSERT_EQ(old_ptr, new_ptr);
};
{
int shape[2] = {-1, 50};
LITE_tensor_reshape(c_tensor0, shape, 2);
check({8 * 8 * 100 * 2, 50}, c_tensor0);
}
{
int shape[3] = {64, 100, 100};
LITE_tensor_reshape(c_tensor0, shape, 3);
check({8 * 8, 100, 100}, c_tensor0);
}
{
int shape[3] = {16, 100, -1};
LITE_tensor_reshape(c_tensor0, shape, 3);
check({16, 100, 400}, c_tensor0);
}
LITE_destroy_tensor(c_tensor0);
}

TEST(TestCapiTensor, Slice) {
LiteTensor c_tensor0;
LiteTensorDesc description = default_desc;
description.layout = LiteLayout{{20, 20}, 2, LiteDataType::LITE_FLOAT};
LITE_make_tensor(description, &c_tensor0);
void* old_ptr;
LITE_get_tensor_memory(c_tensor0, &old_ptr);
for (size_t i = 0; i < 20 * 20; i++) {
*(static_cast<float*>(old_ptr) + i) = i;
}
auto check = [&](size_t start, size_t end, size_t step, bool have_step) {
LiteTensor tensor, slice_tensor;
LITE_make_tensor(default_desc, &tensor);
size_t start_ptr[2] = {start, start};
size_t end_ptr[2] = {end, end};
size_t step_ptr[2] = {step, step};

if (have_step) {
LITE_tensor_slice(c_tensor0, start_ptr, end_ptr, step_ptr, 2,
&slice_tensor);
} else {
LITE_tensor_slice(c_tensor0, start_ptr, end_ptr, nullptr, 2,
&slice_tensor);
}
int is_continue = true;
LITE_is_memory_continue(slice_tensor, &is_continue);
ASSERT_FALSE(is_continue);

LITE_tensor_copy(tensor, slice_tensor);
void* new_ptr;
LITE_get_tensor_memory(tensor, &new_ptr);
float* ptr = static_cast<float*>(new_ptr);
for (size_t i = start; i < end; i += step) {
for (size_t j = start; j < end; j += step) {
ASSERT_EQ(float(i * 20 + j), *ptr);
++ptr;
}
}
LITE_destroy_tensor(tensor);
};
check(1, 8, 1, true);
check(1, 8, 1, false);
check(2, 10, 2, true);
check(10, 18, 4, true);
check(10, 18, 1, false);
LITE_destroy_tensor(c_tensor0);
}

TEST(TestCapiTensor, Memset) {
LiteTensor c_tensor0;
LiteTensorDesc description = default_desc;
description.layout = LiteLayout{{20, 20}, 2, LiteDataType::LITE_FLOAT};
LITE_make_tensor(description, &c_tensor0);
void* ptr;
uint8_t* uint8_ptr;
LITE_get_tensor_memory(c_tensor0, &ptr);
LITE_tensor_fill_zero(c_tensor0);
uint8_ptr = static_cast<uint8_t*>(ptr);
for (size_t i = 0; i < 20 * 20; i++) {
ASSERT_EQ(0, *uint8_ptr);
uint8_ptr++;
}

LITE_destroy_tensor(c_tensor0);
}

TEST(TestCapiTensor, GetMemoryByIndex) {
LiteTensor c_tensor0;
LiteTensorDesc description = default_desc;
description.layout = LiteLayout{{20, 20}, 2, LiteDataType::LITE_FLOAT};
LITE_make_tensor(description, &c_tensor0);
void *ptr0, *ptr1, *ptr2, *ptr3;
LITE_get_tensor_memory(c_tensor0, &ptr0);
size_t index0[] = {3, 4};
LITE_get_tensor_memory_with_index(c_tensor0, &index0[0], 2, &ptr1);
size_t index1[] = {5, 7};
LITE_get_tensor_memory_with_index(c_tensor0, &index1[0], 2, &ptr2);
size_t index2[] = {5};
LITE_get_tensor_memory_with_index(c_tensor0, &index2[0], 1, &ptr3);

ASSERT_EQ(ptr1, static_cast<float*>(ptr0) + 3 * 20 + 4);
ASSERT_EQ(ptr2, static_cast<float*>(ptr0) + 5 * 20 + 7);
ASSERT_EQ(ptr3, static_cast<float*>(ptr0) + 5 * 20);

LITE_destroy_tensor(c_tensor0);
}

#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 26
- 0
lite/tools/aes_encrypt.sh View File

@@ -0,0 +1,26 @@
#! /bin/bash -e
set -e

if [ $# -lt 2 ] ; then
echo "USAGE: $0 src dst"
echo " e.g.: $0 ~/xxx.mdl ~/xxx.encrypted.mdl"
echo " e.g.: $0 ~/xxx.mdl ~/xxx.encrypted.mdl key"
exit 1;
fi

IV=`openssl rand -hex 16`

Key=000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F
if [ $# == 3 ] ; then
Key=$3
fi

# get file size
size=`wc -c $1`

echo "encrypt aes-256-cbc ..."
openssl enc -e -aes-256-cbc -in $1 -out $1.tmp -K $Key -iv $IV
echo $IV | xxd -r -p | cat - $1.tmp > $2
# write size into file
printf "%016x" ${size%\ *} | xxd -r -p >> $2
rm -f $1.tmp
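
The script above writes a small container: the 16-byte IV, then the AES-256-CBC ciphertext, then the original file size as 8 big-endian bytes. Below is a minimal Python sketch of the reverse operation, assuming exactly that layout and shelling out to the same openssl CLI; the function name, temporary path, and file names are placeholders, and the key in the commented call is the script's default.

# Decrypt sketch for files produced by aes_encrypt.sh above.
# Assumed layout: [16-byte IV][AES-256-CBC ciphertext][8-byte big-endian size].
import struct
import subprocess

def aes_decrypt(src, dst, key_hex):
    with open(src, "rb") as f:
        blob = f.read()
    iv = blob[:16]                                 # IV prepended by the script
    (orig_size,) = struct.unpack(">Q", blob[-8:])  # size appended by the script
    tmp = src + ".tmp"
    with open(tmp, "wb") as f:
        f.write(blob[16:-8])                       # raw ciphertext
    subprocess.check_call([
        "openssl", "enc", "-d", "-aes-256-cbc",
        "-in", tmp, "-out", dst,
        "-K", key_hex, "-iv", iv.hex(),
    ])
    with open(dst, "rb+") as f:
        f.truncate(orig_size)                      # trim to the recorded size (usually a no-op)

# aes_decrypt("xxx.encrypted.mdl", "xxx.mdl",
#             "000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F")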

+ 134
- 0
lite/tools/dump_model_mgb.py View File

@@ -0,0 +1,134 @@
#!/usr/bin/env mdl
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

from megskull.graph import NodeFilter, FpropEnv
from megskull.opr.all import AssertEqual, DataProvider, BatchNormalization
from megskull.utils.logconf import get_logger
from meghair.utils import io
import megbrain as mgb

import argparse
import struct
import re
import os

import numpy as np
import cv2

logger = get_logger(__name__)

def optimize_for_inference(args, outputs):
args_map = {
'enable_io16xc32': 'f16_io_f32_comp',
'enable_ioc16': 'f16_io_comp',
'enable_hwcd4': 'use_nhwcd4',
'enable_nchw4': 'use_nchw4',
'enable_nchw88': 'use_nchw88',
'enable_nchw44': 'use_nchw44',
'enable_nchw44_dot': 'use_nchw44_dot',
'enable_nchw32': 'use_nchw32',
'enable_chwn4': 'use_chwn4',
'enable_fuse_conv_bias_nonlinearity': 'fuse_conv_bias_nonlinearity',
'enable_fuse_conv_bias_with_z': 'fuse_conv_bias_with_z',
}
kwargs = {}
for k, v in args_map.items():
if getattr(args, k):
assert args.optimize_for_inference, (
'optimize_for_inference should be set when {} is given'.format(
k))
kwargs[v] = True

if args.optimize_for_inference:
return mgb.optimize_for_inference(outputs, **kwargs)

return outputs

def main():
parser = argparse.ArgumentParser(
description='Dump a Python MegBrain model to a C++-loadable model, '
'optionally optimizing it for inference',
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument('input', help='input pkl model file ')
parser.add_argument('-o', '--output', help='output file', required=True)
parser.add_argument('--init-bn', action='store_true',
help='initialize untrained batch-normalization, to '
'avoid NaN or Inf results')
parser.add_argument('--silent', action='store_true',
help='set verbose to False in AssertEqual opr')
parser.add_argument('--optimize-for-inference', action='store_true',
help='enable optimization for inference')
parser.add_argument('--discard-var-name', action='store_true',
help='discard variable and param names in the '
'generated output')
parser.add_argument('--output-strip-info', action='store_true',
help='output code strip information')
parser.add_argument('--enable-io16xc32', action='store_true',
help='transform the mode to float16 io float32 compute')
parser.add_argument('--enable-ioc16', action='store_true',
help='transform the dtype of the model to float16 io '
'and compute')
parser.add_argument('--enable-fuse-conv-bias-nonlinearity',
action='store_true',
help='fuse convolution bias and nonlinearity opr to a '
'conv_bias opr and compute')
parser.add_argument('--enable-hwcd4', action='store_true',
help='transform the model format from NCHW to NHWCD4 '
'for inference; you may need to disable CUDA and set '
'MGB_USE_MEGDNN_DBG=2')
parser.add_argument('--enable-nchw4', action='store_true',
help='transform the model format from NCHW to NCHW4 '
'for inference')
parser.add_argument('--enable-nchw88', action='store_true',
help='transform the model format from NCHW to NCHW88 '
'for inference')
parser.add_argument('--enable-nchw44', action='store_true',
help='transform the model format from NCHW to NCHW44 '
'for inference')
parser.add_argument('--enable-nchw44-dot', action='store_true',
help='transform the model format from NCHW to NCHW44_DOT '
'for optimizing armv8.2 dot in inference')
parser.add_argument('--enable-chwn4', action='store_true',
help='transform the model format to CHWN4 '
'for inference, mainly used for NVIDIA TensorCore')
parser.add_argument('--enable-nchw32', action='store_true',
help='transform the model format from NCHW4 to NCHW32 '
'for inference on NVIDIA TensorCore')
parser.add_argument('--enable-fuse-conv-bias-with-z', action='store_true',
help='fuse conv_bias with z input for inference on '
'nvidia GPU (this optimization pass will result in mismatch '
'of the precision of output of training and inference)')
args = parser.parse_args()

env = FpropEnv(verbose_fprop=False)


outputs = io.load_network(args.input).outputs

output_mgbvars = list(map(env.get_mgbvar, outputs))

output_mgbvars = optimize_for_inference(args, output_mgbvars)

if args.discard_var_name:
sereg_kwargs = dict(keep_var_name=0, keep_param_name=False)
else:
sereg_kwargs = dict(keep_var_name=2, keep_param_name=True)

stat = mgb.serialize_comp_graph_to_file(
args.output, output_mgbvars, append=False,
output_strip_info=args.output_strip_info,
**sereg_kwargs)
logger.info('graph dump sizes: tot_size={:.3f}KiB overhead={:.3f}KiB'.
format(stat.tot_bytes / 1024,
(stat.tot_bytes - stat.tensor_value_bytes) / 1024))

if __name__ == '__main__':
main()
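
A hypothetical end-to-end invocation of the dump script above, driven from Python; every flag mirrors an argparse definition in dump_model_mgb.py, and the file names are placeholders:

# Hypothetical invocation of dump_model_mgb.py (placeholder file names).
import subprocess

subprocess.check_call([
    "./dump_model_mgb.py", "model.pkl",
    "-o", "model.mgb",
    "--optimize-for-inference",  # must be set when any --enable-* flag is given
    "--enable-io16xc32",         # float16 I/O with float32 compute
    "--discard-var-name",        # drop variable/param names from the output
])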

+ 75
- 0
lite/tools/pack_model/encrypt_info_and_model.sh View File

@@ -0,0 +1,75 @@
#!/usr/bin/env bash
set -e

function usage() {
echo "$0 args1 args2 .."
echo "available args detail:"
echo "-i info.json : input info.json file"
echo "-m model: model name"
echo "-e encryption mode: encryption mode rc4 encrypt_predefined_rc4 "
echo "-o output name: output name"
echo "-n input model name: input model name match with info.json"
echo "-h : show usage"
exit -1
}

while getopts "i:m:e:o:n:h" arg
do
case $arg in
i)
INFO_NAME=$OPTARG
;;
m)
MODEL_NAME=$OPTARG
;;
n)
INPUT_MODEL_NAME=$OPTARG
;;
e)
ENCRYPT_MODE=$OPTARG
;;
o)
OUTPUT_NAME=$OPTARG
;;
h)
usage
;;
\?)
echo "show usage"
usage
;;
esac
done
echo "----------------------------------------------------"
echo "commad args summary:"
echo "INFO_NAME: $INFO_NAME"
echo "MODEL_NAME: $MODEL_NAME"
echo "ENCRYPT_MODE: $ENCRYPT_MODE"
echo "OUTPUT_NAME: $OUTPUT_NAME"
echo "INPUT_MODEL_NAME: $INPUT_MODEL_NAME"
echo "----------------------------------------------------"

if [[ $INFO_NAME == '' ]]; then
echo "INFO_NAME is NULL,exit now..."
exit -1
fi
if [[ $MODEL_NAME == '' ]]; then
echo "MODEL_NAME is NULL,exit now..."
exit -1
fi
if [[ $INPUT_MODEL_NAME == '' ]]; then
echo "INPUT_MODEL_NAME is NULL,exit now..."
exit -1
fi
if [[ $OUTPUT_NAME == '' ]]; then
echo "OUTPUT_NAME is NULL,exit now..."
exit -1
fi
ENCRYPT_INFO_NAME=$INFO_NAME.pr_rc4.emod
ENCRYPT_MODEL_NAME=$MODEL_NAME.pr_rc4.emod
./rc4_encryptor $ENCRYPT_MODE $INFO_NAME $ENCRYPT_INFO_NAME
./rc4_encryptor $ENCRYPT_MODE $MODEL_NAME $ENCRYPT_MODEL_NAME

python3 pack_model_and_info.py --input-model=$ENCRYPT_MODEL_NAME --model-name=$INPUT_MODEL_NAME --model-cryption="RC4_default" --info-cryption="RC4_default" --input-info=$ENCRYPT_INFO_NAME --info-parser="LITE_default" -o $OUTPUT_NAME

+ 135
- 0
lite/tools/pack_model/pack_model_and_info.py View File

@@ -0,0 +1,135 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# This file is part of MegEngine, a deep learning framework developed by
# Megvii.
#
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.

import argparse
import struct
import os
import subprocess

import flatbuffers

def generate_flatbuffer():
status, path = subprocess.getstatusoutput('which flatc')
if not status:
cwd = os.path.dirname(os.path.dirname(__file__))
fbs_file = os.path.abspath(os.path.join(cwd,
"../../src/parse_model/pack_model.fbs"))
cmd = path + ' -p -b '+fbs_file
ret, _ = subprocess.getstatusoutput(str(cmd))
if ret:
raise Exception("flatc generate error!")
else:
raise Exception('no flatc in current environment, please build flatc '
'and put in the system PATH!')

def main():
parser = argparse.ArgumentParser(
description='load an encrypted or unencrypted model together with a '
'JSON file describing the model, and pack them into a single '
'file that can be loaded by lite.')
parser.add_argument('--input-model', help='input an encrypted or unencrypted model')
parser.add_argument('--input-info', help='input an encrypted or unencrypted '
'JSON format file.')
parser.add_argument('--model-name', help='the model name, this must match '
'with the model name in model info', default = 'NONE')
parser.add_argument('--model-cryption', help='the model encryption method '
'name, this is used to find the right decryption method. e.g. '
'--model-cryption="AES_default", default is NONE.', default =
'NONE')
parser.add_argument('--info-cryption', help='the info encryption method '
'name, this is used to find the right decryption method. e.g. '
'--info-cryption="AES_default", default is NONE.', default =
'NONE')
parser.add_argument('--info-parser', help='The information parse method name '
'default is "LITE_default". ', default = 'LITE_default')
parser.add_argument('--append', '-a', help='append another model to a '
'packed model.')
parser.add_argument('--output', '-o', help='output file of packed model.')

args = parser.parse_args()

generate_flatbuffer()
assert not args.append, ('--append is not supported yet')
assert args.input_model, ('--input_model must be given')
with open(args.input_model, 'rb') as fin:
raw_model = fin.read()

model_length = len(raw_model)

if args.input_info:
with open(args.input_info, 'rb') as fin:
raw_info = fin.read()
info_length = len(raw_info)
else:
raw_info = b''
info_length = 0

# Generated by `flatc`.
from model_parse import Model, ModelData, ModelHeader, ModelInfo, PackModel

builder = flatbuffers.Builder(1024)

model_name = builder.CreateString(args.model_name)
model_cryption = builder.CreateString(args.model_cryption)
info_cryption = builder.CreateString(args.info_cryption)
info_parser = builder.CreateString(args.info_parser)

info_data = builder.CreateByteVector(raw_info)
arr_data = builder.CreateByteVector(raw_model)

#model header
ModelHeader.ModelHeaderStart(builder)
ModelHeader.ModelHeaderAddName(builder, model_name)
ModelHeader.ModelHeaderAddModelDecryptionMethod(builder, model_cryption)
ModelHeader.ModelHeaderAddInfoDecryptionMethod(builder, info_cryption)
ModelHeader.ModelHeaderAddInfoParseMethod(builder, info_parser)
model_header = ModelHeader.ModelHeaderEnd(builder)

#model info
ModelInfo.ModelInfoStart(builder)
ModelInfo.ModelInfoAddData(builder, info_data)
model_info = ModelInfo.ModelInfoEnd(builder)

#model data
ModelData.ModelDataStart(builder)
ModelData.ModelDataAddData(builder, arr_data)
model_data = ModelData.ModelDataEnd(builder)

Model.ModelStart(builder)
Model.ModelAddHeader(builder, model_header)
Model.ModelAddData(builder, model_data)
Model.ModelAddInfo(builder, model_info)
model = Model.ModelEnd(builder)

PackModel.PackModelStartModelsVector(builder, 1)
builder.PrependUOffsetTRelative(model)
models = builder.EndVector(1)

PackModel.PackModelStart(builder)
PackModel.PackModelAddModels(builder, models)
packed_model = PackModel.PackModelEnd(builder)

builder.Finish(packed_model)
buff = builder.Output()

result = struct.pack(str(len("packed_model")) + 's', "packed_model".encode('ascii'))
result += buff

assert args.output, ('--output must be given')
with open(args.output, 'wb') as fin:
fin.write(result)

print("Model packaged successfully!!!")
print("model name is: {}.".format(args.model_name))
print("model encryption method is: {}. ".format(args.model_cryption))
print("model json infomation encryption method is: {}. ".format(args.info_cryption))
print("model json infomation parse method is: {}. ".format(args.info_parser))
print("packed model is write to {} ".format(args.output))

if __name__ == '__main__':
main()
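
The packer prefixes the flatbuffer payload with the ASCII tag "packed_model" (see the struct.pack call above), presumably so the loader can recognize packed models. A quick sanity check under that assumption (the file name is a placeholder):

# Sanity check for a file written by pack_model_and_info.py: it should start
# with the ASCII tag "packed_model", followed by the flatbuffer payload.
def check_packed_model(path):
    tag = b"packed_model"
    with open(path, "rb") as f:
        head = f.read(len(tag))
    if head != tag:
        raise ValueError("%s does not look like a lite packed model" % path)
    print("%s carries the packed_model tag" % path)

# check_packed_model("output.packed")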

+ 211
- 0
lite/tools/rc4_encrypt.cpp View File

@@ -0,0 +1,211 @@
/** \file tools/rc4_encrypt.cpp
*
* This file is part of MegEngine, a deep learning framework developed by
* Megvii.
*
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
*/

#include <stdio.h>
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>
#include <memory>

#include "../src/decryption/rc4/rc4_cryption_base.h"
#include "../src/decryption/rc4_cryption.h"

using namespace lite;

std::shared_ptr<void> read_file(std::string file_path, size_t& size) {
FILE* fin = fopen(file_path.c_str(), "rb");
if (!fin) {
printf("failed to open %s.\n", file_path.c_str());
return nullptr;
}
fseek(fin, 0, SEEK_END);
size = ftell(fin);
fseek(fin, 0, SEEK_SET);
void* ptr = malloc(size);
std::shared_ptr<void> buf{ptr, ::free};
fread(buf.get(), 1, size, fin);
fclose(fin);
return buf;
}

void write_file(std::string file_path, const std::vector<uint8_t>& data) {
FILE* fin = fopen(file_path.c_str(), "wb");
if (!fin) {
printf("failed to open %s.\n", file_path.c_str());
return;
}
fwrite(data.data(), 1, data.size(), fin);
fclose(fin);
}

typedef int (*CommandHandler)(int, char**);

const char* usage =
"Usage:\n"
" rc4_encryptor encrypt_predefined_rc4 <input file> <output file>\n"
" rc4_encryptor encrypt_rc4 <hash key> <enc key> <input file> <output "
"file>\n"
" rc4_encryptor encrypt_predefined_sfrc4 <input file> <output file>\n"
" rc4_encryptor encrypt_sfrc4 <hash key> <enc key> <input file> "
"<output "
"file>\n"
" rc4_encryptor hash <input file>\n";

int command_encrypt_predefined_rc4(int argc, char** argv) {
if (argc != 4) {
printf("Invalid encrypt_predefined_rc4 arguments.\n");
return 1;
}

const char* input_file_path = argv[2];
const char* output_file_path = argv[3];

size_t size = 0;
auto keys = RC4::get_decrypt_key();
auto input = read_file(input_file_path, size);
printf("Reading input file ...\n");
auto output = RC4::encrypt_model(input.get(), size, keys);

write_file(output_file_path, output);

printf("Done.\n");
return 0;
}

int command_encrypt_rc4(int argc, char** argv) {
if (argc != 6) {
printf("Invalid encrypt_rc4 arguments.\n");
return 1;
}

uint64_t hash_key = std::stoull(argv[2], 0, 0);
uint64_t enc_key = std::stoull(argv[3], 0, 0);
const char* input_file_path = argv[4];
const char* output_file_path = argv[5];

std::vector<uint8_t> keys(128, 0);
uint64_t* data = reinterpret_cast<uint64_t*>(keys.data());
data[0] = hash_key;
data[1] = enc_key;

size_t size = 0;
auto input = read_file(input_file_path, size);
printf("Reading input file ...\n");
auto output = RC4::encrypt_model(input.get(), size, keys);

printf("Encrypting ...\n");
write_file(output_file_path, output);

printf("Done.\n");
return 0;
}

int command_encrypt_predefined_sfrc4(int argc, char** argv) {
if (argc != 4) {
printf("Invalid encrypt_predefined_rc4 arguments.\n");
return 1;
}

const char* input_file_path = argv[2];
const char* output_file_path = argv[3];

size_t size = 0;
auto keys = SimpleFastRC4::get_decrypt_key();
auto input = read_file(input_file_path, size);
printf("Reading input file ...\n");
auto output = SimpleFastRC4::encrypt_model(input.get(), size, keys);

write_file(output_file_path, output);

printf("Done.\n");
return 0;
}

int command_encrypt_sfrc4(int argc, char** argv) {
if (argc != 6) {
printf("Invalid encrypt_rc4 arguments.\n");
return 1;
}

uint64_t hash_key = std::stoull(argv[2], 0, 0);
uint64_t enc_key = std::stoull(argv[3], 0, 0);
const char* input_file_path = argv[4];
const char* output_file_path = argv[5];

std::vector<uint8_t> keys(128, 0);
uint64_t* data = reinterpret_cast<uint64_t*>(keys.data());
data[0] = hash_key;
data[1] = enc_key;

size_t size = 0;
auto input = read_file(input_file_path, size);
printf("Reading input file ...\n");
auto output = SimpleFastRC4::encrypt_model(input.get(), size, keys);

printf("Encrypting ...\n");
write_file(output_file_path, output);

printf("Done.\n");
return 0;
}

int command_hash(int argc, char** argv) {
if (argc != 3) {
printf("Invalid hash arguments.\n");
return 1;
}

const char* input_file_path = argv[2];

size_t len = 0;
auto input = read_file(input_file_path, len);

rc4::FastHash64 hasher(rc4::key_gen_hash_key());
auto start = static_cast<const char*>(input.get());

auto ptr = reinterpret_cast<const uint64_t*>(start);
while (reinterpret_cast<const char*>(ptr + 1) <= start + len) {
hasher.feed(*ptr);
++ptr;
}

auto cptr = reinterpret_cast<const char*>(ptr);
if (cptr < start + len) {
uint64_t v = 0;
std::copy(cptr, start + len, reinterpret_cast<char*>(&v));
hasher.feed(v);
}

printf("%llx\n", static_cast<unsigned long long>(hasher.get()));
return 0;
}


std::unordered_map<std::string, CommandHandler> commands = {
{"encrypt_predefined_rc4", command_encrypt_predefined_rc4},
{"encrypt_rc4", command_encrypt_rc4},
{"encrypt_predefined_sfrc4", command_encrypt_predefined_sfrc4},
{"encrypt_sfrc4", command_encrypt_sfrc4},
{"hash", command_hash},
};

int main(int argc, char** argv) {
if (argc == 1) {
printf("%s", usage);
return 1;
}

auto it = commands.find(argv[1]);
if (it == commands.end()) {
printf("Invalid command arguments.\n");
printf("%s", usage);
return 1;
}
return it->second(argc, argv);
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
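
For reference, a hypothetical driver for the tool above, using only subcommands listed in its usage string; paths are placeholders:

# Hypothetical driver for rc4_encryptor; subcommands come from its usage string.
import subprocess

subprocess.check_call(["./rc4_encryptor", "encrypt_predefined_rc4",
                       "model.mdl", "model.rc4.emod"])           # predefined-key RC4
subprocess.check_call(["./rc4_encryptor", "hash", "model.mdl"])  # print FastHash64 of a file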

+ 29
- 0
scripts/whl/macos/macos_build_whl.sh View File

@@ -209,6 +209,35 @@ function do_build() {
echo "comapt whl name: ${compat_whl_name}"
cp ${BUILD_DIR}/staging/dist/Meg*.whl ${MACOS_WHL_HOME}/${compat_whl_name}

# handle megenginelite
cd ${BUILD_DIR}
rm -rf lite_staging
mkdir -p lite_staging/megenginelite
cp ${SRC_DIR}/lite/pylite/megenginelite/* lite_staging/megenginelite/
cp ${SRC_DIR}/lite/pylite/setup.py lite_staging/
cp ${SRC_DIR}/lite/pylite/requires.txt lite_staging/
VER_FILE=${SRC_DIR}/imperative/python/megengine/version.py
if [ -f ${VER_FILE} ];then
cp ${VER_FILE} lite_staging/megenginelite
else
echo "ERROR: can not find version file"
exit -1
fi
mkdir -p ${BUILD_DIR}/lite_staging/megenginelite/libs
LITE_LIB=${BUILD_DIR}/lite_staging/megenginelite/libs/liblite_shared.dylib
cp ${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_OFF/MGE_INFERENCE_ONLY_OFF/Release/build/lite/liblite_shared.dylib ${LITE_LIB}
llvm-strip -s ${LITE_LIB}

cd ${BUILD_DIR}/lite_staging/
${PYTHON_DIR}/bin/python3 setup.py bdist_wheel
cd ${BUILD_DIR}/lite_staging/dist/
org_whl_name=`ls Meg*.whl`
index=`awk -v a="${org_whl_name}" -v b="-macosx" 'BEGIN{print index(a,b)}'`
compat_whl_name=`echo ${org_whl_name} |cut -b -$index`macosx_10_14_x86_64.whl
echo "megenginelite org whl name: ${org_whl_name}"
echo "megenginelite comapt whl name: ${compat_whl_name}"
cp ${BUILD_DIR}/lite_staging/dist/Meg*.whl ${MACOS_WHL_HOME}/${compat_whl_name}

cd ${SRC_DIR}
echo ""
echo "##############################################################################################"


+ 27
- 0
scripts/whl/manylinux2014/do_build_common.sh View File

@@ -155,6 +155,33 @@ do
echo "comapt whl name: ${compat_whl_name}"
mv ${org_whl_name} ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${SDK_NAME}/${compat_whl_name}

# handle megenginelite
cd ${BUILD_DIR}
rm -rf lite_staging
mkdir -p lite_staging/megenginelite
cp ${SRC_DIR}/lite/pylite/megenginelite/* lite_staging/megenginelite/
cp ${SRC_DIR}/lite/pylite/setup.py lite_staging/
cp ${SRC_DIR}/lite/pylite/requires.txt lite_staging/
VER_FILE=${SRC_DIR}/imperative/python/megengine/version.py
if [ -f ${VER_FILE} ];then
cp ${VER_FILE} lite_staging/megenginelite
else
echo "ERROR: can not find version file"
exit -1
fi
patch_elf_depend_lib_megenginelite

cd ${BUILD_DIR}/lite_staging/
${PYTHON_DIR}/bin/python setup.py bdist_wheel
cd /home/output
mkdir -p ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${SDK_NAME}
cd ${BUILD_DIR}/lite_staging/dist/
org_whl_name=`ls Meg*${ver}*.whl`
compat_whl_name=`echo ${org_whl_name} | sed 's/linux/manylinux2014/'`
echo "megenginelite org whl name: ${org_whl_name}"
echo "megenginelite comapt whl name: ${compat_whl_name}"
mv ${org_whl_name} ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${SDK_NAME}/${compat_whl_name}

cd /home/output
chown -R ${UID}.${UID} .
# compat for root-less docker env to remove output at host side


Some files were not shown because too many files changed in this diff
