@@ -0,0 +1,10 @@ | |||
test/resource/input_data.npy filter=lfs diff=lfs merge=lfs -text | |||
test/resource/lite/shufflenet.mge filter=lfs diff=lfs merge=lfs -text | |||
test/resource/lite/shufflenet_crypt_aes.mge filter=lfs diff=lfs merge=lfs -text | |||
test/resource/lite/test_packed_model.lite filter=lfs diff=lfs merge=lfs -text | |||
test/resource/lite/test_packed_model_rc4.lite filter=lfs diff=lfs merge=lfs -text | |||
test/resource/lite/output_data.npy filter=lfs diff=lfs merge=lfs -text | |||
test/resource/lite/model.mgb filter=lfs diff=lfs merge=lfs -text | |||
test/resource/lite/liveness_rgb_nosub128.rknn filter=lfs diff=lfs merge=lfs -text | |||
third_party/librknn_api filter=lfs diff=lfs merge=lfs -text | |||
test/resource/lite/model_atlas.mgb filter=lfs diff=lfs merge=lfs -text |
@@ -0,0 +1,135 @@ | |||
option(LITE_BUILD_WITH_MGE "Build lite with MegEngine." ON) | |||
# config lite_build_config.h.in | |||
set(LITE_WITH_OPENCL ${MGE_WITH_OPENCL}) | |||
set(LITE_WITH_CUDA ${MGE_WITH_CUDA}) | |||
set(LITE_ENABLE_LOGGING ${MGE_ENABLE_LOGGING}) | |||
set(LITE_ENABLE_EXCEPTION ${MGE_ENABLE_EXCEPTIONS}) | |||
set(LITE_ASSERT_LOC ${MGB_ASSERT_LOC}) | |||
if(NOT MGB_WITH_FLATBUFFERS) | |||
include(../cmake/flatbuffers.cmake) | |||
endif() | |||
file(GLOB_RECURSE SRC_FBS src/**/*.fbs) | |||
build_flatbuffers( | |||
"${SRC_FBS}" | |||
"" | |||
lite_fbs_generate | |||
"" | |||
"${CMAKE_CURRENT_BINARY_DIR}" | |||
"" | |||
"" | |||
) | |||
file(GLOB_RECURSE SOURCES_LITE src/*.cpp src/*.cc lite-c/*.cpp) | |||
if(MGE_WITH_MINIMUM_SIZE) | |||
set(LITE_ENABLE_LOGGING OFF) | |||
set(LITE_ENABLE_EXCEPTION OFF) | |||
endif() | |||
# Write out lite_build_config.h | |||
# It defines macros needed by lite | |||
configure_file(src/lite_build_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h) | |||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/genfiles/lite_build_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
# begin config lite | |||
if(LITE_BUILD_WITH_MGE AND LITE_WITH_CUDA AND NOT WIN32) | |||
# FIXME: third_party cpp_redis does not support building with clang-cl
file(GLOB_RECURSE SOURCES_CPP_REDIS ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/sources/*.cpp) | |||
list(APPEND SOURCES_LITE ${SOURCES_CPP_REDIS}) | |||
file(GLOB_RECURSE SOURCES_TACOPIE ${PROJECT_SOURCE_DIR}/third_party/tacopie/sources/*.cpp) | |||
list(APPEND SOURCES_LITE ${SOURCES_TACOPIE}) | |||
endif() | |||
add_library(lite_static STATIC ${SOURCES_LITE}) | |||
add_dependencies(lite_static lite_fbs_generate) | |||
include_directories($<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/genfiles>) | |||
if(LITE_BUILD_WITH_MGE) | |||
target_link_libraries(lite_static PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) | |||
add_compile_definitions(LITE_BUILD_WITH_MGE=1) | |||
message(STATUS "build lite with MegEngine.") | |||
else() | |||
target_link_libraries(lite_static PUBLIC flatbuffers) | |||
endif() | |||
include_directories( | |||
PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_PREFIX}/lite/include> | |||
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/include> | |||
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/include/lite> | |||
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/lite-c/include> | |||
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lite/src> | |||
PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/third_party/Json/include> | |||
) | |||
# end config lite | |||
# define a shared lib | |||
add_library(lite_shared SHARED $<TARGET_OBJECTS:lite_static>) | |||
if(LITE_BUILD_WITH_MGE) | |||
target_link_libraries(lite_shared PRIVATE megbrain megdnn ${MGE_CUDA_LIBS}) | |||
endif() | |||
if(ANDROID) | |||
link_libraries(log) | |||
target_link_libraries(lite_static PRIVATE log) | |||
target_link_libraries(lite_shared PRIVATE log) | |||
endif() | |||
if(LITE_BUILD_WITH_MGE AND LITE_WITH_CUDA AND NOT WIN32) | |||
# FIXME: third_party cpp_redis does not support building with clang-cl
target_include_directories(lite_static PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/includes) | |||
target_include_directories(lite_static PRIVATE ${PROJECT_SOURCE_DIR}/third_party/tacopie/includes) | |||
target_include_directories(lite_shared PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cpp_redis/includes) | |||
target_include_directories(lite_shared PRIVATE ${PROJECT_SOURCE_DIR}/third_party/tacopie/includes) | |||
endif() | |||
set(LITE_VERSION_SCRIPT ${PROJECT_SOURCE_DIR}/lite/src/version_lite.ld CACHE INTERNAL "Path to linker version script") | |||
add_custom_target(_lite_version_ld SOURCES ${LITE_VERSION_SCRIPT}) | |||
if(NOT MSVC AND NOT WIN32) | |||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden") | |||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden") | |||
endif() | |||
# TODO: implement a version script for other OSes
if (UNIX AND NOT APPLE) | |||
target_link_options(lite_shared PRIVATE -Wl,--version-script=${LITE_VERSION_SCRIPT}) | |||
set_target_properties(lite_shared PROPERTIES LINK_DEPENDS ${LITE_VERSION_SCRIPT}) | |||
endif() | |||
# config install | |||
install(TARGETS lite_static | |||
LIBRARY DESTINATION lite/lib/${MGE_ARCH} | |||
FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} | |||
ARCHIVE DESTINATION lite/lib/${MGE_ARCH}) | |||
install(TARGETS lite_shared | |||
LIBRARY DESTINATION lite/lib/${MGE_ARCH} | |||
FRAMEWORK DESTINATION lite/lib/${MGE_ARCH} | |||
ARCHIVE DESTINATION lite/lib/${MGE_ARCH} | |||
) | |||
install(FILES ${PROJECT_SOURCE_DIR}/lite/include/lite/common_enum_c.h | |||
DESTINATION ${CMAKE_INSTALL_PREFIX}/lite/include/lite-c) | |||
install(DIRECTORY ${PROJECT_SOURCE_DIR}/lite/include | |||
DESTINATION ${CMAKE_INSTALL_PREFIX}/lite FILES_MATCHING PATTERN "*.h") | |||
install(DIRECTORY ${PROJECT_SOURCE_DIR}/lite/lite-c/include | |||
DESTINATION ${CMAKE_INSTALL_PREFIX}/lite FILES_MATCHING PATTERN "*.h") | |||
add_subdirectory(example) | |||
if(MGE_WITH_TEST) | |||
add_subdirectory(test) | |||
endif() | |||
# tools and example | |||
add_executable(rc4_encryptor tools/rc4_encrypt.cpp) | |||
target_link_libraries(rc4_encryptor lite_static) | |||
if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM) | |||
# FIXME: HIP objects cannot resolve cpp objects through lite_static alone
target_link_libraries(rc4_encryptor megdnn) | |||
endif() | |||
target_include_directories(rc4_encryptor PRIVATE | |||
    ${PROJECT_SOURCE_DIR}/lite/src/decryption)
install (TARGETS rc4_encryptor | |||
EXPORT ${LITE_EXPORT_TARGETS} | |||
RUNTIME DESTINATION lite/tools) |
@@ -0,0 +1,251 @@ | |||
# Lite | |||
Lite is a lightweight wrapper of MegEngine that makes it easy to integrate
MegEngine into a user's SDK.
## bazel build | |||
Lite currently supports both the internal bazel build and the CMake build, and provides C++/C and Python interfaces.
The bazel build of the lite_shared target shown below can be used as a reference for building other targets;
it depends on the internal bazel setup and megvii3.
### Set up the build environment
The bazel build must be run inside a megvii3 workspace.
#### Clone megvii3 and install bazel
```bash | |||
git clone git@git-core.megvii-inc.com:brain-sdk/megvii3.git | |||
./utils/bazel/get_bazel.sh | |||
``` | |||
#### Clone megbrain | |||
``` | |||
git submodule update brain/megbrain brain/midout | |||
``` | |||
### Build the x86 CUDA version
```bash | |||
./bazel build //brain/megbrain/lite:lite_shared --cpu="k8" \ | |||
--compiler="gcc7_cuda10" -c opt | |||
``` | |||
### Build the x86 CPU version
```bash | |||
./bazel build //brain/megbrain/lite:lite_shared --cpu="k8" \ | |||
--compiler="gcc9" -c opt | |||
``` | |||
### Build the arm OpenCL version
```bash | |||
./bazel build //brain/megbrain/lite:lite_shared_shared --cpu=android_aarch64 \ | |||
-c opt --define enable_opencl=1 --define enable_opencl_search=1 | |||
``` | |||
### Build the arm OpenCL lite_examples
```bash
bazel-3.0.0-megvii2 build //brain/megbrain/lite:lite_shared_examples \
    --cpu=android_aarch64 --define enable_opencl=1 --define enable_opencl_search=1
```
#### How to run the snpe_loader lite_examples
See the wiki page below:
https://wiki.megvii-inc.com/pages/viewpage.action?pageId=268786906
### Build the armv7 CPU version
```bash | |||
./bazel build //brain/megbrain/lite:lite_shared --cpu=android_armv7 \ | |||
-c opt | |||
``` | |||
### Build the arm64 CPU version
```bash | |||
./bazel build //brain/megbrain/lite:lite_shared --cpu=android_aarch64 \ | |||
-c opt | |||
``` | |||
### Build the arm64 CPU v8.2 version
```bash | |||
./bazel build //brain/megbrain/lite:lite_shared --cpu=android_aarch64 \ | |||
--copt -march=armv8.2-a+fp16+dotprod -c opt | |||
``` | |||
## CMake build
CMake builds are also supported; see scripts/cmake-build/BUILD_README.md. The example below builds a release version
with both the MegEngine and RKNPU backends enabled and OpenCL turned on:
```bash | |||
EXTRA_CMAKE_ARGS="-DANDROID_NATIVE_API_LEVEL=24 -DLITE_BUILD_WITH_RKNPU=ON -DMGE_WITH_OPENCL=ON \ | |||
-DMGE_OPENCL_SEARCH_ALGO=ON -DCUSTOM_C_OPR_INIT_FUNC=custom_loader_func" ./scripts/cmake-build/cross_build_android_arm_inference.sh
``` | |||
* To enable the profiling feature for performance analysis, add --copt -DMGB_ENABLE_JSON=1 at build time.
* To enable the fast-run feature, add --copt -DMGB_ENABLE_FASTRUN=1.
* When building for arm64, --copt -mcpu=cortex-a53 can be added for further optimization; a combined command is sketched below.
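For illustration only, the optional flags above can be combined with the arm64 build command shown earlier (the target and flag set here are just an example):
```bash
./bazel build //brain/megbrain/lite:lite_shared --cpu=android_aarch64 \
    -c opt \
    --copt -DMGB_ENABLE_JSON=1 \
    --copt -DMGB_ENABLE_FASTRUN=1 \
    --copt -mcpu=cortex-a53
```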
### Trimming the build with midout
For how midout-based trimming works, see the midout documentation in megbrain; the trimming procedure is the same as
for MegBrain and MegEngine.
## Models
### Supported models
Lite currently only supports the model format dumped by MegEngine. Loadable model files include the original model
file, the original encrypted model, and packed models (encrypted or not). The encryption algorithm and the encryption
key can be user-defined and registered into lite; see the encryption/decryption part of the examples for details.
* Original unencrypted model: the model dumped directly from a trained model in the MegEngine environment.
* Original encrypted model: the dumped model above, encrypted with an encryption algorithm. Lite ships two default
  algorithms in tools, aes and rc4, implemented by aes_encrypt.sh and rc4_encrypt.cpp respectively (rc4_encrypt.cpp
  must be compiled into an executable). A model encrypted this way needs its encryption method configured in Config
  at load time; a minimal sketch follows this list.
* Packed model: the model structure is described below. An encrypted or unencrypted model can be packed together with
  the JSON config file defined below into a single packed model using the pack_model_and_info.py tool under tools;
  see its help output for usage details.
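A minimal sketch of loading an encrypted bare model through the C++ interface, assuming the built-in algorithm name "RC4_default" described in the example documentation (the model path is a placeholder):
```cpp
#include <memory>
#include "lite/network.h"

int main() {
    // name the decryption method registered in lite; the key was registered
    // together with the algorithm (here the built-in RC4 default key)
    lite::Config config;
    config.bare_model_cryption_name = "RC4_default";

    auto network = std::make_shared<lite::Network>(config);
    network->load_model("xxx_encrypted.mge");  // placeholder path

    network->forward();
    network->wait();
    return 0;
}
```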
### Model structure
The different model files are mainly distinguished by the model_tag stored in the packed model file.
* Packed file: packing is done with the pack_model_and_info.py script, which packs the model info file (any format is
  allowed, JSON is recommended, encrypted or not) together with the encrypted or unencrypted model file and prepends a
  Header to help with parsing.
* Original files and original encrypted files have no Header and no model info part; the information needed to load
  the model can be passed through Config and NetworkIO.
### Header
The Header starts with a fixed plaintext model_tag, currently the string "packed_model". It then records information
about each part of the model file: the encryption method of each part, so that the corresponding decryption method can
be called when loading the model, and the parsing method of the model information part. For details see
lite/src/parse_model/pack_model.fbs.
### Info part
The Info part describes the model, e.g. things the user cares about such as the format of the input data and the
platform the model runs on; it can also be used to check whether the model is being run under the intended conditions.
Since different users need different things here and the information cannot be standardized, lite lets users define
their own Info content: specify the **name of the Info parsing method** in the Header and register a parsing function
under that name into lite, which makes a user-defined Info format possible. Lite also provides a predefined format
named "LITE_default", with the corresponding parsing function already implemented; this info is in JSON format and is
defined as follows:
```json | |||
{ | |||
"name": "shufflenet_test", | |||
"valid": true, | |||
"version": "8.9999.0", | |||
"has_compression": false, | |||
"device": { | |||
"type": "CPU", | |||
"device_id": 0, | |||
"number_threads": 1, | |||
"use_tensor_rt": false, | |||
"enable_inplace_model": false | |||
}, | |||
"options":{ | |||
"weight_preprocess": false, | |||
"var_sanity_check_first_run": true, | |||
"const_shape": false, | |||
"jit_level": 0, | |||
"record_level": 0 | |||
}, | |||
"IO":{ | |||
"inputs":[ | |||
{ | |||
"name": "data", | |||
"io_type": "value", | |||
"is_host": true, | |||
"dtype": "float32", | |||
"shape": { | |||
"dim0": 1, | |||
"dim1": 3, | |||
"dim2": 224, | |||
"dim3": 224 | |||
} | |||
} | |||
], | |||
"outputs":[ | |||
{ | |||
"name": "TRUE_DIV(EXP[12065],reduce0[12067])[12077]", | |||
"io_type": "value", | |||
"is_host": true, | |||
"dtype": "float32", | |||
"shape": { | |||
"dim0": 1, | |||
"dim1": 1000, | |||
"dim2": 0, | |||
"dim3": 0 | |||
} | |||
} | |||
] | |||
} | |||
} | |||
``` | |||
* name: the name of the model; the user can compare it with the name in the Header to check that the correct model is
  being run.
* valid: whether the settings in this info file affect the model's Config.
* version: the megbrain version the model corresponds to; it is checked when the model is loaded.
* has_compression: whether the tensor data in this model file is compressed.
* device: currently supported values are "CPU", "CUDA", "OPENCL" and "ATLAS".
* number_threads and enable_inplace_model: only take effect when device is CPU.
* IO::inputs::io_type: value or shape; see include "network.h" for details.
* IO::inputs::is_host: whether the input data comes from the device side or the host side.
* IO::outputs::is_host: whether the output data is stored on the device side or the host side.
* IO::outputs::shape::dimx: a value of 0 means that this dim is not used.
### Model part
This is the model file itself, encrypted or unencrypted.
## Usage
See the documents and the corresponding examples in the example directory for detailed usage.
## Tools
Lite currently ships three tools in the tools directory (other megbrain tools are not included):
* pack_model_and_info.py is the model packing tool mentioned above. It is a Python script that packs an existing model
  and a model information file into the format described above; the user can specify the model name, the model
  encryption method, the info file encryption method, the info parsing method, and so on, for example:
```bash | |||
python3 pack_model_and_info.py --input-model xxx.mge \ | |||
--model-name="shufflenet_test" \ | |||
--model-cryption="RC4_default" \ | |||
--input-info xxx.json \ | |||
--info-cryption="RC4_default" \ | |||
--info-parser="LITE_default" \ | |||
-o xxx.lite | |||
``` | |||
* aes_encrypt.sh is an encryption script for the aes method. It encrypts a file with a given key into an aes-encrypted
  file, where the key is a 32-byte hexadecimal number:
```bash | |||
aes_encrypt.sh xxx.mdl xxx_encrypted.mdl \ | |||
000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F | |||
``` | |||
* rc4_encrypt.cpp can be compiled into an rc4 encryption tool. It encrypts a given file with a specified key or the
  default key, and supports both the rc4 and simple_fast_rc4 methods as well as custom keys.
    * The bazel command for building it for x86 is:
```bash | |||
bazel build //brain/megbrain/lite:rc4_encryptor \ | |||
--cpu='k8' --compiler='gcc9' | |||
``` | |||
    * Encrypt a file (see the help output for detailed usage):
```bash | |||
rc4_encryptor encrypt_predefined_rc4 \ | |||
to_be_encrypt.file encrypted.file | |||
``` |
@@ -0,0 +1,32 @@ | |||
/** | |||
* \file lite/build_config/lite_build_config.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#ifndef _HEADER_LITE_BUILD_CONFIG | |||
#define _HEADER_LITE_BUILD_CONFIG | |||
#ifndef LITE_ENABLE_LOGGING | |||
#define LITE_ENABLE_LOGGING 1 | |||
#endif | |||
#ifndef LITE_ENABLE_EXCEPTION | |||
#if __cpp_exceptions || __EXCEPTIONS || \ | |||
(defined(_MSC_VER) && defined(_CPPUNWIND)) | |||
#define LITE_ENABLE_EXCEPTION 1 | |||
#else | |||
#define LITE_ENABLE_EXCEPTION 0 | |||
#endif | |||
#endif | |||
#ifndef LITE_WITH_CUDA | |||
#define LITE_WITH_CUDA 0 | |||
#endif | |||
#ifndef LITE_ASSERT_LOC | |||
#define LITE_ASSERT_LOC 1 | |||
#endif | |||
#endif // _HEADER_LITE_BUILD_CONFIG |
@@ -0,0 +1,47 @@ | |||
file (GLOB_RECURSE SOURCES ./*.cpp) | |||
add_executable(lite_examples ${SOURCES}) | |||
if(LITE_BUILD_WITH_RKNPU) | |||
# rknn sdk 1.0.0 depends on libc++_shared; use the gold linker to skip the NEEDED shared-object symbol check
target_link_options(lite_examples PRIVATE "-fuse-ld=gold") | |||
endif() | |||
target_link_libraries(lite_examples lite_static) | |||
if(LITE_BUILD_WITH_MGE AND MGE_WITH_ROCM) | |||
# FIXME: HIP objects cannot resolve cpp objects through lite_static alone
target_link_libraries(lite_examples megdnn) | |||
endif() | |||
if(UNIX) | |||
if(APPLE OR ANDROID) | |||
target_link_libraries(lite_examples dl) | |||
else() | |||
target_link_libraries(lite_examples dl rt) | |||
endif() | |||
endif() | |||
install (TARGETS lite_examples | |||
EXPORT ${LITE_EXPORT_TARGETS} | |||
RUNTIME DESTINATION lite/bin) | |||
# add lite_examples_depends_shared for CI check symbol export valid | |||
add_executable(lite_examples_depends_shared ${SOURCES}) | |||
if(LITE_BUILD_WITH_RKNPU) | |||
# rknn sdk 1.0.0 depends on libc++_shared; use the gold linker to skip the NEEDED shared-object symbol check
target_link_options(lite_examples_depends_shared PRIVATE "-fuse-ld=gold") | |||
endif() | |||
target_link_libraries(lite_examples_depends_shared lite_shared) | |||
if(UNIX) | |||
if(APPLE OR ANDROID) | |||
target_link_libraries(lite_examples_depends_shared dl) | |||
else() | |||
target_link_libraries(lite_examples_depends_shared dl rt) | |||
endif() | |||
endif() | |||
install (TARGETS lite_examples_depends_shared | |||
EXPORT ${LITE_EXPORT_TARGETS} | |||
RUNTIME DESTINATION lite/bin) |
@@ -0,0 +1,101 @@ | |||
/** | |||
 * \file example/example.h
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include <lite_build_config.h> | |||
#include "lite/global.h" | |||
#include "lite/network.h" | |||
#include "lite/tensor.h" | |||
#include "npy.h" | |||
#include <string.h> | |||
#include <memory> | |||
#include <unordered_map> | |||
#include <vector> | |||
namespace lite { | |||
namespace example { | |||
void set_cpu_affinity(const std::vector<int>& cpuset); | |||
struct Args { | |||
int args_parse_ret = 0; | |||
std::string example_name; | |||
std::string model_path; | |||
std::string input_path; | |||
std::string output_path; | |||
std::string loader_path; | |||
static Args from_argv(int argc, char** argv); | |||
}; | |||
std::shared_ptr<Tensor> parse_npy( | |||
const std::string& path, | |||
LiteBackend backend = LiteBackend::LITE_DEFAULT); | |||
using ExampleFunc = std::function<bool(const Args&)>; | |||
using ExampleFuncMap = std::unordered_map<std::string, ExampleFunc>; | |||
ExampleFuncMap* get_example_function_map(); | |||
bool register_example(std::string example_name, const ExampleFunc& function);
template <int> | |||
struct Register; | |||
#if LITE_BUILD_WITH_MGE | |||
#if LITE_WITH_CUDA | |||
bool load_from_path_run_cuda(const Args& args); | |||
#endif | |||
bool basic_load_from_path(const Args& args); | |||
bool basic_load_from_path_with_loader(const Args& args); | |||
bool basic_load_from_memory(const Args& args); | |||
bool cpu_affinity(const Args& args); | |||
bool network_share_same_weights(const Args& args); | |||
bool reset_input(const Args& args); | |||
bool reset_input_output(const Args& args); | |||
bool config_user_allocator(const Args& args); | |||
bool register_cryption_method(const Args& args); | |||
bool update_cryption_key(const Args& args); | |||
bool async_forward(const Args& args); | |||
#if LITE_WITH_CUDA | |||
bool device_input(const Args& args); | |||
bool device_input_output(const Args& args); | |||
bool pinned_host_input(const Args& args); | |||
#endif | |||
#endif | |||
} // namespace example | |||
} // namespace lite | |||
#if LITE_BUILD_WITH_MGE | |||
bool basic_c_interface(const lite::example::Args& args); | |||
bool device_io_c_interface(const lite::example::Args& args); | |||
bool async_c_interface(const lite::example::Args& args); | |||
#endif | |||
#define CONCAT_IMPL(a, b) a##b | |||
#define MACRO_CONCAT(a, b) CONCAT_IMPL(a, b) | |||
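//! REGIST_EXAMPLE expands to a unique Register<N> specialization (N comes from
//! __COUNTER__) whose constructor calls register_example() during static
//! initialization, so each example adds itself to the global example map.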
#define REGIST_EXAMPLE(name_, func_) \ | |||
REGIST_EXAMPLE_WITH_NUM(__COUNTER__, name_, func_) | |||
#define REGIST_EXAMPLE_WITH_NUM(number_, name_, func_) \ | |||
template <> \ | |||
struct Register<number_> { \ | |||
Register() { register_example(name_, func_); } \ | |||
}; \ | |||
namespace { \ | |||
Register<number_> MACRO_CONCAT(example_function_, number_); \ | |||
} | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,172 @@ | |||
/** | |||
* \file example/example.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "lite/global.h" | |||
#include "lite/network.h" | |||
#include "lite/tensor.h" | |||
#include "example.h" | |||
#include "npy.h" | |||
#include <string.h> | |||
#include <map> | |||
#include <memory> | |||
#include <vector> | |||
using namespace lite; | |||
using namespace example; | |||
Args Args::from_argv(int argc, char** argv) { | |||
Args ret; | |||
if (argc < 4) { | |||
printf("usage: lite_examples <example_name> <model file> <input " | |||
"file> <output file>.\n"); | |||
printf("*********The output file is optional.*************\n"); | |||
printf("The registered examples include:\n"); | |||
size_t index = 0; | |||
for (auto it : *get_example_function_map()) { | |||
printf("%zu : %s\n", index, it.first.c_str()); | |||
index++; | |||
} | |||
ret.args_parse_ret = -1; | |||
return ret; | |||
} | |||
ret.example_name = argv[1]; | |||
ret.model_path = argv[2]; | |||
ret.input_path = argv[3]; | |||
if (argc > 4) { | |||
ret.output_path = argv[4]; | |||
} | |||
if (argc > 5) { | |||
ret.loader_path = argv[5]; | |||
} | |||
return ret; | |||
} | |||
ExampleFuncMap* lite::example::get_example_function_map() { | |||
static ExampleFuncMap static_map; | |||
return &static_map; | |||
} | |||
bool lite::example::register_example(std::string example_name,
                                     const ExampleFunc& function) {
    auto map = get_example_function_map();
    if (map->find(example_name) != map->end()) {
        printf("Error: this example has already been registered.\n");
        return false;
    }
    (*map)[example_name] = function;
return true; | |||
} | |||
std::shared_ptr<Tensor> lite::example::parse_npy(const std::string& path, | |||
LiteBackend backend) { | |||
std::string type_str; | |||
std::vector<npy::ndarray_len_t> stl_shape; | |||
std::vector<int8_t> raw; | |||
npy::LoadArrayFromNumpy(path, type_str, stl_shape, raw); | |||
auto lite_tensor = | |||
std::make_shared<Tensor>(backend, LiteDeviceType::LITE_CPU); | |||
Layout layout; | |||
layout.ndim = stl_shape.size(); | |||
const std::map<std::string, LiteDataType> type_map = { | |||
{"f4", LiteDataType::LITE_FLOAT}, | |||
{"i4", LiteDataType::LITE_INT}, | |||
{"i1", LiteDataType::LITE_INT8}, | |||
{"u1", LiteDataType::LITE_UINT8}}; | |||
layout.shapes[0] = 1; | |||
for (size_t i = 0; i < layout.ndim; i++) { | |||
layout.shapes[i] = static_cast<size_t>(stl_shape[i]); | |||
} | |||
for (auto& item : type_map) { | |||
if (type_str.find(item.first) != std::string::npos) { | |||
layout.data_type = item.second; | |||
break; | |||
} | |||
} | |||
lite_tensor->set_layout(layout); | |||
size_t length = lite_tensor->get_tensor_total_size_in_byte(); | |||
void* dest = lite_tensor->get_memory_ptr(); | |||
memcpy(dest, raw.data(), length); | |||
//! rknn not support reshape now | |||
if (layout.ndim == 3) { | |||
lite_tensor->reshape({1, static_cast<int>(layout.shapes[0]), | |||
static_cast<int>(layout.shapes[1]), | |||
static_cast<int>(layout.shapes[2])}); | |||
} | |||
return lite_tensor; | |||
} | |||
void lite::example::set_cpu_affinity(const std::vector<int>& cpuset) { | |||
#if defined(__APPLE__) || defined(WIN32) | |||
#pragma message("set_cpu_affinity not enabled on apple and windows platform") | |||
#else | |||
cpu_set_t mask; | |||
CPU_ZERO(&mask); | |||
for (auto i : cpuset) { | |||
CPU_SET(i, &mask); | |||
} | |||
auto err = sched_setaffinity(0, sizeof(mask), &mask); | |||
if (err) { | |||
printf("failed to sched_setaffinity: %s (error ignored)", | |||
strerror(errno)); | |||
} | |||
#endif | |||
} | |||
int main(int argc, char** argv) { | |||
set_log_level(LiteLogLevel::WARN); | |||
auto&& args = Args::from_argv(argc, argv); | |||
if (args.args_parse_ret) | |||
return -1; | |||
auto map = get_example_function_map(); | |||
auto example = (*map)[args.example_name]; | |||
if (example) { | |||
printf("Begin to run %s example.\n", args.example_name.c_str()); | |||
return example(args); | |||
} else { | |||
        printf("The example %s is not registered.\n", args.example_name.c_str());
return -1; | |||
} | |||
} | |||
namespace lite { | |||
namespace example { | |||
#if LITE_BUILD_WITH_MGE | |||
#if LITE_WITH_CUDA | |||
REGIST_EXAMPLE("load_from_path_run_cuda", load_from_path_run_cuda); | |||
#endif | |||
REGIST_EXAMPLE("basic_load_from_path", basic_load_from_path); | |||
REGIST_EXAMPLE("basic_load_from_path_with_loader", basic_load_from_path_with_loader); | |||
REGIST_EXAMPLE("basic_load_from_memory", basic_load_from_memory); | |||
REGIST_EXAMPLE("cpu_affinity", cpu_affinity); | |||
REGIST_EXAMPLE("register_cryption_method", register_cryption_method); | |||
REGIST_EXAMPLE("update_cryption_key", update_cryption_key); | |||
REGIST_EXAMPLE("network_share_same_weights", network_share_same_weights); | |||
REGIST_EXAMPLE("reset_input", reset_input); | |||
REGIST_EXAMPLE("reset_input_output", reset_input_output); | |||
REGIST_EXAMPLE("config_user_allocator", config_user_allocator); | |||
REGIST_EXAMPLE("async_forward", async_forward); | |||
REGIST_EXAMPLE("basic_c_interface", basic_c_interface); | |||
REGIST_EXAMPLE("device_io_c_interface", device_io_c_interface); | |||
REGIST_EXAMPLE("async_c_interface", async_c_interface); | |||
#if LITE_WITH_CUDA | |||
REGIST_EXAMPLE("device_input", device_input); | |||
REGIST_EXAMPLE("device_input_output", device_input_output); | |||
REGIST_EXAMPLE("pinned_host_input", pinned_host_input); | |||
#endif | |||
#endif | |||
} // namespace example | |||
} // namespace lite | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,166 @@ | |||
# Example | |||
This directory implements a series of examples that call the lite interfaces to run inference; they mainly show how
different lite interfaces are used in different situations. All examples use shufflenet for the demonstration.
## Building and running the examples with bazel
* Refer to the README.md in the top-level directory to set up the megvii3 bazel environment, then build the CPU version:
```bash | |||
./bazel build //brain/megbrain/lite:lite_examples --cpu="k8" \ | |||
--compiler="gcc9" -c opt | |||
``` | |||
* At runtime, specify the name of the example to run, the model to run, and the input data for the model.
* List all example names:
``` | |||
bazel-bin/brain/megbrain/lite/lite_examples | |||
``` | |||
* Run an example; the command below runs basic_load_from_memory:
``` | |||
bazel-bin/brain/megbrain/lite/lite_examples \ | |||
basic_load_from_memory \ | |||
path-to-megbrain/lite/test/resource/lite/shufflenet.mge \ | |||
path-to-megbrain/lite/test/resource/lite/input_data.npy | |||
``` | |||
## Basic usage
* **Implemented in basic.cpp, including basic_load_from_path and basic_load_from_memory.**
* These examples use lite to run a basic inference. The model is loaded with the default configuration, the input data
  is copied into the input tensor before forward, and after forward the result is copied from the output tensor into
  user memory. Both input and output tensors are obtained from the Network by name, and their layouts can be read
  directly from the tensors. **The layout of an output tensor is only correct after forward has finished.**
## User-provided input and output memory
* **Implemented in reset_io.cpp, including the reset_input and reset_input_output examples.**
* These examples show how the input tensor can use user-provided memory that already holds the input data, and how the
  output tensor can also use user-provided memory, so that after forward the Network writes the result directly into
  that memory. This removes unnecessary memory copies.
* This is done through the tensor reset interface, which re-points the tensor to new memory and, optionally, a new
  layout; if no layout is given, the tensor keeps its original layout. A minimal sketch follows this list.
* **Because the memory is allocated by the user, the user must know the layouts of the input and output tensors in
  advance and allocate memory accordingly. Memory handed to a tensor via reset is not managed by the tensor; its
  lifetime is managed by the user.**
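A minimal sketch of the reset pattern, assuming the shufflenet input shape used throughout this document (1x3x224x224 float); it mirrors the usage in the sources in this directory, and the model path is a placeholder:
```cpp
#include <memory>
#include <vector>
#include "lite/network.h"
#include "lite/tensor.h"

int main() {
    auto network = std::make_shared<lite::Network>();
    network->load_model("shufflenet.mge");  // placeholder path

    std::shared_ptr<lite::Tensor> input_tensor = network->get_input_tensor(0);
    lite::Layout input_layout = input_tensor->get_layout();

    // user-owned buffer, assumed to already hold the input data; it must stay
    // alive until forward()/wait() have finished
    std::vector<float> user_input(1 * 3 * 224 * 224);
    input_tensor->reset(user_input.data(), input_layout);

    network->forward();
    network->wait();
    return 0;
}
```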
## Device-memory input and output
* **Implemented in device_io.cpp, including the device_input and device_input_output examples.**
* These examples configure the model to run on a device (CUDA) and use device memory allocated in advance by the user
  as the model's input and output. The inputs and outputs must be declared as residing on the device when the Network
  is constructed (the default is CPU); everything else is the same as in the user-provided memory examples above.
* The tensor is_host() interface tells whether a tensor lives on the device side or the host side.
## Pinned host memory as input
* **Implemented in device_io.cpp; the function is pinned_host_input.**
* In this example the model runs on a device (CUDA) while the input and output stay on the CPU. To speed up the
  host2device copy, the memory of the CPU input tensor is allocated in advance as cuda pinned memory. Currently, when
  the output tensor is not on the device, it is pinned host memory by default.
* Pinned host memory is allocated by constructing the tensor with the device, the layout, and the is_pinned_host flag:
```C | |||
bool is_pinned_host = true; | |||
auto tensor_pinned_input = | |||
Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host); | |||
``` | |||
## User-specified memory allocator
* **Implemented in user_allocator.cpp; the function is config_user_allocator.**
* This example uses a user-defined CPU memory allocator to show how to install a custom Allocator. A user-defined
  allocator must inherit from the lite Allocator base class and implement the allocate and free interfaces. It has
  been verified on CPU; other devices remain to be tested. A rough sketch follows the interface below.
* The custom allocator is installed through the following Network interface:
```C | |||
Network& set_memory_allocator(std::shared_ptr<Allocator> user_allocator); | |||
``` | |||
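A rough sketch of a custom CPU allocator. The allocate/free signatures below are an assumption based on the description above and should be checked against the Allocator declaration in the lite headers; posix_memalign is used, so the sketch is POSIX-only:
```cpp
#include <cstdlib>
#include <memory>
#include "lite/network.h"
#include "lite/tensor.h"

// Sketch only: the exact virtual signatures may differ from the real base class.
class MyCpuAllocator : public lite::Allocator {
public:
    void* allocate(LiteDeviceType device_type, int device_id, size_t size,
                   size_t align) override {
        (void)device_type;
        (void)device_id;
        void* ptr = nullptr;
        // align is assumed to be a power of two and a multiple of sizeof(void*)
        return posix_memalign(&ptr, align, size) == 0 ? ptr : nullptr;
    }
    void free(LiteDeviceType device_type, int device_id, void* ptr) override {
        (void)device_type;
        (void)device_id;
        ::free(ptr);
    }
};

int main() {
    auto network = std::make_shared<lite::Network>();
    network->set_memory_allocator(std::make_shared<MyCpuAllocator>());
    network->load_model("shufflenet.mge");  // placeholder path
    return 0;
}
```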
## Sharing the same weights between multiple Networks
* **Implemented in network_share_weights.cpp; the function is network_share_same_weights.**
* In many cases users want several Networks to share the same weights; since the weights of a model are read-only,
  this reduces the runtime memory usage. This example shows how lite supports this: a new Network is created, and the
  user may give it a new Config, NetworkIO and other settings so that the new Network serves a different purpose.
* The interface for loading a new Network from an existing one is:
```C | |||
static void shared_weight_with_network( | |||
std::shared_ptr<Network> dst_network, | |||
const std::shared_ptr<Network> src_network); | |||
``` | |||
* dst_network: the newly loaded Network.
* src_network: the existing Network that has already been loaded.
## CPU core binding
* **Implemented in cpu_affinity.cpp; the function is cpu_affinity.**
* This example runs the model on multiple CPU threads and uses the Network interface set_runtime_thread_affinity to
  install a core-binding callback. The callback receives the id of the current thread, and the user decides how to
  bind cores based on that id; with n threads in total, the thread with id n-1 is the main thread. A condensed sketch
  follows this list.
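A condensed sketch of the pattern used in cpu_affinity.cpp; the thread count, core ids and model path are illustrative:
```cpp
#include <cstdio>
#include <memory>
#include <vector>
#include "lite/network.h"

int main() {
    auto network = std::make_shared<lite::Network>();
    // request 4 worker threads (set before load_model, as in the example)
    lite::Runtime::set_cpu_threads_number(network, 4);
    network->load_model("shufflenet.mge");  // placeholder path

    std::vector<int> core_ids = {0, 1, 2, 3};
    lite::Runtime::set_runtime_thread_affinity(network, [core_ids](int thread_id) {
        // thread_id is passed in by lite; with n threads, id n-1 is the main
        // thread. Do the actual core binding here (e.g. via sched_setaffinity).
        printf("bind thread %d to core %d\n", thread_id, core_ids[thread_id]);
    });

    network->forward();
    network->wait();
    return 0;
}
```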
## Registering a user-defined decryption algorithm and key
* **Implemented in user_cryption.cpp; the functions are register_cryption_method and update_aes_key.**
* These two examples use lite's interfaces for registering a custom decryption algorithm and for updating it, so that
  a model can be loaded with a user-defined decryption algorithm. A custom decryption method is defined (it actually
  does nothing: it XORs the model with the key twice and returns it, i.e. it returns the original model unchanged) and
  registered into lite; when the Network is created later, the algorithm name is set in bare_model_cryption_name of
  its config. The second example shows how to update the key. A sketch of the registration follows the interfaces
  below.
Lite currently predefines several decryption algorithms:
* AES_default: the key is 32 unsigned chars, 0 to 31 by default.
* RC4_default: the key is 8 unsigned chars, a hash key followed by an enc_key.
* SIMPLE_FAST_RC4_default: the key has the same layout as RC4_default.
The naming convention is roughly: the uppercase part is the algorithm name, and the lowercase part after '_' denotes
the decryption key.
The interfaces are:
```C | |||
bool register_decryption_and_key(std::string decrypt_name, | |||
const DecryptionFunc& func, | |||
const std::vector<uint8_t>& key); | |||
bool update_decryption_or_key(std::string decrypt_name, | |||
const DecryptionFunc& func, | |||
const std::vector<uint8_t>& key); | |||
``` | |||
All three arguments of the register interface must be valid. In the update interface, decrypt_name must be an already
registered decryption algorithm; the non-empty parts of func and key are used to update that algorithm.
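A sketch of registering the no-op XOR idea described above, assuming these functions live in namespace lite and that DecryptionFunc takes the encrypted buffer plus the registered key and returns the decrypted bytes (the exact signature should be checked against the lite headers):
```cpp
#include <cstdint>
#include <vector>
#include "lite/global.h"

// Assumed callback shape: (data, size, key) -> decrypted bytes.
std::vector<uint8_t> xor_twice_decrypt(const void* data, size_t size,
                                       const std::vector<uint8_t>& key) {
    auto* src = static_cast<const uint8_t*>(data);
    std::vector<uint8_t> out(src, src + size);
    // XOR with the key twice, i.e. return the original model unchanged
    for (int round = 0; round < 2; ++round)
        for (size_t i = 0; i < out.size(); ++i)
            out[i] ^= key[i % key.size()];
    return out;
}

int main() {
    // register under a custom name; a Network created afterwards can select it
    // with config.bare_model_cryption_name = "xor_twice"
    lite::register_decryption_and_key("xor_twice", xor_twice_decrypt,
                                      std::vector<uint8_t>{0x11, 0x22});
    return 0;
}
```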
## Asynchronous execution mode
* **Implemented in basic.cpp; the function is async_forward.**
* Registering an asynchronous callback switches the Network's forward to asynchronous execution mode, which is
  currently only supported on CPU and on CUDA 10.0 and above. During asynchronous inference the main thread can do
  other work while the worker thread is computing, instead of waiting, although there is no benefit on some
  single-core processors. A condensed sketch follows this list.
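A condensed sketch of the async pattern used in async_forward (the model path is a placeholder):
```cpp
#include <cstdio>
#include <memory>
#include "lite/network.h"

int main() {
    lite::Config config;
    config.options.var_sanity_check_first_run = false;  // as in async_forward
    auto network = std::make_shared<lite::Network>(config);
    network->load_model("shufflenet.mge");  // placeholder path

    volatile bool finished = false;
    network->set_async_callback([&finished]() {
        // invoked from the worker thread once forward has finished
        finished = true;
    });

    network->forward();  // returns immediately in async mode
    while (!finished) {
        // the main thread is free to do other work here
    }
    printf("async forward finished\n");
    return 0;
}
```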
## Pure C examples
* **Implemented in lite_c_interface.cpp; the functions are basic_c_interface, device_io_c_interface and
  async_c_interface.**
* Lite wraps the C++ interface and exposes a pure C interface. Users who do not depend on lite at the source level
  should integrate through the pure C interface.
* Every pure C interface returns an int; a non-zero value means an error occurred, and LITE_get_last_error must be
  called to get the error message.
* Every pure C get function requires the caller to define the corresponding object first and pass its pointer to the
  interface; lite writes the result to the address that pointer refers to. A sketch of the error-handling pattern
  follows this list.
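A sketch of the error-handling convention, modeled on the LITE_CAPI_CHECK macro used in the C examples in this directory (the model path is a placeholder):
```cpp
#include <cstdio>
#include <cstdlib>
#include "lite-c/global_c.h"
#include "lite-c/network_c.h"

// every pure C API returns an int; non-zero means an error occurred and
// LITE_get_last_error() returns the message
#define LITE_CAPI_CHECK(expr)                                    \
    do {                                                         \
        int ret_ = (expr);                                       \
        if (ret_) {                                              \
            std::fprintf(stderr, "%s\n", LITE_get_last_error()); \
            std::exit(EXIT_FAILURE);                             \
        }                                                        \
    } while (0)

int main() {
    LiteNetwork c_network;
    LITE_CAPI_CHECK(LITE_make_network(&c_network, *default_config(),
                                      *default_network_io()));
    LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, "shufflenet.mge"));
    LITE_CAPI_CHECK(LITE_forward(c_network));
    LITE_CAPI_CHECK(LITE_wait(c_network));
    return 0;
}
```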
@@ -0,0 +1,370 @@ | |||
/** | |||
* \file example/basic.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include <thread> | |||
#include "../example.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include <cstdio>
#include <iostream>
#include "misc.h" | |||
using namespace lite; | |||
using namespace example; | |||
namespace { | |||
void output_info(std::shared_ptr<Network> network, size_t output_size) { | |||
for (size_t index = 0; index < output_size; index++) { | |||
printf("output[%zu] names %s \n", index, | |||
network->get_all_output_name()[index].c_str()); | |||
std::shared_ptr<Tensor> output_tensor = | |||
network->get_output_tensor(index); | |||
size_t ndim = output_tensor->get_layout().ndim; | |||
for (size_t i = 0; i < ndim; i++) { | |||
printf("output[%zu] tensor.shape[%zu] %zu \n", index, i, | |||
output_tensor->get_layout().shapes[i]); | |||
} | |||
} | |||
} | |||
void output_data_info(std::shared_ptr<Network> network, size_t output_size) { | |||
for (size_t index = 0; index < output_size; index++) { | |||
auto output_tensor = network->get_output_tensor(index); | |||
void* out_data = output_tensor->get_memory_ptr(); | |||
size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
output_tensor->get_layout().get_elem_size(); | |||
LiteDataType dtype = output_tensor->get_layout().data_type; | |||
float max = -1000.0f; | |||
float min = 1000.0f; | |||
int max_idx = 0; | |||
int min_idx = 0; | |||
float sum = 0.0f; | |||
#define cb(_dtype, _real_dtype) \ | |||
case LiteDataType::_dtype: { \ | |||
for (size_t i = 0; i < out_length; i++) { \ | |||
_real_dtype data = static_cast<_real_dtype*>(out_data)[i]; \ | |||
sum += data; \ | |||
if (max < data) { \ | |||
max = data; \ | |||
max_idx = i; \ | |||
} \ | |||
if (min > data) { \ | |||
min = data; \ | |||
min_idx = i; \ | |||
} \ | |||
} \ | |||
} break; | |||
switch (dtype) { | |||
cb(LITE_FLOAT, float); | |||
cb(LITE_INT, int); | |||
cb(LITE_INT8, int8_t); | |||
cb(LITE_UINT8, uint8_t); | |||
default: | |||
            printf("unknown datatype\n");
} | |||
printf("output_length %zu index %zu max=%e , max idx=%d, min=%e , min_idx=%d, sum=%e\n", | |||
out_length, index, max, max_idx, min, min_idx, sum); | |||
} | |||
#undef cb | |||
} | |||
} // namespace | |||
#if LITE_WITH_CUDA | |||
bool lite::example::load_from_path_run_cuda(const Args& args) { | |||
std::string network_path = args.model_path; | |||
std::string input_path = args.input_path; | |||
set_log_level(LiteLogLevel::DEBUG); | |||
//! config the network running in CUDA device | |||
lite::Config config{false, -1, LiteDeviceType::LITE_CUDA}; | |||
//! set NetworkIO | |||
NetworkIO network_io; | |||
std::string input_name = "img0_comp_fullface"; | |||
bool is_host = false; | |||
IO device_input{input_name, is_host}; | |||
network_io.inputs.push_back(device_input); | |||
//! create and load the network | |||
std::shared_ptr<Network> network = | |||
std::make_shared<Network>(config, network_io); | |||
network->load_model(network_path); | |||
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
Layout input_layout = input_tensor->get_layout(); | |||
//! read data from numpy data file | |||
auto src_tensor = parse_npy(input_path); | |||
//! malloc the device memory | |||
auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout); | |||
//! copy to the device memory | |||
tensor_device.copy_from(*src_tensor); | |||
    //! Now the device memory is filled with user input data, set it to the
//! input tensor | |||
input_tensor->reset(tensor_device.get_memory_ptr(), input_layout); | |||
//! forward | |||
{ | |||
lite::Timer ltimer("warmup"); | |||
network->forward(); | |||
network->wait(); | |||
ltimer.print_used_time(0); | |||
} | |||
lite::Timer ltimer("forward_iter"); | |||
for (int i = 0; i < 10; i++) { | |||
ltimer.reset_start(); | |||
network->forward(); | |||
network->wait(); | |||
ltimer.print_used_time(i); | |||
} | |||
//! get the output data or read tensor set in network_in | |||
size_t output_size = network->get_all_output_name().size(); | |||
output_info(network, output_size); | |||
output_data_info(network, output_size); | |||
return true; | |||
} | |||
#endif | |||
bool lite::example::basic_load_from_path(const Args& args) { | |||
set_log_level(LiteLogLevel::DEBUG); | |||
std::string network_path = args.model_path; | |||
std::string input_path = args.input_path; | |||
//! create and load the network | |||
std::shared_ptr<Network> network = std::make_shared<Network>(); | |||
network->load_model(network_path); | |||
//! set input data to input tensor | |||
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
auto layout = input_tensor->get_layout(); | |||
for (size_t i = 0; i < layout.ndim; i++) { | |||
printf("model input shape[%zu]=%zu \n", i, layout.shapes[i]); | |||
} | |||
//! copy or forward data to network | |||
size_t length = input_tensor->get_tensor_total_size_in_byte(); | |||
void* dst_ptr = input_tensor->get_memory_ptr(); | |||
auto src_tensor = parse_npy(input_path); | |||
auto layout0 = src_tensor->get_layout(); | |||
for (size_t i = 0; i < layout0.ndim; i++) { | |||
printf("src shape[%zu]=%zu \n", i, layout0.shapes[i]); | |||
} | |||
void* src = src_tensor->get_memory_ptr(); | |||
memcpy(dst_ptr, src, length); | |||
//! forward | |||
{ | |||
lite::Timer ltimer("warmup"); | |||
network->forward(); | |||
network->wait(); | |||
ltimer.print_used_time(0); | |||
} | |||
    lite::Timer ltimer("forward_iter");
    for (int i = 0; i < 10; i++) {
        ltimer.reset_start();
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }
//! get the output data or read tensor set in network_in | |||
size_t output_size = network->get_all_output_name().size(); | |||
output_info(network, output_size); | |||
output_data_info(network, output_size); | |||
return true; | |||
} | |||
bool lite::example::basic_load_from_path_with_loader(const Args& args) { | |||
set_log_level(LiteLogLevel::DEBUG); | |||
lite::set_loader_lib_path(args.loader_path); | |||
std::string network_path = args.model_path; | |||
std::string input_path = args.input_path; | |||
//! create and load the network | |||
std::shared_ptr<Network> network = std::make_shared<Network>(); | |||
network->load_model(network_path); | |||
//! set input data to input tensor | |||
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
auto input_layout = input_tensor->get_layout(); | |||
//! copy or forward data to network | |||
auto src_tensor = parse_npy(input_path); | |||
auto src_layout = src_tensor->get_layout(); | |||
if (src_layout.ndim != input_layout.ndim) { | |||
        printf("src ndim is not equal to the model input ndim\n");
} | |||
//! pay attention the input shape can change | |||
for (size_t i = 0; i < input_layout.ndim; i++) { | |||
if (input_layout.shapes[i] != src_layout.shapes[i]) { | |||
            printf("src shape is not equal to the input shape\n");
} | |||
} | |||
input_tensor->set_layout(src_tensor->get_layout()); | |||
//! reset or forward data to network | |||
input_tensor->reset(src_tensor->get_memory_ptr(), src_tensor->get_layout()); | |||
//! forward | |||
network->forward(); | |||
network->wait(); | |||
//! forward | |||
{ | |||
lite::Timer ltimer("warmup"); | |||
network->forward(); | |||
network->wait(); | |||
ltimer.print_used_time(0); | |||
} | |||
lite::Timer ltimer("forward_iter"); | |||
for (int i = 0; i < 10; i++) { | |||
ltimer.reset_start(); | |||
network->forward(); | |||
network->wait(); | |||
ltimer.print_used_time(i); | |||
} | |||
//! get the output data or read tensor set in network_in | |||
size_t output_size = network->get_all_output_name().size(); | |||
output_info(network, output_size); | |||
output_data_info(network, output_size); | |||
return true; | |||
} | |||
bool lite::example::basic_load_from_memory(const Args& args) { | |||
std::string network_path = args.model_path; | |||
std::string input_path = args.input_path; | |||
//! create and load the network | |||
std::shared_ptr<Network> network = std::make_shared<Network>(); | |||
FILE* fin = fopen(network_path.c_str(), "rb"); | |||
    if (!fin) {
        printf("failed to open %s.\n", network_path.c_str());
        return false;
    }
fseek(fin, 0, SEEK_END); | |||
size_t size = ftell(fin); | |||
fseek(fin, 0, SEEK_SET); | |||
void* ptr = malloc(size); | |||
std::shared_ptr<void> buf{ptr, ::free}; | |||
auto len = fread(buf.get(), 1, size, fin); | |||
    if (len < 1) {
        printf("read file failed.\n");
        fclose(fin);
        return false;
    }
fclose(fin); | |||
network->load_model(buf.get(), size); | |||
//! set input data to input tensor | |||
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
//! copy or forward data to network | |||
size_t length = input_tensor->get_tensor_total_size_in_byte(); | |||
void* dst_ptr = input_tensor->get_memory_ptr(); | |||
auto src_tensor = parse_npy(input_path); | |||
void* src = src_tensor->get_memory_ptr(); | |||
memcpy(dst_ptr, src, length); | |||
//! forward | |||
network->forward(); | |||
network->wait(); | |||
//! get the output data or read tensor set in network_in | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
void* out_data = output_tensor->get_memory_ptr(); | |||
size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
output_tensor->get_layout().get_elem_size(); | |||
printf("length=%zu\n", length); | |||
float max = -1.0f; | |||
float sum = 0.0f; | |||
for (size_t i = 0; i < out_length; i++) { | |||
float data = static_cast<float*>(out_data)[i]; | |||
sum += data; | |||
if (max < data) | |||
max = data; | |||
} | |||
printf("max=%e, sum=%e\n", max, sum); | |||
return true; | |||
} | |||
bool lite::example::async_forward(const Args& args) { | |||
std::string network_path = args.model_path; | |||
std::string input_path = args.input_path; | |||
Config config; | |||
config.options.var_sanity_check_first_run = false; | |||
//! create and load the network | |||
std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
network->load_model(network_path); | |||
//! set input data to input tensor | |||
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
//! copy or forward data to network | |||
size_t length = input_tensor->get_tensor_total_size_in_byte(); | |||
void* dst_ptr = input_tensor->get_memory_ptr(); | |||
auto src_tensor = parse_npy(input_path); | |||
void* src = src_tensor->get_memory_ptr(); | |||
memcpy(dst_ptr, src, length); | |||
//! set async mode and callback | |||
volatile bool finished = false; | |||
network->set_async_callback([&finished]() { | |||
#if !__DEPLOY_ON_XP_SP2__ | |||
std::cout << "worker thread_id:" << std::this_thread::get_id() | |||
<< std::endl; | |||
#endif | |||
finished = true; | |||
}); | |||
#if !__DEPLOY_ON_XP_SP2__ | |||
std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl; | |||
#endif | |||
//! forward | |||
network->forward(); | |||
size_t count = 0; | |||
while (finished == false) { | |||
count++; | |||
} | |||
printf("Forward finish, count is %zu\n", count); | |||
//! get the output data or read tensor set in network_in | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
void* out_data = output_tensor->get_memory_ptr(); | |||
size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
output_tensor->get_layout().get_elem_size(); | |||
printf("length=%zu\n", length); | |||
float max = -1.0f; | |||
float sum = 0.0f; | |||
for (size_t i = 0; i < out_length; i++) { | |||
float data = static_cast<float*>(out_data)[i]; | |||
sum += data; | |||
if (max < data) | |||
max = data; | |||
} | |||
printf("max=%e, sum=%e\n", max, sum); | |||
return true; | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,69 @@ | |||
/** | |||
* \file example/cpu_affinity.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "../example.h" | |||
#if LITE_BUILD_WITH_MGE | |||
using namespace lite; | |||
using namespace example; | |||
bool lite::example::cpu_affinity(const Args& args) { | |||
std::string network_path = args.model_path; | |||
std::string input_path = args.input_path; | |||
//! create and load the network | |||
std::shared_ptr<Network> network = std::make_shared<Network>(); | |||
    //! run with multiple threads
Runtime::set_cpu_threads_number(network, 4); | |||
network->load_model(network_path); | |||
std::vector<int> core_ids = {0, 1, 2, 3}; | |||
auto affinity = [core_ids](int id) { | |||
//! add user define affinity function | |||
set_cpu_affinity({core_ids[id]}); | |||
printf("set thread id = %d with the affinity of core %d.\n", id, | |||
core_ids[id]); | |||
}; | |||
Runtime::set_runtime_thread_affinity(network, affinity); | |||
//! set input data to input tensor | |||
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
//! copy or forward data to network | |||
size_t length = input_tensor->get_tensor_total_size_in_byte(); | |||
void* dst_ptr = input_tensor->get_memory_ptr(); | |||
auto src_tensor = parse_npy(input_path); | |||
void* src = src_tensor->get_memory_ptr(); | |||
memcpy(dst_ptr, src, length); | |||
//! forward | |||
network->forward(); | |||
network->wait(); | |||
//! get the output data or read tensor set in network_in | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
void* out_data = output_tensor->get_memory_ptr(); | |||
size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
output_tensor->get_layout().get_elem_size(); | |||
printf("length=%zu\n", length); | |||
float max = -1.0f; | |||
float sum = 0.0f; | |||
for (size_t i = 0; i < out_length; i++) { | |||
float data = static_cast<float*>(out_data)[i]; | |||
sum += data; | |||
if (max < data) | |||
max = data; | |||
} | |||
printf("max=%e, sum=%e\n", max, sum); | |||
return true; | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,189 @@ | |||
/** | |||
* \file example/device_io.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include <thread> | |||
#include "../example.h" | |||
#if LITE_BUILD_WITH_MGE | |||
using namespace lite; | |||
using namespace example; | |||
#if LITE_WITH_CUDA | |||
bool lite::example::device_input(const Args& args) { | |||
std::string network_path = args.model_path; | |||
std::string input_path = args.input_path; | |||
//! config the network running in CUDA device | |||
lite::Config config{LiteDeviceType::LITE_CUDA}; | |||
//! set NetworkIO | |||
NetworkIO network_io; | |||
std::string input_name = "data"; | |||
bool is_host = false; | |||
IO device_input{input_name, is_host}; | |||
network_io.inputs.push_back(device_input); | |||
//! create and load the network | |||
std::shared_ptr<Network> network = | |||
std::make_shared<Network>(config, network_io); | |||
network->load_model(network_path); | |||
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
Layout input_layout = input_tensor->get_layout(); | |||
//! read data from numpy data file | |||
auto src_tensor = parse_npy(input_path); | |||
//! malloc the device memory | |||
auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout); | |||
//! copy to the device memory | |||
tensor_device.copy_from(*src_tensor); | |||
    //! Now the device memory is filled with user input data, set it to the
//! input tensor | |||
input_tensor->reset(tensor_device.get_memory_ptr(), input_layout); | |||
//! forward | |||
network->forward(); | |||
network->wait(); | |||
//! get the output data or read tensor set in network_in | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
void* out_data = output_tensor->get_memory_ptr(); | |||
size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
output_tensor->get_layout().get_elem_size(); | |||
float max = -1.0f; | |||
float sum = 0.0f; | |||
for (size_t i = 0; i < out_length; i++) { | |||
float data = static_cast<float*>(out_data)[i]; | |||
sum += data; | |||
if (max < data) | |||
max = data; | |||
} | |||
printf("max=%e, sum=%e\n", max, sum); | |||
return true; | |||
} | |||
bool lite::example::device_input_output(const Args& args) { | |||
std::string network_path = args.model_path; | |||
std::string input_path = args.input_path; | |||
//! config the network running in CUDA device | |||
lite::Config config{LiteDeviceType::LITE_CUDA}; | |||
//! set NetworkIO include input and output | |||
NetworkIO network_io; | |||
std::string input_name = "data"; | |||
std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"; | |||
bool is_host = false; | |||
IO device_input{input_name, is_host}; | |||
IO device_output{output_name, is_host}; | |||
network_io.inputs.push_back(device_input); | |||
network_io.outputs.push_back(device_output); | |||
//! create and load the network | |||
std::shared_ptr<Network> network = | |||
std::make_shared<Network>(config, network_io); | |||
network->load_model(network_path); | |||
std::shared_ptr<Tensor> input_tensor_device = network->get_input_tensor(0); | |||
Layout input_layout = input_tensor_device->get_layout(); | |||
//! read data from numpy data file | |||
auto src_tensor = parse_npy(input_path); | |||
//! malloc the device memory | |||
auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout); | |||
//! copy to the device memory | |||
tensor_device.copy_from(*src_tensor); | |||
//! Now the device memory is filled with user input data, set it to the | |||
//! input tensor | |||
input_tensor_device->reset(tensor_device.get_memory_ptr(), input_layout); | |||
//! forward | |||
network->forward(); | |||
network->wait(); | |||
//! output is in device, should copy it to host | |||
std::shared_ptr<Tensor> output_tensor_device = | |||
network->get_io_tensor(output_name); | |||
auto output_tensor = std::make_shared<Tensor>(); | |||
output_tensor->copy_from(*output_tensor_device); | |||
//! get the output data or read tensor set in network_in | |||
void* out_data = output_tensor->get_memory_ptr(); | |||
size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
output_tensor->get_layout().get_elem_size(); | |||
float max = -1.0f; | |||
float sum = 0.0f; | |||
for (size_t i = 0; i < out_length; i++) { | |||
float data = static_cast<float*>(out_data)[i]; | |||
sum += data; | |||
if (max < data) | |||
max = data; | |||
} | |||
printf("max=%e, sum=%e\n", max, sum); | |||
return true; | |||
} | |||
bool lite::example::pinned_host_input(const Args& args) { | |||
std::string network_path = args.model_path; | |||
std::string input_path = args.input_path; | |||
//! config the network running in CUDA device | |||
lite::Config config{LiteDeviceType::LITE_CUDA}; | |||
//! create and load the network | |||
std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
network->load_model(network_path); | |||
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
Layout input_layout = input_tensor->get_layout(); | |||
//! read data from numpy data file | |||
auto src_tensor = parse_npy(input_path); | |||
//! malloc the pinned host memory | |||
bool is_pinned_host = true; | |||
auto tensor_pinned_input = | |||
Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host); | |||
//! copy to the pinned memory | |||
tensor_pinned_input.copy_from(*src_tensor); | |||
//! set the pinned host memory to the network as input | |||
input_tensor->reset(tensor_pinned_input.get_memory_ptr(), input_layout); | |||
//! forward | |||
network->forward(); | |||
network->wait(); | |||
//! get the output data or read tensor set in network_in | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
void* out_data = output_tensor->get_memory_ptr(); | |||
size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
output_tensor->get_layout().get_elem_size(); | |||
float max = -1.0f; | |||
float sum = 0.0f; | |||
for (size_t i = 0; i < out_length; i++) { | |||
float data = static_cast<float*>(out_data)[i]; | |||
sum += data; | |||
if (max < data) | |||
max = data; | |||
} | |||
printf("max=%e, sum=%e\n", max, sum); | |||
return true; | |||
} | |||
#endif | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,224 @@ | |||
/** | |||
 * \file example/lite_c_interface.cpp
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "../example.h" | |||
#include "misc.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "lite-c/global_c.h" | |||
#include "lite-c/network_c.h" | |||
#include "lite-c/tensor_c.h" | |||
#include <iostream>
#include <thread>
#define LITE_CAPI_CHECK(_expr) \ | |||
do { \ | |||
int _ret = (_expr); \ | |||
if (_ret) { \ | |||
LITE_THROW(LITE_get_last_error()); \ | |||
} \ | |||
} while (0) | |||
bool basic_c_interface(const lite::example::Args& args) { | |||
std::string network_path = args.model_path; | |||
std::string input_path = args.input_path; | |||
//! read input data to lite::tensor | |||
auto src_tensor = lite::example::parse_npy(input_path); | |||
void* src_ptr = src_tensor->get_memory_ptr(); | |||
//! create and load the network | |||
LiteNetwork c_network; | |||
LITE_CAPI_CHECK( | |||
LITE_make_network(&c_network, *default_config(), *default_network_io())); | |||
LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str())); | |||
//! set input data to input tensor | |||
LiteTensor c_input_tensor; | |||
LITE_CAPI_CHECK( | |||
LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor)); | |||
void* dst_ptr; | |||
size_t length_in_byte; | |||
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor, | |||
&length_in_byte)); | |||
LITE_CAPI_CHECK(LITE_get_tensor_memory(c_input_tensor, &dst_ptr)); | |||
//! copy or forward data to network | |||
memcpy(dst_ptr, src_ptr, length_in_byte); | |||
//! forward | |||
LITE_CAPI_CHECK(LITE_forward(c_network)); | |||
LITE_CAPI_CHECK(LITE_wait(c_network)); | |||
//! get the output data or read tensor data | |||
const char* output_name; | |||
LiteTensor c_output_tensor; | |||
//! get the first output tensor name | |||
LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); | |||
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO, | |||
&c_output_tensor)); | |||
void* output_ptr; | |||
size_t length_output_in_byte; | |||
LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr)); | |||
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_output_tensor, | |||
&length_output_in_byte)); | |||
size_t out_length = length_output_in_byte / sizeof(float); | |||
printf("length=%zu\n", out_length); | |||
float max = -1.0f; | |||
float sum = 0.0f; | |||
for (size_t i = 0; i < out_length; i++) { | |||
float data = static_cast<float*>(output_ptr)[i]; | |||
sum += data; | |||
if (max < data) | |||
max = data; | |||
} | |||
printf("max=%e, sum=%e\n", max, sum); | |||
return true; | |||
} | |||
bool device_io_c_interface(const lite::example::Args& args) { | |||
std::string network_path = args.model_path; | |||
std::string input_path = args.input_path; | |||
//! read input data to lite::tensor | |||
auto src_tensor = lite::example::parse_npy(input_path); | |||
void* src_ptr = src_tensor->get_memory_ptr(); | |||
size_t length_read_in = src_tensor->get_tensor_total_size_in_byte(); | |||
//! create and load the network | |||
LiteNetwork c_network; | |||
LITE_CAPI_CHECK( | |||
LITE_make_network(&c_network, *default_config(), *default_network_io())); | |||
LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str())); | |||
//! set input data to input tensor | |||
LiteTensor c_input_tensor; | |||
size_t length_tensor_in; | |||
LITE_CAPI_CHECK( | |||
LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor)); | |||
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor, | |||
&length_tensor_in)); | |||
if (length_read_in != length_tensor_in) { | |||
        LITE_THROW("The input data size does not match the network input "
                   "tensor size.\n");
} | |||
LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, src_ptr, | |||
length_tensor_in)); | |||
//! reset the output tensor memory with user allocated memory | |||
size_t out_length = 1000; | |||
LiteLayout output_layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}; | |||
std::shared_ptr<float> ptr(new float[out_length], | |||
[](float* ptr) { delete[] ptr; }); | |||
const char* output_name; | |||
LiteTensor c_output_tensor; | |||
LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); | |||
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO, | |||
&c_output_tensor)); | |||
LITE_CAPI_CHECK( | |||
LITE_reset_tensor(c_output_tensor, output_layout, ptr.get())); | |||
//! forward | |||
LITE_CAPI_CHECK(LITE_forward(c_network)); | |||
LITE_CAPI_CHECK(LITE_wait(c_network)); | |||
printf("length=%zu\n", out_length); | |||
float max = -1.0f; | |||
float sum = 0.0f; | |||
void* out_data = ptr.get(); | |||
for (size_t i = 0; i < out_length; i++) { | |||
float data = static_cast<float*>(out_data)[i]; | |||
sum += data; | |||
if (max < data) | |||
max = data; | |||
} | |||
printf("max=%e, sum=%e\n", max, sum); | |||
return true; | |||
} | |||
namespace { | |||
volatile bool finished = false; | |||
int async_callback(void) { | |||
#if !__DEPLOY_ON_XP_SP2__ | |||
std::cout << "worker thread_id:" << std::this_thread::get_id() << std::endl; | |||
#endif | |||
finished = true; | |||
return 0; | |||
} | |||
} // namespace | |||
bool async_c_interface(const lite::example::Args& args) { | |||
std::string network_path = args.model_path; | |||
std::string input_path = args.input_path; | |||
//! read input data to lite::tensor | |||
auto src_tensor = lite::example::parse_npy(input_path); | |||
void* src_ptr = src_tensor->get_memory_ptr(); | |||
LiteNetwork c_network; | |||
LiteConfig config = *default_config(); | |||
config.options.var_sanity_check_first_run = false; | |||
LITE_CAPI_CHECK(LITE_make_network(&c_network, config, *default_network_io())); | |||
LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, network_path.c_str())); | |||
//! set input data to input tensor | |||
LiteTensor c_input_tensor; | |||
size_t length_tensor_in; | |||
LITE_CAPI_CHECK( | |||
LITE_get_io_tensor(c_network, "data", LITE_IO, &c_input_tensor)); | |||
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_input_tensor, | |||
&length_tensor_in)); | |||
LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, src_ptr, | |||
length_tensor_in)); | |||
#if !__DEPLOY_ON_XP_SP2__ | |||
std::cout << "user thread_id:" << std::this_thread::get_id() << std::endl; | |||
#endif | |||
LITE_CAPI_CHECK(LITE_set_async_callback(c_network, async_callback)); | |||
//! forward | |||
LITE_CAPI_CHECK(LITE_forward(c_network)); | |||
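    //! busy-wait until the async callback sets `finished`; the counter only
    //! shows that the user thread keeps running while inference is async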
size_t count = 0; | |||
while (finished == false) { | |||
count++; | |||
} | |||
printf("The count is %zu\n", count); | |||
finished = false; | |||
//! get the output data or read tensor data | |||
const char* output_name; | |||
LiteTensor c_output_tensor; | |||
//! get the first output tensor name | |||
LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); | |||
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO, | |||
&c_output_tensor)); | |||
void* output_ptr; | |||
size_t length_output_in_byte; | |||
LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr)); | |||
LITE_CAPI_CHECK(LITE_get_tensor_total_size_in_byte(c_output_tensor, | |||
&length_output_in_byte)); | |||
size_t out_length = length_output_in_byte / sizeof(float); | |||
printf("length=%zu\n", out_length); | |||
float max = -1.0f; | |||
float sum = 0.0f; | |||
for (size_t i = 0; i < out_length; i++) { | |||
float data = static_cast<float*>(output_ptr)[i]; | |||
sum += data; | |||
if (max < data) | |||
max = data; | |||
} | |||
printf("max=%e, sum=%e\n", max, sum); | |||
return true; | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,78 @@ | |||
/** | |||
* \file example/network_share_weights.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "../example.h" | |||
#if LITE_BUILD_WITH_MGE | |||
using namespace lite; | |||
using namespace example; | |||
bool lite::example::network_share_same_weights(const Args& args) { | |||
std::string network_path = args.model_path; | |||
std::string input_path = args.input_path; | |||
//! create and load the network | |||
std::shared_ptr<Network> network = std::make_shared<Network>(); | |||
network->load_model(network_path); | |||
//! load a new network from the created network and share the same weights, | |||
Config config_new; | |||
config_new.options.const_shape = true; | |||
NetworkIO network_io_new; | |||
std::shared_ptr<Network> weight_shared_network = | |||
std::make_shared<Network>(config_new, network_io_new); | |||
Runtime::shared_weight_with_network(weight_shared_network, network); | |||
//! set input data to input tensor | |||
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
void* dst_ptr = input_tensor->get_memory_ptr(); | |||
std::shared_ptr<Tensor> input_tensor2 = | |||
weight_shared_network->get_input_tensor(0); | |||
void* dst_ptr2 = input_tensor2->get_memory_ptr(); | |||
//! copy or forward data to network | |||
size_t length = input_tensor->get_tensor_total_size_in_byte(); | |||
auto src_tensor = parse_npy(input_path); | |||
void* src = src_tensor->get_memory_ptr(); | |||
memcpy(dst_ptr, src, length); | |||
memcpy(dst_ptr2, src, length); | |||
//! forward | |||
network->forward(); | |||
network->wait(); | |||
weight_shared_network->forward(); | |||
weight_shared_network->wait(); | |||
//! get the output data or read tensor set in network_in | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
std::shared_ptr<Tensor> output_tensor2 = | |||
weight_shared_network->get_output_tensor(0); | |||
void* out_data = output_tensor->get_memory_ptr(); | |||
void* out_data2 = output_tensor2->get_memory_ptr(); | |||
size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
output_tensor->get_layout().get_elem_size(); | |||
printf("length=%zu\n", length); | |||
float max = -1.0f; | |||
float sum = 0.0f; | |||
for (size_t i = 0; i < out_length; i++) { | |||
float data = static_cast<float*>(out_data)[i]; | |||
float data2 = static_cast<float*>(out_data2)[i]; | |||
if (data != data2) { | |||
printf("the result between the origin network and weight share " | |||
"netwrok is different.\n"); | |||
} | |||
sum += data; | |||
if (max < data) | |||
max = data; | |||
} | |||
printf("max=%e, sum=%e\n", max, sum); | |||
return true; | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,95 @@ | |||
/** | |||
* \file example/reset_io.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "../example.h" | |||
#if LITE_BUILD_WITH_MGE | |||
using namespace lite; | |||
using namespace example; | |||
bool lite::example::reset_input(const Args& args) { | |||
std::string network_path = args.model_path; | |||
std::string input_path = args.input_path; | |||
lite::Config config; | |||
//! create and load the network | |||
std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
network->load_model(network_path); | |||
//! set input data to input tensor | |||
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
auto layout = input_tensor->get_layout(); | |||
auto src_tensor = parse_npy(input_path); | |||
void* src = src_tensor->get_memory_ptr(); | |||
input_tensor->reset(src, layout); | |||
//! forward | |||
network->forward(); | |||
network->wait(); | |||
    //! get the output data or read tensor set in network_in
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
void* out_data = output_tensor->get_memory_ptr(); | |||
size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
output_tensor->get_layout().get_elem_size(); | |||
float max = -1.0f; | |||
float sum = 0.0f; | |||
for (size_t i = 0; i < out_length; i++) { | |||
float data = static_cast<float*>(out_data)[i]; | |||
sum += data; | |||
if (max < data) | |||
max = data; | |||
} | |||
printf("max=%e, sum=%e\n", max, sum); | |||
return true; | |||
} | |||
bool lite::example::reset_input_output(const Args& args) { | |||
std::string network_path = args.model_path; | |||
std::string input_path = args.input_path; | |||
lite::Config config; | |||
//! create and load the network | |||
std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
network->load_model(network_path); | |||
//! set input data to input tensor | |||
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
auto layout = input_tensor->get_layout(); | |||
auto src_tensor = parse_npy(input_path); | |||
void* src = src_tensor->get_memory_ptr(); | |||
input_tensor->reset(src, layout); | |||
//! set output ptr to store the network output | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
auto result_tensor = std::make_shared<Tensor>( | |||
LiteDeviceType::LITE_CPU, | |||
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
void* out_data = result_tensor->get_memory_ptr(); | |||
output_tensor->reset(out_data, result_tensor->get_layout()); | |||
network->forward(); | |||
network->wait(); | |||
float max = -1.0f; | |||
float sum = 0.0f; | |||
for (size_t i = 0; i < 1000; i++) { | |||
float data = static_cast<float*>(out_data)[i]; | |||
sum += data; | |||
if (max < data) | |||
max = data; | |||
} | |||
printf("max=%e, sum=%e\n", max, sum); | |||
return true; | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,89 @@ | |||
/** | |||
* \file example/user_allocator.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "../example.h" | |||
#if LITE_BUILD_WITH_MGE | |||
using namespace lite; | |||
using namespace example; | |||
namespace { | |||
class CheckAllocator : public lite::Allocator { | |||
public: | |||
//! allocate memory of size in the given device with the given align | |||
void* allocate(LiteDeviceType, int, size_t size, size_t align) override { | |||
#ifdef WIN32 | |||
return _aligned_malloc(size, align); | |||
#elif defined(__ANDROID__) || defined(ANDROID) | |||
return memalign(align, size); | |||
#else | |||
void* ptr = nullptr; | |||
auto err = posix_memalign(&ptr, align, size); | |||
        //! posix_memalign returns 0 on success and an error code on failure
        if (err) {
            printf("failed to malloc %zu bytes with align %zu\n", size, align);
} | |||
return ptr; | |||
#endif | |||
}; | |||
//! free the memory pointed by ptr in the given device | |||
void free(LiteDeviceType, int, void* ptr) override { | |||
#ifdef WIN32 | |||
_aligned_free(ptr); | |||
#else | |||
::free(ptr); | |||
#endif | |||
}; | |||
}; | |||
} // namespace | |||
bool lite::example::config_user_allocator(const Args& args) { | |||
std::string network_path = args.model_path; | |||
std::string input_path = args.input_path; | |||
auto allocator = std::make_shared<CheckAllocator>(); | |||
//! create and load the network | |||
std::shared_ptr<Network> network = std::make_shared<Network>(); | |||
Runtime::set_memory_allocator(network, allocator); | |||
network->load_model(network_path); | |||
//! set input data to input tensor | |||
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
//! copy or forward data to network | |||
size_t length = input_tensor->get_tensor_total_size_in_byte(); | |||
void* dst_ptr = input_tensor->get_memory_ptr(); | |||
auto src_tensor = parse_npy(input_path); | |||
void* src = src_tensor->get_memory_ptr(); | |||
memcpy(dst_ptr, src, length); | |||
//! forward | |||
network->forward(); | |||
network->wait(); | |||
//! get the output data or read tensor set in network_in | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
void* out_data = output_tensor->get_memory_ptr(); | |||
size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
output_tensor->get_layout().get_elem_size(); | |||
printf("length=%zu\n", length); | |||
float max = -1.0f; | |||
float sum = 0.0f; | |||
for (size_t i = 0; i < out_length; i++) { | |||
float data = static_cast<float*>(out_data)[i]; | |||
sum += data; | |||
if (max < data) | |||
max = data; | |||
} | |||
printf("max=%e, sum=%e\n", max, sum); | |||
return true; | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,122 @@ | |||
/** | |||
* \file example/user_cryption.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "../example.h" | |||
#if LITE_BUILD_WITH_MGE | |||
using namespace lite; | |||
using namespace example; | |||
namespace { | |||
std::vector<uint8_t> decrypt_model(const void* model_mem, size_t size, | |||
const std::vector<uint8_t>& key) { | |||
if (key.size() == 1) { | |||
std::vector<uint8_t> ret(size, 0); | |||
const uint8_t* ptr = static_cast<const uint8_t*>(model_mem); | |||
uint8_t key_data = key[0]; | |||
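        //! XOR-ing twice with the same key byte cancels out, so this demo
        //! "decryption" simply copies the model bytes unchanged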
for (size_t i = 0; i < size; i++) { | |||
ret[i] = ptr[i] ^ key_data ^ key_data; | |||
} | |||
return ret; | |||
} else { | |||
printf("the user define decrypt method key length is wrong.\n"); | |||
return {}; | |||
} | |||
} | |||
} // namespace | |||
bool lite::example::register_cryption_method(const Args& args) { | |||
std::string network_path = args.model_path; | |||
std::string input_path = args.input_path; | |||
//! register the decryption method | |||
register_decryption_and_key("just_for_test", decrypt_model, {15}); | |||
lite::Config config; | |||
config.bare_model_cryption_name = "just_for_test"; | |||
//! create and load the network | |||
std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
network->load_model(network_path); | |||
//! set input data to input tensor | |||
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
auto layout = input_tensor->get_layout(); | |||
auto src_tensor = parse_npy(input_path); | |||
void* src = src_tensor->get_memory_ptr(); | |||
input_tensor->reset(src, layout); | |||
//! forward | |||
network->forward(); | |||
network->wait(); | |||
//! get the output data or read tensor set in network_in | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
void* out_data = output_tensor->get_memory_ptr(); | |||
size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
output_tensor->get_layout().get_elem_size(); | |||
float max = -1.0f; | |||
float sum = 0.0f; | |||
for (size_t i = 0; i < out_length; i++) { | |||
float data = static_cast<float*>(out_data)[i]; | |||
sum += data; | |||
if (max < data) | |||
max = data; | |||
} | |||
printf("max=%e, sum=%e\n", max, sum); | |||
return true; | |||
} | |||
bool lite::example::update_cryption_key(const Args& args) { | |||
std::string network_path = args.model_path; | |||
std::string input_path = args.input_path; | |||
//! update the decryption method key | |||
std::vector<uint8_t> key(32, 0); | |||
for (size_t i = 0; i < 32; i++) { | |||
key[i] = 31 - i; | |||
} | |||
update_decryption_or_key("AES_default", nullptr, key); | |||
lite::Config config; | |||
config.bare_model_cryption_name = "AES_default"; | |||
//! create and load the network | |||
std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
network->load_model(network_path); | |||
//! set input data to input tensor | |||
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
auto layout = input_tensor->get_layout(); | |||
auto src_tensor = parse_npy(input_path); | |||
void* src = src_tensor->get_memory_ptr(); | |||
input_tensor->reset(src, layout); | |||
//! forward | |||
network->forward(); | |||
network->wait(); | |||
//! get the output data or read tensor set in network_in | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
void* out_data = output_tensor->get_memory_ptr(); | |||
size_t out_length = output_tensor->get_tensor_total_size_in_byte() / | |||
output_tensor->get_layout().get_elem_size(); | |||
float max = -1.0f; | |||
float sum = 0.0f; | |||
for (size_t i = 0; i < out_length; i++) { | |||
float data = static_cast<float*>(out_data)[i]; | |||
sum += data; | |||
if (max < data) | |||
max = data; | |||
} | |||
printf("max=%e, sum=%e\n", max, sum); | |||
return true; | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,638 @@ | |||
/* | |||
Copyright 2017 Leon Merten Lohse | |||
Permission is hereby granted, free of charge, to any person obtaining a copy | |||
of this software and associated documentation files (the "Software"), to deal | |||
in the Software without restriction, including without limitation the rights | |||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
copies of the Software, and to permit persons to whom the Software is | |||
furnished to do so, subject to the following conditions: | |||
The above copyright notice and this permission notice shall be included in | |||
all copies or substantial portions of the Software. | |||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
SOFTWARE. | |||
*/ | |||
/* | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#ifndef NPY_H | |||
#define NPY_H | |||
#include <algorithm> | |||
#include <complex> | |||
#include <cstdint> | |||
#include <cstring> | |||
#include <fstream> | |||
#include <iostream> | |||
#include <regex> | |||
#include <sstream> | |||
#include <stdexcept> | |||
#include <string> | |||
#include <unordered_map> | |||
#include <vector> | |||
namespace npy { | |||
/* Compile-time test for byte order. | |||
If your compiler does not define these per default, you may want to define | |||
one of these constants manually. | |||
Defaults to little endian order. */ | |||
#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || \ | |||
defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \ | |||
defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || \ | |||
defined(__MIBSEB) || defined(__MIBSEB__) | |||
const bool big_endian = true; | |||
#else | |||
const bool big_endian = false; | |||
#endif | |||
const char magic_string[] = "\x93NUMPY"; | |||
const size_t magic_string_length = 6; | |||
const char little_endian_char = '<'; | |||
const char big_endian_char = '>'; | |||
const char no_endian_char = '|'; | |||
constexpr char host_endian_char = | |||
(big_endian ? big_endian_char : little_endian_char); | |||
/* npy array length */ | |||
typedef unsigned long int ndarray_len_t; | |||
inline void write_magic(std::ostream& ostream, unsigned char v_major = 1, | |||
unsigned char v_minor = 0) { | |||
ostream.write(magic_string, magic_string_length); | |||
ostream.put(v_major); | |||
ostream.put(v_minor); | |||
} | |||
inline void read_magic(std::istream& istream, unsigned char& v_major, | |||
unsigned char& v_minor) { | |||
char buf[magic_string_length + 2]; | |||
istream.read(buf, magic_string_length + 2); | |||
if (!istream) { | |||
fprintf(stderr, "io error: failed reading file"); | |||
} | |||
if (0 != std::memcmp(buf, magic_string, magic_string_length)) { | |||
fprintf(stderr, "this file does not have a valid npy format."); | |||
} | |||
v_major = buf[magic_string_length]; | |||
v_minor = buf[magic_string_length + 1]; | |||
} | |||
// typestring magic | |||
struct Typestring { | |||
private: | |||
char c_endian; | |||
char c_type; | |||
int len; | |||
public: | |||
inline std::string str() { | |||
const size_t max_buflen = 16; | |||
char buf[max_buflen]; | |||
std::sprintf(buf, "%c%c%u", c_endian, c_type, len); | |||
return std::string(buf); | |||
} | |||
Typestring(const std::vector<float>&) | |||
: c_endian{host_endian_char}, c_type{'f'}, len{sizeof(float)} {} | |||
Typestring(const std::vector<double>&) | |||
: c_endian{host_endian_char}, c_type{'f'}, len{sizeof(double)} {} | |||
Typestring(const std::vector<long double>&) | |||
: c_endian{host_endian_char}, | |||
c_type{'f'}, | |||
len{sizeof(long double)} {} | |||
Typestring(const std::vector<char>&) | |||
: c_endian{no_endian_char}, c_type{'i'}, len{sizeof(char)} {} | |||
Typestring(const std::vector<short>&) | |||
: c_endian{host_endian_char}, c_type{'i'}, len{sizeof(short)} {} | |||
Typestring(const std::vector<int>&) | |||
: c_endian{host_endian_char}, c_type{'i'}, len{sizeof(int)} {} | |||
Typestring(const std::vector<long>&) | |||
: c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long)} {} | |||
Typestring(const std::vector<long long>&) | |||
: c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long long)} {} | |||
Typestring(const std::vector<unsigned char>&) | |||
: c_endian{no_endian_char}, | |||
c_type{'u'}, | |||
len{sizeof(unsigned char)} {} | |||
Typestring(const std::vector<unsigned short>&) | |||
: c_endian{host_endian_char}, | |||
c_type{'u'}, | |||
len{sizeof(unsigned short)} {} | |||
Typestring(const std::vector<unsigned int>&) | |||
: c_endian{host_endian_char}, | |||
c_type{'u'}, | |||
len{sizeof(unsigned int)} {} | |||
Typestring(const std::vector<unsigned long>&) | |||
: c_endian{host_endian_char}, | |||
c_type{'u'}, | |||
len{sizeof(unsigned long)} {} | |||
Typestring(const std::vector<unsigned long long>&) | |||
: c_endian{host_endian_char}, | |||
c_type{'u'}, | |||
len{sizeof(unsigned long long)} {} | |||
Typestring(const std::vector<std::complex<float>>&) | |||
: c_endian{host_endian_char}, | |||
c_type{'c'}, | |||
len{sizeof(std::complex<float>)} {} | |||
Typestring(const std::vector<std::complex<double>>&) | |||
: c_endian{host_endian_char}, | |||
c_type{'c'}, | |||
len{sizeof(std::complex<double>)} {} | |||
Typestring(const std::vector<std::complex<long double>>&) | |||
: c_endian{host_endian_char}, | |||
c_type{'c'}, | |||
len{sizeof(std::complex<long double>)} {} | |||
}; | |||
inline void parse_typestring(std::string typestring) { | |||
std::regex re("'([<>|])([ifuc])(\\d+)'"); | |||
std::smatch sm; | |||
std::regex_match(typestring, sm, re); | |||
if (sm.size() != 4) { | |||
fprintf(stderr, "invalid typestring"); | |||
} | |||
} | |||
namespace pyparse { | |||
/** | |||
Removes leading and trailing whitespaces | |||
*/ | |||
inline std::string trim(const std::string& str) { | |||
const std::string whitespace = " \t"; | |||
auto begin = str.find_first_not_of(whitespace); | |||
if (begin == std::string::npos) | |||
return ""; | |||
auto end = str.find_last_not_of(whitespace); | |||
return str.substr(begin, end - begin + 1); | |||
} | |||
inline std::string get_value_from_map(const std::string& mapstr) { | |||
size_t sep_pos = mapstr.find_first_of(":"); | |||
if (sep_pos == std::string::npos) | |||
return ""; | |||
std::string tmp = mapstr.substr(sep_pos + 1); | |||
return trim(tmp); | |||
} | |||
/** | |||
Parses the string representation of a Python dict | |||
The keys need to be known and may not appear anywhere else in the data. | |||
*/ | |||
inline std::unordered_map<std::string, std::string> parse_dict( | |||
std::string in, std::vector<std::string>& keys) { | |||
std::unordered_map<std::string, std::string> map; | |||
if (keys.size() == 0) | |||
return map; | |||
in = trim(in); | |||
// unwrap dictionary | |||
if ((in.front() == '{') && (in.back() == '}')) | |||
in = in.substr(1, in.length() - 2); | |||
else { | |||
fprintf(stderr, "Not a Python dictionary."); | |||
} | |||
std::vector<std::pair<size_t, std::string>> positions; | |||
for (auto const& value : keys) { | |||
size_t pos = in.find("'" + value + "'"); | |||
if (pos == std::string::npos) { | |||
fprintf(stderr, "Missing %s key.", value.c_str()); | |||
} | |||
std::pair<size_t, std::string> position_pair{pos, value}; | |||
positions.push_back(position_pair); | |||
} | |||
// sort by position in dict | |||
std::sort(positions.begin(), positions.end()); | |||
for (size_t i = 0; i < positions.size(); ++i) { | |||
std::string raw_value; | |||
size_t begin{positions[i].first}; | |||
size_t end{std::string::npos}; | |||
std::string key = positions[i].second; | |||
if (i + 1 < positions.size()) | |||
end = positions[i + 1].first; | |||
raw_value = in.substr(begin, end - begin); | |||
raw_value = trim(raw_value); | |||
if (raw_value.back() == ',') | |||
raw_value.pop_back(); | |||
map[key] = get_value_from_map(raw_value); | |||
} | |||
return map; | |||
} | |||
/** | |||
Parses the string representation of a Python boolean | |||
*/ | |||
inline bool parse_bool(const std::string& in) { | |||
if (in == "True") | |||
return true; | |||
if (in == "False") | |||
return false; | |||
fprintf(stderr, "Invalid python boolan."); | |||
return false; | |||
} | |||
/** | |||
Parses the string representation of a Python str | |||
*/ | |||
inline std::string parse_str(const std::string& in) { | |||
if ((in.front() == '\'') && (in.back() == '\'')) | |||
return in.substr(1, in.length() - 2); | |||
fprintf(stderr, "Invalid python string."); | |||
return ""; | |||
} | |||
/** | |||
 Parses the string representation of a Python tuple into a vector of its items
*/ | |||
inline std::vector<std::string> parse_tuple(std::string in) { | |||
std::vector<std::string> v; | |||
const char seperator = ','; | |||
in = trim(in); | |||
if ((in.front() == '(') && (in.back() == ')')) | |||
in = in.substr(1, in.length() - 2); | |||
else { | |||
fprintf(stderr, "Invalid Python tuple."); | |||
} | |||
std::istringstream iss(in); | |||
for (std::string token; std::getline(iss, token, seperator);) { | |||
v.push_back(token); | |||
} | |||
return v; | |||
} | |||
template <typename T> | |||
inline std::string write_tuple(const std::vector<T>& v) { | |||
if (v.size() == 0) | |||
return ""; | |||
std::ostringstream ss; | |||
if (v.size() == 1) { | |||
ss << "(" << v.front() << ",)"; | |||
} else { | |||
const std::string delimiter = ", "; | |||
// v.size() > 1 | |||
ss << "("; | |||
std::copy(v.begin(), v.end() - 1, | |||
std::ostream_iterator<T>(ss, delimiter.c_str())); | |||
ss << v.back(); | |||
ss << ")"; | |||
} | |||
return ss.str(); | |||
} | |||
inline std::string write_boolean(bool b) { | |||
if (b) | |||
return "True"; | |||
else | |||
return "False"; | |||
} | |||
} // namespace pyparse | |||
inline void parse_header(std::string header, std::string& descr) { | |||
/* | |||
    The first 6 bytes are a magic string: exactly "\x93NUMPY".
    The next 1 byte is an unsigned byte: the major version number of the file
    format, e.g. \x01. The next 1 byte is an unsigned byte: the minor version
    number of the file format, e.g. \x00. Note: the version of the file format
    is not tied to the version of the numpy package. The next 2 bytes form a
    little-endian unsigned short int: the length of the header data
    HEADER_LEN. The next HEADER_LEN bytes form the header data describing the
    array's format. It is an ASCII string which contains a Python literal
    expression of a dictionary. It is terminated by a newline ('\n') and
    padded with spaces
    ('\x20') to make the total length of the magic string + 4 + HEADER_LEN be
evenly divisible by 16 for alignment purposes. The dictionary contains | |||
three keys: | |||
"descr" : dtype.descr | |||
An object that can be passed as an argument to the numpy.dtype() | |||
constructor to create the array's dtype. For repeatability and | |||
readability, this dictionary is formatted using pprint.pformat() so the | |||
keys are in alphabetic order. | |||
*/ | |||
// remove trailing newline | |||
if (header.back() != '\n') | |||
fprintf(stderr, "invalid header"); | |||
header.pop_back(); | |||
// parse the dictionary | |||
std::vector<std::string> keys{"descr"}; | |||
auto dict_map = npy::pyparse::parse_dict(header, keys); | |||
if (dict_map.size() == 0) | |||
fprintf(stderr, "invalid dictionary in header"); | |||
std::string descr_s = dict_map["descr"]; | |||
parse_typestring(descr_s); | |||
// remove | |||
descr = npy::pyparse::parse_str(descr_s); | |||
return; | |||
} | |||
inline void parse_header(std::string header, std::string& descr, | |||
bool& fortran_order, | |||
std::vector<ndarray_len_t>& shape) { | |||
/* | |||
    The first 6 bytes are a magic string: exactly "\x93NUMPY".
    The next 1 byte is an unsigned byte: the major version number of the file
    format, e.g. \x01. The next 1 byte is an unsigned byte: the minor version
    number of the file format, e.g. \x00. Note: the version of the file format
    is not tied to the version of the numpy package. The next 2 bytes form a
    little-endian unsigned short int: the length of the header data
    HEADER_LEN. The next HEADER_LEN bytes form the header data describing the
    array's format. It is an ASCII string which contains a Python literal
    expression of a dictionary. It is terminated by a newline ('\n') and
    padded with spaces
    ('\x20') to make the total length of the magic string + 4 + HEADER_LEN be
evenly divisible by 16 for alignment purposes. The dictionary contains | |||
three keys: | |||
"descr" : dtype.descr | |||
An object that can be passed as an argument to the numpy.dtype() | |||
constructor to create the array's dtype. "fortran_order" : bool Whether | |||
the array data is Fortran-contiguous or not. Since Fortran-contiguous | |||
arrays are a common form of non-C-contiguity, we allow them to be written | |||
directly to disk for efficiency. "shape" : tuple of int The shape of the | |||
array. For repeatability and readability, this dictionary is formatted | |||
using pprint.pformat() so the keys are in alphabetic order. | |||
*/ | |||
// remove trailing newline | |||
if (header.back() != '\n') | |||
fprintf(stderr, "invalid header"); | |||
header.pop_back(); | |||
// parse the dictionary | |||
std::vector<std::string> keys{"descr", "fortran_order", "shape"}; | |||
auto dict_map = npy::pyparse::parse_dict(header, keys); | |||
if (dict_map.size() == 0) | |||
fprintf(stderr, "invalid dictionary in header"); | |||
std::string descr_s = dict_map["descr"]; | |||
std::string fortran_s = dict_map["fortran_order"]; | |||
std::string shape_s = dict_map["shape"]; | |||
// TODO: extract info from typestring | |||
parse_typestring(descr_s); | |||
// remove | |||
descr = npy::pyparse::parse_str(descr_s); | |||
// convert literal Python bool to C++ bool | |||
fortran_order = npy::pyparse::parse_bool(fortran_s); | |||
// parse the shape tuple | |||
auto shape_v = npy::pyparse::parse_tuple(shape_s); | |||
if (shape_v.size() == 0) | |||
fprintf(stderr, "invalid shape tuple in header"); | |||
for (auto item : shape_v) { | |||
ndarray_len_t dim = static_cast<ndarray_len_t>(std::stoul(item)); | |||
shape.push_back(dim); | |||
} | |||
} | |||
inline std::string write_header_dict(const std::string& descr, | |||
bool fortran_order, | |||
const std::vector<ndarray_len_t>& shape) { | |||
std::string s_fortran_order = npy::pyparse::write_boolean(fortran_order); | |||
std::string shape_s = npy::pyparse::write_tuple(shape); | |||
return "{'descr': '" + descr + "', 'fortran_order': " + s_fortran_order + | |||
", 'shape': " + shape_s + ", }"; | |||
} | |||
inline void write_header(std::ostream& out, const std::string& descr, | |||
bool fortran_order, | |||
const std::vector<ndarray_len_t>& shape_v) { | |||
std::string header_dict = write_header_dict(descr, fortran_order, shape_v); | |||
size_t length = magic_string_length + 2 + 2 + header_dict.length() + 1; | |||
unsigned char version[2] = {1, 0}; | |||
if (length >= 255 * 255) { | |||
length = magic_string_length + 2 + 4 + header_dict.length() + 1; | |||
version[0] = 2; | |||
version[1] = 0; | |||
} | |||
size_t padding_len = 16 - length % 16; | |||
std::string padding(padding_len, ' '); | |||
// write magic | |||
write_magic(out, version[0], version[1]); | |||
// write header length | |||
if (version[0] == 1 && version[1] == 0) { | |||
char header_len_le16[2]; | |||
uint16_t header_len = static_cast<uint16_t>(header_dict.length() + | |||
padding.length() + 1); | |||
header_len_le16[0] = (header_len >> 0) & 0xff; | |||
header_len_le16[1] = (header_len >> 8) & 0xff; | |||
out.write(reinterpret_cast<char*>(header_len_le16), 2); | |||
} else { | |||
char header_len_le32[4]; | |||
uint32_t header_len = static_cast<uint32_t>(header_dict.length() + | |||
padding.length() + 1); | |||
header_len_le32[0] = (header_len >> 0) & 0xff; | |||
header_len_le32[1] = (header_len >> 8) & 0xff; | |||
header_len_le32[2] = (header_len >> 16) & 0xff; | |||
header_len_le32[3] = (header_len >> 24) & 0xff; | |||
out.write(reinterpret_cast<char*>(header_len_le32), 4); | |||
} | |||
out << header_dict << padding << '\n'; | |||
} | |||
inline std::string read_header(std::istream& istream) { | |||
    // check magic bytes and version number
unsigned char v_major, v_minor; | |||
read_magic(istream, v_major, v_minor); | |||
uint32_t header_length = 0; | |||
if (v_major == 1 && v_minor == 0) { | |||
char header_len_le16[2]; | |||
istream.read(header_len_le16, 2); | |||
header_length = (header_len_le16[0] << 0) | (header_len_le16[1] << 8); | |||
if ((magic_string_length + 2 + 2 + header_length) % 16 != 0) { | |||
// TODO: display warning | |||
} | |||
} else if (v_major == 2 && v_minor == 0) { | |||
char header_len_le32[4]; | |||
istream.read(header_len_le32, 4); | |||
header_length = (header_len_le32[0] << 0) | (header_len_le32[1] << 8) | | |||
(header_len_le32[2] << 16) | (header_len_le32[3] << 24); | |||
if ((magic_string_length + 2 + 4 + header_length) % 16 != 0) { | |||
// TODO: display warning | |||
} | |||
} else { | |||
fprintf(stderr, "unsupported file format version"); | |||
} | |||
auto buf_v = std::vector<char>(); | |||
    buf_v.resize(header_length);
istream.read(buf_v.data(), header_length); | |||
std::string header(buf_v.data(), header_length); | |||
return header; | |||
} | |||
inline ndarray_len_t comp_size(const std::vector<ndarray_len_t>& shape) { | |||
ndarray_len_t size = 1; | |||
for (ndarray_len_t i : shape) | |||
size *= i; | |||
return size; | |||
} | |||
template <typename Scalar> | |||
inline void SaveArrayAsNumpy(const std::string& filename, bool fortran_order, | |||
unsigned int n_dims, const unsigned long shape[], | |||
const std::vector<Scalar>& data) { | |||
Typestring typestring_o(data); | |||
std::string typestring = typestring_o.str(); | |||
std::ofstream stream(filename, std::ofstream::binary); | |||
if (!stream) { | |||
fprintf(stderr, "io error: failed to open a file."); | |||
} | |||
std::vector<ndarray_len_t> shape_v(shape, shape + n_dims); | |||
write_header(stream, typestring, fortran_order, shape_v); | |||
auto size = static_cast<size_t>(comp_size(shape_v)); | |||
stream.write(reinterpret_cast<const char*>(data.data()), | |||
sizeof(Scalar) * size); | |||
} | |||
template <typename Scalar> | |||
inline void LoadArrayFromNumpy(const std::string& filename, | |||
std::vector<unsigned long>& shape, | |||
std::vector<Scalar>& data) { | |||
bool fortran_order; | |||
LoadArrayFromNumpy<Scalar>(filename, shape, fortran_order, data); | |||
} | |||
template <typename Scalar> | |||
inline void LoadArrayFromNumpy(const std::string& filename, | |||
std::vector<unsigned long>& shape, | |||
bool& fortran_order, std::vector<Scalar>& data) { | |||
std::ifstream stream(filename, std::ifstream::binary); | |||
if (!stream) { | |||
fprintf(stderr, "io error: failed to open a file."); | |||
} | |||
std::string header = read_header(stream); | |||
// parse header | |||
std::string typestr; | |||
parse_header(header, typestr, fortran_order, shape); | |||
// check if the typestring matches the given one | |||
Typestring typestring_o{data}; | |||
std::string expect_typestr = typestring_o.str(); | |||
if (typestr != expect_typestr) { | |||
fprintf(stderr, "formatting error: typestrings not matching"); | |||
} | |||
// compute the data size based on the shape | |||
auto size = static_cast<size_t>(comp_size(shape)); | |||
data.resize(size); | |||
// read the data | |||
stream.read(reinterpret_cast<char*>(data.data()), sizeof(Scalar) * size); | |||
} | |||
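// A minimal usage sketch (not part of the original header), assuming a float
// array saved with numpy.save(); the file name "input_data.npy" is only an
// example:
//
//     std::vector<unsigned long> shape;
//     std::vector<float> data;
//     npy::LoadArrayFromNumpy("input_data.npy", shape, data);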
inline void LoadArrayFromNumpy(const std::string& filename, | |||
std::string& type_str, | |||
std::vector<ndarray_len_t>& shape, | |||
std::vector<int8_t>& data) { | |||
std::ifstream stream(filename, std::ifstream::binary); | |||
if (!stream) { | |||
fprintf(stderr, "io error: failed to open a file."); | |||
} | |||
std::string header = read_header(stream); | |||
bool fortran_order; | |||
// parse header | |||
parse_header(header, type_str, fortran_order, shape); | |||
    // extract the element size (in bytes) from the last character of the typestring
std::string size_str = type_str.substr(type_str.size() - 1); | |||
size_t elem_size = atoi(size_str.c_str()); | |||
// compute the data size based on the shape | |||
auto byte_size = elem_size * static_cast<size_t>(comp_size(shape)); | |||
data.resize(byte_size); | |||
// read the data | |||
stream.read(reinterpret_cast<char*>(data.data()), byte_size); | |||
} | |||
} // namespace npy | |||
#endif // NPY_H |
@@ -0,0 +1,97 @@ | |||
/** | |||
 * \file include/lite/common_enum_c.h
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#ifndef LITE_COMMON_ENUM_C_H_ | |||
#define LITE_COMMON_ENUM_C_H_ | |||
/*! | |||
* \brief The log level. | |||
*/ | |||
typedef enum LiteLogLevel { | |||
DEBUG = 0, /*!< The lowest level and most verbose */ | |||
    INFO = 1,  /*!< Print info, warnings and errors */
    WARN = 2,  /*!< Print only warnings and errors */
ERROR = 3, /*!< Print only errors */ | |||
} LiteLogLevel; | |||
typedef enum LiteBackend { | |||
LITE_DEFAULT = 0, //! default backend is mge | |||
} LiteBackend; | |||
typedef enum LiteDeviceType { | |||
LITE_CPU = 0, | |||
LITE_CUDA = 1, | |||
LITE_ATLAS = 3, | |||
LITE_NPU = 4, | |||
    //! when the device information is set in the model, use LITE_DEVICE_DEFAULT
    //! in lite
LITE_DEVICE_DEFAULT = 5, | |||
} LiteDeviceType; | |||
typedef enum LiteDataType { | |||
LITE_FLOAT = 0, | |||
LITE_HALF = 1, | |||
LITE_INT = 2, | |||
LITE_INT16 = 3, | |||
LITE_INT8 = 4, | |||
LITE_UINT8 = 5, | |||
LITE_UINT = 6, | |||
LITE_UINT16 = 7, | |||
LITE_INT64 = 8, | |||
} LiteCDataType; | |||
typedef enum LiteTensorPhase { | |||
    //! Tensor may be input or output
LITE_IO = 0, | |||
//! Tensor is input | |||
LITE_INPUT = 1, | |||
//! Tensor is output | |||
LITE_OUTPUT = 2, | |||
} LiteTensorPhase; | |||
/*! | |||
 * \brief the input and output type, including SHAPE and VALUE;
 * sometimes the user only needs the shape of the output tensor
*/ | |||
typedef enum LiteIOType { | |||
LITE_IO_VALUE = 0, | |||
LITE_IO_SHAPE = 1, | |||
} LiteIOType; | |||
/*! | |||
 * \brief operation algorithm selection strategy type; some operations have
 * multiple algorithms, and each algorithm has different attributes. According to
 * the strategy, the best algorithm will be selected.
* | |||
* Note: These strategies can be combined | |||
* | |||
* 1. LITE_ALGO_HEURISTIC | LITE_ALGO_PROFILE means: if profile cache not valid, | |||
* use heuristic instead | |||
* | |||
 * 2. LITE_ALGO_HEURISTIC | LITE_ALGO_REPRODUCIBLE means: heuristically choose a
 * reproducible algo
* | |||
* 3. LITE_ALGO_PROFILE | LITE_ALGO_REPRODUCIBLE means: profile the best | |||
* algorithm from the reproducible algorithms set | |||
* | |||
* 4. LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED means: profile the best | |||
 * algorithm from the optimized algorithms, thus profiling runs faster
* | |||
* 5. LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED | LITE_ALGO_REPRODUCIBLE means: | |||
 * profile the best algorithm from the optimized and reproducible algorithms
*/ | |||
typedef enum LiteAlgoSelectStrategy { | |||
LITE_ALGO_HEURISTIC = 1 << 0, | |||
LITE_ALGO_PROFILE = 1 << 1, | |||
LITE_ALGO_REPRODUCIBLE = 1 << 2, | |||
LITE_ALGO_OPTIMIZED = 1 << 3, | |||
} LiteAlgoSelectStrategy; | |||
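/*!
 * A usage sketch (not part of the original header), assuming a loaded
 * std::shared_ptr<lite::Network> named `network` and the C++ Runtime API from
 * lite/network.h:
 *
 *     using namespace lite;
 *     Runtime::set_network_algo_policy(
 *             network, LITE_ALGO_PROFILE | LITE_ALGO_REPRODUCIBLE);
 */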
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,157 @@ | |||
/** | |||
 * \file include/lite/global.h
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "macro.h" | |||
#include "network.h" | |||
#include <functional> | |||
#include <memory> | |||
#include <vector> | |||
namespace lite { | |||
/** | |||
* \brief Model decryption function | |||
* | |||
 * \param[in] const void* is the memory pointer of the model to be decrypted
 * \param[in] size_t the size of the model memory in bytes
* \param[in] const std::vector<uint8_t>& the decryption key vector | |||
*/ | |||
using DecryptionFunc = std::function<std::vector<uint8_t>( | |||
const void*, size_t, const std::vector<uint8_t>&)>; | |||
/** | |||
* \brief register a custom decryption method and key to lite. | |||
* | |||
* \param[in] decrypt_name the name of the decryption, which will act as the | |||
* hash key to find the decryption method. | |||
* | |||
* \param[in] func the decryption function, which will decrypt the model with | |||
* the registered key, return a vector that contain the decrypted model. | |||
* | |||
* \param[in] key the decryption key of the method | |||
*/ | |||
LITE_API bool register_decryption_and_key(std::string decrypt_name, | |||
const DecryptionFunc& func, | |||
const std::vector<uint8_t>& key); | |||
/** | |||
* \brief update decryption function or key of a custom decryption method. | |||
* | |||
* \param[in] decrypt_name the name of the decryption, which will act as the | |||
* hash key to find the decryption method. | |||
* | |||
* \param[in] func the decryption function, which will decrypt the model with | |||
* the registered key, return a vector that contain the decrypted model. if | |||
* function is nullptr, it will not be updated. | |||
* | |||
* \param[in] key the decryption key of the method, if the size of key is zero, | |||
* it will not be updated | |||
*/ | |||
LITE_API bool update_decryption_or_key(std::string decrypt_name, | |||
const DecryptionFunc& func, | |||
const std::vector<uint8_t>& key); | |||
/** | |||
* \brief Model information parse function | |||
* | |||
* \param[in] const void* is the information memory | |||
 * \param[in] size_t the size of the information memory
 * \param[in] const std::string the model name used to check whether the
 * information matches the model
* \param[in] Config the model config, ParseInfoFunc can fill it with the | |||
* information in json, the config will influence Network loading later | |||
* \param[in] NetworkIO the model IO, ParseInfoFunc can fill it with the | |||
* information in json, the networkio will influence Network forwarding later | |||
* \param[in] std::unordered_map<std::string, LiteAny>& isolated_config_map, the | |||
 * other config not included in config and networkIO, ParseInfoFunc can fill it
* with the information in json, now support: | |||
* "device_id" : int, default 0 | |||
* "number_threads" : size_t, default 1 | |||
* "is_inplace_model" : bool, default false | |||
* "use_tensorrt" : bool, default false | |||
*/ | |||
using ParseInfoFunc = std::function<bool( | |||
const void*, size_t, const std::string model_name, Config& config, | |||
NetworkIO& network_io, | |||
std::unordered_map<std::string, LiteAny>& isolated_config_map, | |||
std::string& extra_info)>; | |||
/** | |||
* \brief register a custom parser function to lite. | |||
* | |||
* \param[in] info_type the name of the parser function, which will act as the | |||
* hash key to find the parser method. | |||
* | |||
* \param[in] parse_func the parser function, which will parse the given | |||
* information and modify the Network Config and IO. | |||
* | |||
*/ | |||
LITE_API bool register_parse_info_func(std::string info_type, | |||
const ParseInfoFunc& parse_func); | |||
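/*!
 * A minimal sketch (not from the original source) of registering a parser; the
 * parser name "my_info_type" and the lambda body are hypothetical:
 *
 *     register_parse_info_func("my_info_type",
 *             [](const void*, size_t, const std::string, Config& config,
 *                NetworkIO&, std::unordered_map<std::string, LiteAny>&,
 *                std::string&) -> bool {
 *                 config.device_type = LiteDeviceType::LITE_CPU;
 *                 return true;
 *             });
 */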
/*! \brief Get version | |||
*/ | |||
LITE_API void get_version(int& major, int& minor, int& patch); | |||
/*! \brief Set the current log level. | |||
* \param[in] level The new log level | |||
*/ | |||
LITE_API void set_log_level(LiteLogLevel level); | |||
/*! \brief Get the current log level. | |||
* \return The current log level | |||
*/ | |||
LITE_API LiteLogLevel get_log_level(); | |||
/*! \brief Get device count | |||
* \param[in] device_type device type | |||
* \return the device count | |||
*/ | |||
LITE_API size_t get_device_count(LiteDeviceType device_type); | |||
/*! \brief try to coalesce all free memory in MegEngine
*/ | |||
LITE_API void try_coalesce_all_free_memory(); | |||
/*! | |||
 * \brief Set the loader library to lite
 * \param loader_path is the file path of the loader library
*/ | |||
LITE_API void set_loader_lib_path(const std::string& loader_path); | |||
/*! | |||
* \brief Set the algo policy cache file for CPU/CUDA ... | |||
* \param cache_path is the file path which store the cache | |||
* \param always_sync sync the cache when model run | |||
*/ | |||
LITE_API void set_persistent_cache(const std::string& cache_path, | |||
bool always_sync = false); | |||
/*! | |||
 * \brief dump the PersistentCache policy cache to file; if the network is set
 * to profile when forwarding, the algo policy will be dumped to this file
*/ | |||
LITE_API void dump_persistent_cache(const std::string& cache_path); | |||
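/*!
 * A possible workflow sketch (an assumption, not from the original source):
 * load the cache at startup, profile during forward, then persist the result;
 * the path "algo_policy.cache" is only an example.
 *
 *     lite::set_persistent_cache("algo_policy.cache");
 *     // ... create the network, set LITE_ALGO_PROFILE, forward ...
 *     lite::dump_persistent_cache("algo_policy.cache");
 */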
/*! | |||
* \brief Set the TensorRT engine cache path for serialized prebuilt ICudaEngine | |||
*/ | |||
LITE_API void set_tensor_rt_cache(std::string tensorrt_cache_path); | |||
/*! | |||
* \brief dump the TensorRT cache to the file set in set_tensor_rt_cache | |||
*/ | |||
LITE_API void dump_tensor_rt_cache(); | |||
} // namespace lite | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,20 @@ | |||
/** | |||
* \file include/lite/macro.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#ifndef LITE_MACRO_H_ | |||
#define LITE_MACRO_H_ | |||
#if defined(_WIN32) | |||
#define LITE_API __declspec(dllexport) | |||
#else | |||
#define LITE_API __attribute__((visibility("default"))) | |||
#endif | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,368 @@ | |||
/** | |||
 * \file include/lite/network.h
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "macro.h" | |||
#include "tensor.h" | |||
#include <functional> | |||
#include <memory> | |||
#include <mutex> | |||
#include <string> | |||
#include <unordered_map> | |||
namespace lite { | |||
LITE_API inline LiteAlgoSelectStrategy operator|(LiteAlgoSelectStrategy x, | |||
LiteAlgoSelectStrategy y) { | |||
return static_cast<LiteAlgoSelectStrategy>(static_cast<uint32_t>(x) | | |||
static_cast<uint32_t>(y)); | |||
} | |||
/*! | |||
 * \brief the inference options which will be translated to MegEngine
* | |||
 * \param weight_preprocess is the option which optimizes inference performance
 * by preprocessing the const weights
* | |||
 * \param fuse_preprocess fuse preprocess pattern, like astype + pad_channel +
* dimshuffle | |||
* | |||
* \param fake_next_exec whether only to perform non-computing tasks (like | |||
* memory allocation and queue initialization) for next exec. This would be | |||
* reset to false when the graph is executed. | |||
* | |||
* \param var_sanity_check_first_run Disable var sanity check on the first run. | |||
* Var sanity check is enabled on the first-time execution by default, and can | |||
* be used to find some potential memory access errors in the operator | |||
* implementation. | |||
* | |||
* \param const_shape This can be used to reduce memory usage since some | |||
* static inference data structures can be omitted. | |||
* | |||
* \param force_dynamic_alloc force dynamic memory alloc for all vars | |||
* | |||
* \param force_output_dynamic_alloc force dynamic memory alloc for output vars | |||
* which are used as CallbackCaller input when call compile() function | |||
* | |||
* \param no_profiling_on_shape_change do not re-profile to select best impl | |||
* algo when input shape changes (use previous algo) | |||
* | |||
* \param jit_level Execute supported operators with JIT (support MLIR, | |||
* NVRTC). Can only be used on Nvidia GPUs, this value indicates JIT level: | |||
* 1 for basic elemwise opr; | |||
* 2 for including reduce operator | |||
* | |||
 * \param record_level flag to optimize the inference performance by recording
 * the kernel tasks in the first run; afterwards, the inference only executes the
* recorded tasks. | |||
* level = 0 means the normal inference, | |||
* level = 1 means use record inference, | |||
* level = 2 means record inference with free the extra memory | |||
* | |||
* \param graph_opt_level optimization level: | |||
* 0: disable | |||
* 1: level-1: inplace arith transformations during graph | |||
* construction | |||
* 2: level-2: level-1, plus global optimization before graph | |||
* compiling | |||
* 3: also enable JIT | |||
* <0: corresponding level, with result check for debug | |||
* | |||
* \param async_exec_level exec: dispatch on separate threads for different | |||
* comp_node. | |||
* 0: do not perform async dispatch | |||
* 1: dispatch async if there are more than one comp node with limited queue | |||
 * mask 0b10: async if there are multiple comp nodes with unlimited queue
* mask 0b100: always async | |||
*/ | |||
struct LITE_API Options { | |||
bool weight_preprocess = false; | |||
bool fuse_preprocess = false; | |||
bool fake_next_exec = false; | |||
bool var_sanity_check_first_run = true; | |||
bool const_shape = false; | |||
bool force_dynamic_alloc = false; | |||
bool force_output_dynamic_alloc = false; | |||
bool no_profiling_on_shape_change = false; | |||
uint8_t jit_level = 0; | |||
uint8_t comp_node_seq_record_level = 0; | |||
uint8_t graph_opt_level = 2; | |||
uint16_t async_exec_level = 1; | |||
//! layout transform options | |||
bool enable_nchw44 = false; | |||
bool enable_nchw44_dot = false; | |||
bool enable_nchw88 = false; | |||
bool enable_nhwcd4 = false; | |||
bool enable_nchw4 = false; | |||
bool enable_nchw32 = false; | |||
bool enable_nchw64 = false; | |||
}; | |||
/*! | |||
* \brief Configuration when load and compile the graph | |||
* | |||
 * \param bare_model_cryption_name is the bare model cryption method name; a
 * bare model does not pack json info inside
 *
 * \param has_compression flag whether the model is compressed; the compression
 * method will be read from the model
*/ | |||
struct LITE_API Config { | |||
bool has_compression = false; | |||
int device_id = 0; | |||
LiteDeviceType device_type = LiteDeviceType::LITE_CPU; | |||
LiteBackend backend = LiteBackend::LITE_DEFAULT; | |||
std::string bare_model_cryption_name = {}; | |||
Options options = {}; | |||
}; | |||
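/*!
 * A minimal configuration sketch (not from the original source), showing how a
 * Config with a few Options is passed to a Network; the values and the path
 * "model_path.mge" are only examples.
 *
 *     lite::Config config;
 *     config.device_type = LiteDeviceType::LITE_CPU;
 *     config.options.weight_preprocess = true;
 *     config.options.var_sanity_check_first_run = false;
 *     auto network = std::make_shared<lite::Network>(config);
 *     network->load_model("model_path.mge");
 */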
/*! | |||
* \brief config the network input and output item | |||
* | |||
*/ | |||
struct LITE_API IO { | |||
//! the tensor name in the graph corresponding to the IO | |||
std::string name; | |||
    //! Used to mark where the input tensor comes from and where the output is
    //! copied to; if is_host is true, the input is from host and the output is
    //! copied to host, otherwise device. Sometimes the input is from device and
    //! the output does not need to be copied to host; default is true.
bool is_host = true; | |||
    //! The IO type, it can be SHAPE or VALUE; when SHAPE is set, the input or
    //! output tensor value is invalid, only the shape will be set, default is VALUE
LiteIOType io_type = LiteIOType::LITE_IO_VALUE; | |||
    //! The layout of the config from the user. If another layout is set before
    //! forward, or set after forward by resetting the input tensor, this layout
    //! will be bypassed; if no other layout is set before forward, this layout
    //! will take effect; if this layout is not set, the model will forward with
    //! its origin layout. For outputs, it will be used for checking.
Layout config_layout = {}; | |||
}; | |||
/*! | |||
* \brief the input and output information when load the network | |||
* the NetworkIO will remain in the network until the network is destroyed | |||
*/ | |||
struct LITE_API NetworkIO { | |||
std::vector<IO> inputs = {}; | |||
std::vector<IO> outputs = {}; | |||
}; | |||
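/*!
 * A minimal sketch (not from the original source) of describing a device-side
 * input when loading; the tensor name "data" is only an example.
 *
 *     lite::IO device_input;
 *     device_input.name = "data";
 *     device_input.is_host = false;
 *     lite::NetworkIO network_io;
 *     network_io.inputs.push_back(device_input);
 *     auto network = std::make_shared<lite::Network>(lite::Config{}, network_io);
 */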
/*! | |||
* \brief A user-implemented allocator interface | |||
*/ | |||
class LITE_API Allocator { | |||
public: | |||
virtual ~Allocator() = default; | |||
//! allocate memory of size in the given device with the given align | |||
virtual void* allocate(LiteDeviceType device_type, int device_id, | |||
size_t size, size_t align) = 0; | |||
//! free the memory pointed by ptr in the given device | |||
virtual void free(LiteDeviceType device_type, int device_id, void* ptr) = 0; | |||
}; | |||
/*! | |||
 * \brief the thread affinity callback type
 * \param thread_id thread_id is a number beginning from 0 to (nr_threads - 1);
* thread_id of (nr_threads - 1) is the main worker thread. | |||
*/ | |||
using ThreadAffinityCallback = std::function<void(int thread_id)>; | |||
using AsyncCallback = std::function<void(void)>; | |||
/*! | |||
* \brief the start/finish callback function | |||
 * \param unordered_map map from the io tensor name to the pair of the
 * corresponding IO of the user config and the real input or output tensor.
*/ | |||
using StartCallback = std::function<void( | |||
const std::unordered_map<std::string, | |||
std::pair<IO, std::shared_ptr<Tensor>>>&)>; | |||
using FinishCallback = std::function<void( | |||
const std::unordered_map<std::string, | |||
std::pair<IO, std::shared_ptr<Tensor>>>&)>; | |||
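/*!
 * A minimal sketch (not from the original source) of a StartCallback that
 * prints the configured io tensor names before each forward:
 *
 *     network->set_start_callback(
 *             [](const std::unordered_map<std::string,
 *                        std::pair<lite::IO, std::shared_ptr<lite::Tensor>>>& io_map) {
 *                 for (auto&& item : io_map)
 *                     printf("io tensor: %s\n", item.first.c_str());
 *             });
 */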
/*! | |||
 * \brief The network is constructed from a model; it implements model load,
 * init, forward, and displays some model information
*/ | |||
class LITE_API Network { | |||
public: | |||
class NetworkImplBase; | |||
~Network(); | |||
Network(const Config& config = {}, const NetworkIO& networkio = {}); | |||
Network(const NetworkIO& networkio, const Config& config = {}); | |||
    //! load the model from memory
void load_model(void* model_mem, size_t size); | |||
//! load the model from a model path | |||
void load_model(std::string model_path); | |||
    //! only compute the output tensors configured by the user
void compute_only_configured_output(); | |||
//! get the network input and output tensor, the layout of which is | |||
    //! synced from the mge tensor; when the names of an input and output tensor are the
//! same, use LiteTensorPhase to separate | |||
std::shared_ptr<Tensor> get_io_tensor( | |||
std::string io_name, | |||
LiteTensorPhase phase = LiteTensorPhase::LITE_IO); | |||
//! get the network input by index | |||
std::shared_ptr<Tensor> get_input_tensor(size_t index); | |||
//! get the network output tensor by index | |||
std::shared_ptr<Tensor> get_output_tensor(size_t index); | |||
//! set the network forward in async mode and set the async callback | |||
//! function | |||
Network& set_async_callback(const AsyncCallback& async_callback); | |||
    //! set the start forward callback function, which will be executed before
//! forward. this can be used to check network input or dump model inputs | |||
//! for debug | |||
Network& set_start_callback(const StartCallback& start_callback); | |||
    //! set the finish forward callback function, which will be executed after
//! forward. this can be used to dump model outputs for debug | |||
Network& set_finish_callback(const FinishCallback& finish_callback); | |||
//! forward the network with filled input data and fill the output data | |||
//! to the output tensor | |||
void forward(); | |||
    //! wait until forward finishes in sync mode
void wait(); | |||
//! get the input tensor name in the order in load return | |||
std::string get_input_name(size_t index) const; | |||
//! get the output tensor name in the order in load return | |||
std::string get_output_name(size_t index) const; | |||
//! get all the input tensor name in the order in load return | |||
std::vector<std::string> get_all_input_name() const; | |||
//! get all the output tensor name in the order in load return | |||
std::vector<std::string> get_all_output_name() const; | |||
//! set/get device id, default device id = 0 | |||
Network& set_device_id(int device_id); | |||
int get_device_id() const; | |||
//! set/get stream id, default stream id = 0 | |||
Network& set_stream_id(int stream_id); | |||
int get_stream_id() const; | |||
    //! enable profiling the network, a profile file will be generated to profile_file_path
void enable_profile_performance(std::string profile_file_path); | |||
//! get model extra info | |||
const std::string& get_model_extra_info(); | |||
//! get device type | |||
LiteDeviceType get_device_type() const; | |||
public: | |||
friend class NetworkHelper; | |||
private: | |||
//! update members from the implementation
void update_from_implement(); | |||
//! decrypt and parse the model file | |||
void prase_model(std::shared_ptr<void> model_data, size_t size); | |||
private: | |||
bool m_loaded = false; | |||
Config m_config; | |||
NetworkIO m_network_io; | |||
std::unique_ptr<NetworkImplBase> m_impl; | |||
std::string m_extra_info; | |||
}; | |||
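/*!
 * \brief Usage sketch: a minimal synchronous inference flow. Illustrative
 * only; the model path "./shufflenet.mge" and the float input data are
 * assumptions, and error handling is omitted.
 *
 * \code
 * #include "lite/network.h"
 * #include <vector>
 *
 * void run_once() {
 *     lite::Network network;                     // default Config and NetworkIO
 *     network.load_model("./shufflenet.mge");    // load from a file path
 *     auto input = network.get_input_tensor(0);  // first input in load order
 *     std::vector<float> data(
 *             input->get_tensor_total_size_in_byte() / sizeof(float), 0.f);
 *     input->reset(data.data(), input->get_layout());  // user-managed memory
 *     network.forward();                         // fill the output tensors
 *     network.wait();                            // block until forward finishes
 *     auto output = network.get_output_tensor(0);
 *     const float* out_ptr =
 *             static_cast<const float*>(output->get_memory_ptr());
 *     (void)out_ptr;  // read the results from out_ptr
 * }
 * \endcode
 */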
/*********************** MGE special network function ***************/ | |||
class LITE_API Runtime { | |||
public: | |||
//! When the device is CPU, this interface sets the to-be-loaded model to
//! run in multi-thread mode with the given thread number.
static void set_cpu_threads_number(std::shared_ptr<Network> dst_network, | |||
size_t nr_threads); | |||
static size_t get_cpu_threads_number(std::shared_ptr<Network> dst_network); | |||
//! set the thread affinity callback
static void set_runtime_thread_affinity( | |||
std::shared_ptr<Network> network, | |||
const ThreadAffinityCallback& thread_affinity_callback); | |||
//! Set cpu inplace mode when the device is CPU; on some low-computation or
//! single-core devices this mode gives good performance
static void set_cpu_inplace_mode(std::shared_ptr<Network> dst_network); | |||
static bool is_cpu_inplace_mode(std::shared_ptr<Network> dst_network); | |||
//! Set the network to forward with TensorRT
static void use_tensorrt(std::shared_ptr<Network> dst_network); | |||
//! set the opr algorithm selection strategy in the network
//! shared_batch_size: the batch size used by fastrun; a non-zero value means
//! fastrun uses this batch size regardless of the batch size of the model,
//! zero means fastrun uses the batch size of the model
//! binary_equal_between_batch: if the content of each input batch is binary
//! equal, whether the content of each output batch is promised to be equal
static void set_network_algo_policy( | |||
std::shared_ptr<Network> dst_network, | |||
LiteAlgoSelectStrategy strategy, uint32_t shared_batch_size = 0, | |||
bool binary_equal_between_batch = false); | |||
//! set workspace_limit for oprs with multiple algorithms; setting a
//! workspace limit can save memory but may influence performance
static void set_network_algo_workspace_limit( | |||
std::shared_ptr<Network> dst_network, size_t workspace_limit); | |||
//! set the network memory allocator, the allocator is defined by the user
static void set_memory_allocator(std::shared_ptr<Network> dst_network, | |||
std::shared_ptr<Allocator> user_allocator); | |||
//! share the runtime memory with another network; the weights are not shared
static void share_runtime_memory_with(std::shared_ptr<Network> dst_network, | |||
std::shared_ptr<Network> src_network); | |||
//! Dump input/output values of all internal variables to output | |||
//! file, in txt format | |||
static void enable_io_txt_dump(std::shared_ptr<Network> dst_network, | |||
std::string io_txt_out_file); | |||
//! Dump input/output values of all internal variables to output | |||
//! directory, in binary format | |||
static void enable_io_bin_dump(std::shared_ptr<Network> dst_network, | |||
std::string io_bin_out_dir); | |||
//! load a new network which will share weights with src network | |||
static void shared_weight_with_network( | |||
std::shared_ptr<Network> dst_network, | |||
const std::shared_ptr<Network> src_network); | |||
}; | |||
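/*!
 * \brief Usage sketch: tuning a CPU network through Runtime. Illustrative
 * only; the thread number and the model path are assumptions. The thread
 * number is configured before load_model, since it applies to the model to
 * be loaded.
 *
 * \code
 * auto network = std::make_shared<lite::Network>();
 * lite::Runtime::set_cpu_threads_number(network, 4);  // before load_model
 * network->load_model("./shufflenet.mge");
 * size_t nr = lite::Runtime::get_cpu_threads_number(network);  // nr == 4
 * \endcode
 */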
} // namespace lite | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,224 @@ | |||
/** | |||
* \file include/lite/tensor.h
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "common_enum_c.h" | |||
#include "macro.h" | |||
#include <memory> | |||
#include <unordered_map> | |||
#include <vector> | |||
namespace lite { | |||
/*! | |||
* \brief the simple layout description | |||
*/ | |||
struct LITE_API Layout { | |||
static constexpr uint32_t MAXDIM = 7; | |||
size_t shapes[MAXDIM]; | |||
size_t ndim = 0; | |||
LiteDataType data_type = LiteDataType::LITE_FLOAT; | |||
//! get the total byte of a layout | |||
size_t get_elem_size() const; | |||
//! compare whether the two layouts are equal
bool operator==(const Layout& other) const; | |||
}; | |||
/*! | |||
* \brief wrapper of the MegEngine Tensor
*
* The memory is not allocated directly; when get_memory_ptr() is called, the
* memory will be allocated in the tensor implementation and deleted
* automatically
*
* Note: if the tensor memory is set through the reset() interface, the memory
* is managed by the user and will not be freed by the tensor
*
* If the device or layout is not set, when copying from another source tensor,
* its device and layout will be copied from the source tensor
*
* if is_pinned_host is set, the storage memory of the tensor is pinned memory;
* this is used to optimize the H2D or D2H memory copy. If the device or layout
* is not set, when copying from another device (CUDA) tensor, this tensor
* will be automatically set to a pinned tensor
*/ | |||
class LITE_API Tensor { | |||
class TensorImpl; | |||
public: | |||
class TensorImplBase; | |||
Tensor(); | |||
Tensor(LiteDeviceType device_type, bool is_pinned_host = false); | |||
Tensor(LiteDeviceType device_type, const Layout& layout, | |||
bool is_pinned_host = false); | |||
Tensor(int device_id, LiteDeviceType device_type, const Layout& layout = {}, | |||
bool is_pinned_host = false); | |||
Tensor(int device_id, int stream_id, LiteDeviceType device_type, | |||
bool is_pinned_host = false); | |||
Tensor(LiteBackend backend, | |||
LiteDeviceType device_type = LiteDeviceType::LITE_CPU, | |||
int device_id = 0, const Layout& layout = {}, | |||
bool is_pinned_host = false); | |||
~Tensor(); | |||
LiteDeviceType get_device_type() const { return m_device_type; }; | |||
int get_device_id() const { return m_device_id; }; | |||
Layout get_layout() const { return m_layout; }; | |||
bool is_pinned_host() const { return m_is_pinned_host; }; | |||
//! set_layout will change the layout and reallocate the memory of the tensor
void set_layout(const Layout& layout); | |||
//! get the memory pointer, which will trigger memory allocation in the
//! tensor implementation
void* get_memory_ptr() const; | |||
//! get the memory pointer at the offset described by idx
void* get_memory_ptr(const std::vector<size_t>& idx) const; | |||
//! get the tensor capacity in byte | |||
size_t get_tensor_total_size_in_byte() const; | |||
//! use the user-allocated data to reset the memory of the tensor; the
//! memory will not be managed by lite, so the user should free it later
void reset(void* prepared_data, size_t data_length_in_byte); | |||
//! use the user-allocated data and the corresponding layout to reset the
//! data and layout of the tensor; the memory will not be managed by lite, so
//! the user should free it later
void reset(void* prepared_data, const Layout& layout); | |||
//! reshape the tensor with new shape, keep the data_type the same | |||
void reshape(const std::vector<int>& shape); | |||
//! get a new tensor slice from the origin tensor | |||
std::shared_ptr<Tensor> slice(const std::vector<size_t>& start, | |||
const std::vector<size_t>& end, | |||
const std::vector<size_t>& step = {}); | |||
//! set the tensor memory with zero | |||
void fill_zero(); | |||
//! copy tensor from another tensor
//! Note: the best way to copy a tensor is to just set the dst device and
//! leave the layout empty; when copying, the dst layout will be set to the
//! same as the src
void copy_from(const Tensor& src); | |||
//! share memory with other tensor | |||
void share_memory_with(const Tensor& src_tensor); | |||
//! whether the memory of the tensor is contiguous
bool is_continue_memory() const; | |||
//! update the members from the implementation
void update_from_implement(); | |||
public: | |||
friend class TensorHelper; | |||
private: | |||
std::shared_ptr<TensorImplBase> m_tensor_impl; | |||
//! flag whether the storage of the tensor is pinned; this is only used
//! when the comp node is not CPU
bool m_is_pinned_host = false; | |||
int m_device_id = 0; | |||
Layout m_layout; | |||
//! the device of the tensor should not be changed after the tensor has
//! been constructed
LiteDeviceType m_device_type = LiteDeviceType::LITE_CPU; | |||
}; | |||
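/*!
 * \brief Usage sketch: create a host tensor and copy it to a device tensor.
 * Illustrative only; the shape values and the LITE_CUDA enum value are
 * assumptions.
 *
 * \code
 * lite::Layout layout;
 * layout.ndim = 4;
 * layout.shapes[0] = 1;
 * layout.shapes[1] = 3;
 * layout.shapes[2] = 224;
 * layout.shapes[3] = 224;
 * layout.data_type = LiteDataType::LITE_FLOAT;
 *
 * lite::Tensor host_tensor(LiteDeviceType::LITE_CPU, layout);
 * float* ptr = static_cast<float*>(host_tensor.get_memory_ptr());  // allocates
 * // ... fill ptr with input data ...
 *
 * lite::Tensor device_tensor(LiteDeviceType::LITE_CUDA);  // layout left empty
 * device_tensor.copy_from(host_tensor);  // layout is copied from the source
 * \endcode
 */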
/** | |||
* \brief a class that can hold data of any type, but it does not check whether
* the visited type is valid
*/ | |||
class LITE_API LiteAny { | |||
public: | |||
LiteAny() = default; | |||
template <class T> | |||
LiteAny(T value) : m_holder(new AnyHolder<T>(value)) { | |||
m_is_string = std::is_same<std::string, T>(); | |||
} | |||
LiteAny(const LiteAny& any) { | |||
m_holder = any.m_holder->clone(); | |||
m_is_string = any.is_string(); | |||
} | |||
LiteAny& operator=(const LiteAny& any) { | |||
m_holder = any.m_holder->clone(); | |||
m_is_string = any.is_string(); | |||
return *this; | |||
} | |||
bool is_string() const { return m_is_string; } | |||
class HolderBase { | |||
public: | |||
virtual ~HolderBase() = default; | |||
virtual std::shared_ptr<HolderBase> clone() = 0; | |||
virtual size_t type_length() const = 0; | |||
}; | |||
template<class T> | |||
class AnyHolder : public HolderBase { | |||
public: | |||
AnyHolder(const T value) : | |||
m_value(value) { | |||
} | |||
virtual std::shared_ptr<HolderBase> clone() override { | |||
return std::make_shared<AnyHolder>(m_value); | |||
} | |||
virtual size_t type_length() const override { return sizeof(T); } | |||
public: | |||
T m_value; | |||
}; | |||
//! if the type mismatches, it will throw
void type_missmatch(size_t expect, size_t get) const; | |||
//! only checks the length of the stored type against the visited type, so it
//! is not safe
template <class T> | |||
T unsafe_cast() const { | |||
if (sizeof(T) != m_holder->type_length()) { | |||
type_missmatch(m_holder->type_length(), sizeof(T)); | |||
} | |||
return static_cast<LiteAny::AnyHolder<T>*>(m_holder.get())->m_value; | |||
} | |||
//! only checks the length of the stored type against the visited type, so it
//! is not safe
void* cast_void_ptr() const { | |||
return &static_cast<LiteAny::AnyHolder<char>*>(m_holder.get())->m_value; | |||
} | |||
private: | |||
std::shared_ptr<HolderBase> m_holder; | |||
bool m_is_string = false; | |||
}; | |||
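/*!
 * \brief Usage sketch of LiteAny (illustrative only): store a value and read
 * it back with a type of the same length.
 *
 * \code
 * lite::LiteAny any_value(static_cast<size_t>(4));       // holds a size_t
 * size_t nr_threads = any_value.unsafe_cast<size_t>();   // same length required
 * \endcode
 */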
/*********************** special tensor function ***************/ | |||
class LITE_API TensorUtils { | |||
public: | |||
//! concat all the input tensors into one along the specified dim; the result
//! tensor resides on dst_device_id of dst_device; if dst_device is
//! LITE_DEVICE_DEFAULT, the device will be taken from the first tensor
static std::shared_ptr<Tensor> concat( | |||
const std::vector<Tensor>& tensors, int dim, | |||
LiteDeviceType dst_device = LiteDeviceType::LITE_DEVICE_DEFAULT, | |||
int dst_device_id = -1); | |||
}; | |||
} // namespace lite | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,169 @@ | |||
/** | |||
* \file lite-c/include/lite-c/global-c.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#ifndef LITE_C_GLOBAL_H_ | |||
#define LITE_C_GLOBAL_H_ | |||
#include "macro.h" | |||
#include "network_c.h" | |||
#ifdef __cplusplus | |||
extern "C" { | |||
#endif | |||
/*! \brief Get version | |||
*/ | |||
LITE_API int LITE_get_version(int* major, int* minor, int* patch); | |||
/*! \brief Get the last error message. | |||
* \return the message pointer | |||
*/ | |||
LITE_API const char* LITE_get_last_error(); | |||
/*! \brief Get the device count of the given device type
* \param[in] device_type device type
* \param[out] count The returned device count
*/
LITE_API int LITE_get_device_count(LiteDeviceType device_type, size_t* count); | |||
/*! \brief try to coalesce all free memory in megengine
*/ | |||
LITE_API int LITE_try_coalesce_all_free_memory(); | |||
/** | |||
* \brief Model decryption function | |||
* | |||
* \param[in] input_data the memory pointer of the model data to be decrypted
* \param[in] input_size the size of the model data in bytes
* \param[in] key_data decryption key data
* \param[in] key_size the size of the decryption key data
* \param[out] output_data the buffer for the decrypted data; if output_data is
* nullptr, only the length of the decrypted data is queried, otherwise the
* decrypted data is written to output_data
* \return the size of the decrypted data
*/ | |||
typedef size_t (*LiteDecryptionFunc)(const void* input_data, size_t input_size, | |||
const uint8_t* key_data, size_t key_size, | |||
const void* output_data); | |||
/** | |||
* \brief Model information parse function | |||
* | |||
* \param[in] info_data is the information memory
* \param[in] info_size the size of the information memory
* \param[in] model_name the model name used to check whether the
* information matches the model
* \param[in] config the model config, ParseInfoFunc can fill it with the
* information in json, the config will influence Network loading later
* \param[in] network_io the model IO, ParseInfoFunc can fill it with the
* information in json, the network_io will influence Network forwarding later
* \param[in] device_id the address to store device_id, default 0
* \param[in] nr_threads the address to store nr_threads, default 1
* \param[in] is_cpu_inplace_mode the address to store is_cpu_inplace_mode,
* default false
* \param[in] use_tensorrt the address to store use_tensorrt, default false
*/ | |||
typedef int (*LiteParseInfoFunc)(const void* info_data, size_t info_size, | |||
const char* model_name, LiteConfig* config, | |||
LiteNetworkIO* network_io, int* device_id, | |||
size_t* nr_threads, int* is_cpu_inplace_mode, | |||
int* use_tensorrt); | |||
/** | |||
* \brief register a custom decryption method and key to lite. | |||
* | |||
* \param[in] decrypt_name the name of the decryption, which will act as the | |||
* hash key to find the decryption method. | |||
* | |||
* \param[in] func the decryption function, which will decrypt the model with
* the registered key and return the decrypted model.
* \param[in] key_data the decryption key of the method | |||
* \param[in] key_size the size of decryption key | |||
*/ | |||
LITE_API int LITE_register_decryption_and_key(const char* decrypt_name, | |||
const LiteDecryptionFunc func, | |||
const uint8_t* key_data, | |||
size_t key_size); | |||
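/**
 * \brief Example: register a simple XOR "decryption" method. Illustrative
 * sketch only; the method name "xor_demo" and the key bytes are assumptions.
 * output_data is declared const in the typedef but points to a writable
 * buffer allocated by lite, so the cast below is expected.
 *
 * \code
 * static size_t xor_decrypt(const void* input, size_t size, const uint8_t* key,
 *                           size_t key_size, const void* output) {
 *     if (output) {
 *         uint8_t* dst = (uint8_t*)output;
 *         const uint8_t* src = (const uint8_t*)input;
 *         for (size_t i = 0; i < size; ++i)
 *             dst[i] = src[i] ^ key[i % key_size];
 *     }
 *     return size;  // the decrypted size equals the input size here
 * }
 *
 * const uint8_t key[] = {0x12, 0x34, 0x56, 0x78};
 * LITE_register_decryption_and_key("xor_demo", xor_decrypt, key, sizeof(key));
 * \endcode
 */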
/** | |||
* \brief update decryption function or key of a custom decryption method. | |||
* | |||
* \param[in] decrypt_name the name of the decryption, which will act as the | |||
* hash key to find the decryption method. | |||
* | |||
* \param[in] func the decryption function, which will decrypt the model with
* the registered key and return the decrypted model. If func is nullptr, it
* will not be updated.
* | |||
* \param[in] key_data the decryption key of the method; if key_size is zero,
* the key will not be updated
*/ | |||
LITE_API int LITE_update_decryption_or_key(const char* decrypt_name, | |||
const LiteDecryptionFunc func, | |||
const uint8_t* key_data, | |||
size_t key_size); | |||
/** | |||
* \brief register a custom parser function to lite. | |||
* | |||
* \param[in] info_type the name of the parser function, which will act as the | |||
* hash key to find the parser method. | |||
* | |||
* \param[in] parse_func the parser function, which will parse the given | |||
* information and modify the Network Config and IO. | |||
* | |||
*/ | |||
LITE_API int LITE_register_parse_info_func(const char* info_type, | |||
const LiteParseInfoFunc parse_func); | |||
/*! | |||
* \brief Set the loader library to lite
* \param[in] loader_path the file path of the loader library
*/ | |||
LITE_API int LITE_set_loader_lib_path(const char* loader_path); | |||
/*! | |||
* \brief Set the algo policy cache file for CPU/CUDA ... | |||
* \param[in] cache_path is the file path which stores the cache
* \param[in] always_sync whether to sync the cache to the file whenever the
* cache is updated
*/ | |||
LITE_API int LITE_set_persistent_cache(const char* cache_path, int always_sync); | |||
/*! | |||
* \brief Set the TensorRT cache file
* \param[in] cache_path is the file path which stores the cache
*/ | |||
LITE_API int LITE_set_tensor_rt_cache(const char* cache_path); | |||
/*! \brief Set the current log level. | |||
* \param[in] level The new log level | |||
*/ | |||
LITE_API int LITE_set_log_level(LiteLogLevel level); | |||
/*! \brief Get the current log level. | |||
* \param[in] level The pointer to log level | |||
*/ | |||
LITE_API int LITE_get_log_level(LiteLogLevel* level); | |||
/*! | |||
* \brief dump the algo policy cache to file; if the network is set to profile
* when forwarding, the algo policy will be dumped to this file
* \param[in] cache_path is the file path which stores the cache
*/ | |||
LITE_API int LITE_dump_persistent_cache(const char* cache_path); | |||
/*! | |||
* \brief dump the tensorrt policy cache to file | |||
*/ | |||
LITE_API int LITE_dump_tensor_rt_cache(); | |||
#ifdef __cplusplus
}
#endif
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,525 @@ | |||
/** | |||
* \file lite-c/include/lite-c/network_c.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#ifndef LITE_C_NETWORK_H_ | |||
#define LITE_C_NETWORK_H_ | |||
#include "tensor_c.h" | |||
#ifdef __cplusplus | |||
extern "C" { | |||
#endif | |||
/*! | |||
* \brief the inference options which will be translated to megengine
* | |||
* \param weight_preprocess is the option which optimizes the inference
* performance by preprocessing the const weights
*
* \param fuse_preprocess fuse preprocess pattern, like astype + pad_channel +
* dimshuffle
* | |||
* \param fake_next_exec whether only to perform non-computing tasks (like | |||
* memory allocation and queue initialization) for next exec. This would be | |||
* reset to false when the graph is executed. | |||
* | |||
* \param var_sanity_check_first_run Disable var sanity check on the first run. | |||
* Var sanity check is enabled on the first-time execution by default, and can | |||
* be used to find some potential memory access errors in the operator | |||
* implementation. | |||
* | |||
* \param const_shape This can be used to reduce memory usage since some | |||
* static inference data structures can be omitted. | |||
* | |||
* \param force_dynamic_alloc force dynamic memory alloc for all vars | |||
* | |||
* \param force_output_dynamic_alloc force dynamic memory alloc for output vars | |||
* which are used as CallbackCaller input when call compile() function | |||
* | |||
* \param no_profiling_on_shape_change do not re-profile to select best impl | |||
* algo when input shape changes (use previous algo) | |||
* | |||
* \param jit_level Execute supported operators with JIT (support MLIR, | |||
* NVRTC). Can only be used on Nvidia GPUs, this value indicates JIT level: | |||
* 1 for basic elemwise opr; | |||
* 2 for including reduce operator | |||
* | |||
* \param comp_node_seq_record_level flag to optimize the inference performance
* by recording the kernel tasks in the first run; afterwards the inference
* only needs to execute the recorded tasks.
* level = 0 means normal inference,
* level = 1 means use record inference,
* level = 2 means record inference with freeing the extra memory
* | |||
* \param graph_opt_level optimization level: | |||
* 0: disable | |||
* 1: level-1: inplace arith transformations during graph | |||
* construction | |||
* 2: level-2: level-1, plus global optimization before graph | |||
* compiling | |||
* 3: also enable JIT | |||
* <0: corresponding level, with result check for debug | |||
* | |||
* \param async_exec_level level of async dispatch on separate threads for
* different comp_node.
* 0: do not perform async dispatch
* 1: dispatch async if there are more than one comp node with limited queue
* mask 0b10: async if there are multiple comp nodes with unlimited queue
* mask 0b100: always async
*/ | |||
typedef struct Options { | |||
int weight_preprocess; | |||
int fuse_preprocess; | |||
int fake_next_exec; | |||
int var_sanity_check_first_run; | |||
int const_shape; | |||
int force_dynamic_alloc; | |||
int force_output_dynamic_alloc; | |||
int no_profiling_on_shape_change; | |||
int jit_level; | |||
int comp_node_seq_record_level; | |||
int graph_opt_level; | |||
int async_exec_level; | |||
//! layout transform options | |||
int enable_nchw44; | |||
int enable_nchw44_dot; | |||
int enable_nchw88; | |||
int enable_nhwcd4; | |||
int enable_nchw4; | |||
int enable_nchw32; | |||
int enable_nchw64; | |||
} LiteOptions; | |||
//! define a default Options | |||
extern LITE_API const LiteOptions default_option; | |||
/*! | |||
* \brief Configuration when load and compile the graph | |||
* | |||
* \param bare_model_cryption_name is the bare model cryption method name; a
* bare model is one that does not pack json info inside
*
* \param has_compression flag whether the model is compressed; the compression
* method will be read from the model
*/ | |||
typedef struct LiteConfig { | |||
int has_compression; | |||
int device_id; | |||
LiteDeviceType device_type; | |||
LiteBackend backend; | |||
const char* bare_model_cryption_name; | |||
LiteOptions options; | |||
} LiteConfig; | |||
//! get default config | |||
LITE_API LiteConfig* default_config(); | |||
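/*!
 * \brief Example: start from the default config and override a few fields
 * before creating a network. Illustrative sketch; the chosen options and the
 * LITE_CPU enum value are assumptions.
 *
 * \code
 * LiteConfig config = *default_config();
 * config.device_type = LITE_CPU;
 * config.options.weight_preprocess = 1;
 *
 * LiteNetwork network;
 * LITE_make_network_config(&network, config);
 * \endcode
 */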
/*! | |||
* \brief config the network input and output item | |||
* | |||
*/ | |||
typedef struct LiteIO { | |||
//! the tensor name in the graph corresponding to the IO | |||
const char* name; | |||
//! Used to mark where the input tensor comes from and where the output tensor
//! is copied to; if is_host is true, the input comes from the host and the
//! output is copied to the host, otherwise the device. Sometimes the input
//! comes from the device and the output does not need to be copied to the
//! host; default is true.
int is_host; | |||
//! The IO type, it can be SHAPE or VALUE; when SHAPE is set, the input or
//! output tensor value is invalid, only the shape will be set, default is VALUE
LiteIOType io_type; | |||
//! The layout configured by the user; if another layout is set before forward
//! or obtained after forward, this layout will be bypassed. If no other layout
//! is set before forward, this layout will be used. If this layout is not set,
//! the model will forward with its origin layout. For an output, it will be
//! used for checking.
LiteLayout config_layout; | |||
} LiteIO; | |||
//! define a default IO | |||
extern LITE_API const LiteIO default_io; | |||
/*! | |||
* \brief the input and output information used when loading the network;
* the NetworkIO will remain in the network until the network is destroyed
*/ | |||
typedef struct LiteNetworkIO { | |||
LiteIO* inputs; | |||
LiteIO* outputs; | |||
size_t input_size;  //!< the number of LiteIO in inputs
size_t output_size;  //!< the number of LiteIO in outputs
} LiteNetworkIO; | |||
//! get default NetworkIO | |||
LITE_API LiteNetworkIO* default_network_io(); | |||
/*! | |||
* \brief A user-implemented allocator function | |||
*/ | |||
//! allocate memory of size in the given device with the given align | |||
typedef void* (*LiteAllocate)(LiteDeviceType device_type, int device_id, | |||
size_t size, size_t align); | |||
//! free the memory pointed by ptr in the given device | |||
typedef void (*LiteFree)(LiteDeviceType device_type, int device_id, void* ptr); | |||
/*! | |||
* \brief the thread affinity callback type
* \param thread_id a number from 0 to (nr_threads - 1); the thread with
* thread_id of (nr_threads - 1) is the main worker thread.
*/ | |||
typedef int (*LiteThreadAffinityCallback)(int thread_id); | |||
typedef int (*LiteAsyncCallback)(); | |||
/*! | |||
* \brief the start/finish callback function types
* \param inputs/outputs the array of LiteIO configured by the user
* \param input_tensors/output_tensors the corresponding real input or output
* tensors, matching the LiteIO array one by one
* \param size the number of IO in the arrays
*/ | |||
typedef int (*LiteStartCallback)(const LiteIO* inputs, | |||
const LiteTensor* input_tensors, size_t size); | |||
typedef int (*LiteFinishCallback)(const LiteIO* outputs, | |||
const LiteTensor* output_tensors, | |||
size_t size); | |||
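/*!
 * \brief Example: a finish callback that prints the output names after each
 * forward. Illustrative sketch only.
 *
 * \code
 * #include <stdio.h>
 *
 * static int print_outputs(const LiteIO* outputs,
 *                          const LiteTensor* output_tensors, size_t size) {
 *     size_t i;
 *     for (i = 0; i < size; ++i) {
 *         printf("output[%zu]: %s\n", i, outputs[i].name);
 *     }
 *     (void)output_tensors;
 *     return 0;
 * }
 * // later, on a loaded network:
 * //   LITE_set_finish_callback(network, print_outputs);
 * \endcode
 */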
/*! | |||
* \brief The network is constructed from a model; it implements model loading,
* initialization, forward execution, and can display some model information
*/ | |||
typedef void* LiteNetwork; | |||
/** | |||
* \brief Create a lite Network object with default config and networkIO. | |||
* \param[out] network The network pointer
* \return int if the return is not zero, error happened, the error message | |||
* can get by LITE_get_last_error | |||
*/ | |||
LITE_API int LITE_make_default_network(LiteNetwork* network); | |||
/** | |||
* \brief Create a lite Network object from the given config and networkIO. | |||
* \param[in] config The configuration to create the network
* \param[in] network_io The io configuration to create the network
* \param[out] network The network pointer | |||
*/ | |||
LITE_API int LITE_make_network(LiteNetwork* network, const LiteConfig config, | |||
const LiteNetworkIO network_io); | |||
/** | |||
* \brief Create a lite Network object from the given config and networkIO. | |||
* \param[in] config The configuration to create the network
* \param[out] network The network pointer | |||
*/ | |||
LITE_API int LITE_make_network_config(LiteNetwork* network, const LiteConfig config); | |||
/** | |||
* \brief load the model into the network from memory
* \param[in] model_mem The model in memory
* \param[in] size The size of the model memory
* \param[out] network The network to load the model into
*/ | |||
LITE_API int LITE_load_model_from_mem(LiteNetwork network, void* model_mem, | |||
size_t size); | |||
/** | |||
* \brief load the model into the network from the given path
* \param[in] model_path The model path
* \param[out] network The network to load the model into
*/ | |||
LITE_API int LITE_load_model_from_path(LiteNetwork network, | |||
const char* model_path); | |||
/** | |||
* \brief load a new network which will share weights with the src network
* \param[in] src_network The source network pointer
* \param[out] dst_network The destination network pointer
*/ | |||
LITE_API int LITE_shared_weight_with_network(LiteNetwork dst_network, | |||
const LiteNetwork src_network); | |||
/** | |||
* \brief Destroy a lite network object. | |||
* \param[in] network The network pointer | |||
* \return int if the return is not zero, error happened, the error message | |||
* can get by LITE_get_last_error | |||
*/ | |||
LITE_API int LITE_destroy_network(LiteNetwork network); | |||
/** | |||
* \brief forward the network with filled input data and fill the output data | |||
* to the output tensor | |||
* \param[in] network The loaded model | |||
*/ | |||
LITE_API int LITE_forward(const LiteNetwork network); | |||
/** | |||
* \brief wait until forward finishes in sync mode
* \param[in] network The loaded model | |||
*/ | |||
LITE_API int LITE_wait(const LiteNetwork network); | |||
/** | |||
* \brief get the network input or output tensor, whose layout is
* obtained from the model
* \param[in] network The loaded model | |||
* \param[in] io_name The input or output name | |||
* \param[in] phase The tensor phase | |||
* \param[out] tensor The IO tensor get from the network | |||
*/ | |||
LITE_API int LITE_get_io_tensor(LiteNetwork network, const char* io_name, | |||
LiteTensorPhase phase, LiteTensor* tensor); | |||
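/**
 * \brief Example: a minimal C inference flow with error checking. Illustrative
 * sketch; the model path and the LITE_OUTPUT phase value are assumptions.
 *
 * \code
 * #include <stdio.h>
 *
 * static void run_once(void) {
 *     LiteNetwork network;
 *     if (LITE_make_default_network(&network) ||
 *         LITE_load_model_from_path(network, "./shufflenet.mge")) {
 *         printf("lite error: %s\n", LITE_get_last_error());
 *         return;
 *     }
 *     // fill the input tensors here through LITE_get_io_tensor and
 *     // LITE_get_tensor_memory / LITE_reset_tensor_memory
 *     LITE_forward(network);
 *     LITE_wait(network);
 *
 *     const char* out_name = NULL;
 *     LiteTensor out_tensor;
 *     void* out_ptr = NULL;
 *     LITE_get_output_name(network, 0, &out_name);
 *     LITE_get_io_tensor(network, out_name, LITE_OUTPUT, &out_tensor);
 *     LITE_get_tensor_memory(out_tensor, &out_ptr);
 *     // ... consume out_ptr ...
 *     LITE_destroy_network(network);
 * }
 * \endcode
 */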
/** | |||
* \brief get the input tensor name in the order in loaded model | |||
* \param[in] network The loaded model | |||
* \param[in] index The index of input tensor | |||
* \param[out] name The input tensor name | |||
*/ | |||
LITE_API int LITE_get_input_name(const LiteNetwork network, size_t index, | |||
const char** name); | |||
/** | |||
* \brief get the output tensor name in the order in loaded model | |||
* \param[in] network The loaded model | |||
* \param[in] index The index of output tensor | |||
* \param[out] name The output tensor name | |||
*/ | |||
LITE_API int LITE_get_output_name(const LiteNetwork network, size_t index, | |||
const char** name); | |||
/** | |||
* \brief get all the input tensor name in the order in loaded model | |||
* \param[in] network The loaded model | |||
* \param[in] size The number of the input tensor | |||
* \param[out] name The input tensor names | |||
*/ | |||
LITE_API int LITE_get_all_input_name(const LiteNetwork network, size_t* size, | |||
const char** name); | |||
/** | |||
* \brief get all the output tensor name in the order in loaded model | |||
* \param[in] network The loaded model | |||
* \param[in] size The number of output tensor | |||
* \param[out] name The output tensor name | |||
*/ | |||
LITE_API int LITE_get_all_output_name(const LiteNetwork network, size_t* size, | |||
const char** name); | |||
/** | |||
* \brief get whether the model is running in cpu inplace mode | |||
* \param[in] network The loaded model | |||
* \param[out] is_cpu_inplace_mode whether the network is in cpu inplace mode
*/ | |||
LITE_API int LITE_is_cpu_inplace_mode(const LiteNetwork network, | |||
int* is_cpu_inplace_mode); | |||
/** | |||
* \brief get the number of threads the network will run with
* \param[in] network The loaded model | |||
* \param[out] nr_threads the thread number when the network running | |||
*/ | |||
LITE_API int LITE_get_cpu_threads_number(const LiteNetwork network, | |||
size_t* nr_threads); | |||
/** | |||
* \brief get the device id the network will run with | |||
* \param[in] network The loaded model | |||
* \param[out] device_id the device id the network will run on
*/ | |||
LITE_API int LITE_get_device_id(const LiteNetwork network, int* device_id); | |||
/** | |||
* \brief get the stream id the network will run with | |||
* \param[in] network The loaded model | |||
* \param[out] stream_id the stream id the network will run on
*/ | |||
LITE_API int LITE_get_stream_id(const LiteNetwork network, int* stream_id); | |||
/** | |||
* \brief get the device type the network will run with | |||
* \param[in] network The loaded model | |||
* \param[out] device_type the device type the network will run on
*/ | |||
LITE_API int LITE_get_device_type(const LiteNetwork network, | |||
LiteDeviceType* device_type); | |||
/** | |||
* \brief get the model extra info, which is a json format string
* \param[in] network The loaded model
* \param[out] info the json format memory
* \param[out] info_size the json format memory size
*/ | |||
LITE_API int LITE_get_model_extra_info(const LiteNetwork network, | |||
const char** info, int* info_size); | |||
/** | |||
* \brief Set cpu inplace mode when the device is CPU; on some low-computation
* or single-core devices this mode gives good performance
* \param[in] network The loaded model | |||
*/ | |||
LITE_API int LITE_set_cpu_inplace_mode(LiteNetwork network); | |||
/** | |||
* \brief When the device is CPU, this interface sets the to-be-loaded model to
* run in multi-thread mode with the given thread number.
* \param[in] network The loaded model | |||
* \param[in] nr_threads The threads number | |||
*/ | |||
LITE_API int LITE_set_cpu_threads_number(LiteNetwork network, | |||
size_t nr_threads); | |||
/** | |||
* \brief set device id, default device id = 0 | |||
* \param[in] network The loaded model | |||
* \param[in] device_id The device id to be set | |||
*/ | |||
LITE_API int LITE_set_device_id(LiteNetwork network, int device_id); | |||
/** | |||
* \brief set stream id, default stream id = 0 | |||
* \param[in] network The loaded model | |||
* \param[in] stream_id The stream id to be set | |||
*/ | |||
LITE_API int LITE_set_stream_id(LiteNetwork network, int stream_id); | |||
/** | |||
* \brief enable tensorrt | |||
* \param[in] network The loaded model | |||
*/ | |||
LITE_API int LITE_use_tensorrt(LiteNetwork network); | |||
/** | |||
* \brief set opr algorithm selection strategy in the network | |||
* \param[in] network The loaded model | |||
* \param[in] strategy The operator algorithm selection strategy
*/ | |||
LITE_API int LITE_set_network_algo_policy(LiteNetwork network, | |||
LiteAlgoSelectStrategy strategy); | |||
/** | |||
* \brief set the fastrun config of the opr algorithm selection in the network
* \param[in] network The loaded model | |||
* \param[in] shared_batch_size: the batch size used by fastrun, | |||
* A non-zero value means that fastrun uses this batch size
* regardless of the batch size of the model. Zero means
* fastrun uses the batch size of the model
* \param[in] binary_equal_between_batch: if the content of each input batch is | |||
* binary equal, whether the content of each output batch is
* promised to be equal | |||
*/ | |||
LITE_API int LITE_set_network_algo_fastrun_config( | |||
LiteNetwork network, unsigned int shared_batch_size, | |||
int binary_equal_between_batch); | |||
/** | |||
* \brief set workspace_limit for oprs with multiple algorithms; setting a
* workspace limit can save memory but may influence performance
* \param[in] network The loaded model | |||
* \param[in] workspace_limit The operator algorithm workspace limit | |||
*/ | |||
LITE_API int LITE_set_network_algo_workspace_limit(LiteNetwork network, | |||
size_t workspace_limit); | |||
/** | |||
* \brief set the network forward in async mode and set the async callback | |||
* function | |||
* \param[in] network The loaded model | |||
* \param[in] async_callback when the network finishes forwarding, the callback
* will be called
*/ | |||
LITE_API int LITE_set_async_callback(LiteNetwork network, | |||
const LiteAsyncCallback async_callback); | |||
/** | |||
* \brief set the start forward callback function, which will be executed
* before forward; this can be used to check network inputs or dump model
* inputs for debugging
* \param[in] network The loaded model
* \param[in] start_callback when the network starts forwarding, the callback
* will be called
*/ | |||
LITE_API int LITE_set_start_callback(LiteNetwork network, | |||
const LiteStartCallback start_callback); | |||
/** | |||
* \brief set the finish forward callback function, which will be executed
* after forward; this can be used to dump model outputs for debugging
* \param[in] network The loaded model
* \param[in] finish_callback when the network finishes forwarding, the callback
* will be called
*/ | |||
LITE_API int LITE_set_finish_callback(LiteNetwork network, | |||
const LiteFinishCallback finish_callback); | |||
/** | |||
* \brief set the thread affinity callback
* \param[in] network The loaded model | |||
* \param[in] thread_affinity_callback | |||
*/ | |||
LITE_API int LITE_set_runtime_thread_affinity( | |||
LiteNetwork network, | |||
const LiteThreadAffinityCallback thread_affinity_callback); | |||
/** | |||
* \brief set the network memory allocator, the allocator is defined by the user
* \param[in] network The loaded model | |||
* \param[in] allocate_fun The allocate function of the user defined allocator | |||
* \param[in] free_fun The free function of the user defined allocator | |||
*/ | |||
LITE_API int LITE_set_memory_allocator(LiteNetwork network, | |||
const LiteAllocate allocate_fun, | |||
const LiteFree free_fun); | |||
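/**
 * \brief Example: plug a host-side allocator into a network. Illustrative
 * sketch; only the CPU path is handled, C11 aligned_alloc is assumed to be
 * available, and `network` is assumed to be a created LiteNetwork.
 *
 * \code
 * #include <stdlib.h>
 *
 * static void* my_alloc(LiteDeviceType device, int device_id, size_t size,
 *                       size_t align) {
 *     (void)device;
 *     (void)device_id;
 *     // C11 aligned_alloc requires size to be a multiple of align
 *     return aligned_alloc(align, (size + align - 1) / align * align);
 * }
 *
 * static void my_free(LiteDeviceType device, int device_id, void* ptr) {
 *     (void)device;
 *     (void)device_id;
 *     free(ptr);
 * }
 *
 * // LITE_set_memory_allocator(network, my_alloc, my_free);
 * \endcode
 */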
/** | |||
* \brief the dst_network shares the runtime memory with the src_network
* \param[in] src_network The source network
* \param[in] dst_network The dst network to share memory with the src_network
*/ | |||
LITE_API int LITE_share_runtime_memroy(LiteNetwork src_network, | |||
LiteNetwork dst_network); | |||
/** | |||
* \brief enable profiling of the network; a JSON format result file will be
* generated
* \param[in] network The loaded model | |||
* \param[in] profile_json_file_path The profile result file path | |||
*/ | |||
LITE_API int LITE_enable_profile_performance( | |||
LiteNetwork network, const char* profile_json_file_path); | |||
/** | |||
* \brief Dump input/output values of all internal variables to output file, | |||
* in text format | |||
* \param[in] network The loaded model | |||
* \param[in] io_txt_out_file The dumped txt file name | |||
*/ | |||
LITE_API int LITE_enable_io_txt_dump(LiteNetwork network, | |||
const char* io_txt_out_file); | |||
/** | |||
* \brief Dump input/output values of all internal variables to output | |||
* directory, in binary format | |||
* \param[in] network The loaded model | |||
* \param[in] io_bin_out_dir The dumped bin file directory | |||
*/ | |||
LITE_API int LITE_enable_io_bin_dump(LiteNetwork network, | |||
const char* io_bin_out_dir); | |||
#ifdef __cplusplus | |||
} | |||
#endif | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,251 @@ | |||
/** | |||
* \file lite-c/include/lite-c/tensor_c.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#ifndef LITE_TENSOR_C_H_ | |||
#define LITE_TENSOR_C_H_ | |||
#include "common_enum_c.h" | |||
#include "macro.h" | |||
#ifdef __cplusplus | |||
extern "C" { | |||
#endif | |||
#include "stddef.h" | |||
#include "stdint.h" | |||
#define LAYOUT_MAX_DIM (7) | |||
/*! | |||
* \brief the simple layout description | |||
*/ | |||
typedef struct LiteLayout { | |||
size_t shapes[LAYOUT_MAX_DIM]; | |||
size_t ndim; | |||
LiteDataType data_type; | |||
} LiteLayout; | |||
//! define a default LiteLayout | |||
extern LITE_API const LiteLayout default_layout; | |||
/*! | |||
* \brief wrapper of the MegEngine Tensor
*
* if is_pinned_host is set, the storage memory of the tensor is pinned memory;
* this is used to optimize the H2D or D2H memory copy. If the device or layout
* is not set, when copying from another device (CUDA, OpenCL) tensor, this
* tensor will be automatically set to a pinned tensor
*/ | |||
typedef struct LiteTensorDesc { | |||
//! flag whether the storage of the tensor is pinned; this is only used when
//! the comp node is not CPU
int is_pinned_host; | |||
//! the layout of the tensor | |||
LiteLayout layout; | |||
//! the device of the tensor should not be changed after the tensor has
//! been constructed
LiteDeviceType device_type; | |||
//! device id of the tensor | |||
int device_id; | |||
} LiteTensorDesc; | |||
//! define a default TensorDesc | |||
extern LITE_API const LiteTensorDesc default_desc; | |||
/*! | |||
* \brief The pointer to a Lite Tensor object | |||
*/ | |||
typedef void* LiteTensor; | |||
/** | |||
* \brief Create a lite tensor object from the given description.
* \param[in] tensor_describe The description to create the Tensor | |||
* \param[out] tensor The Tensor pointer | |||
* \return int if the return is not zero, error happened, the error message | |||
* can get by LITE_get_last_error | |||
*/ | |||
LITE_API int LITE_make_tensor(const LiteTensorDesc tensor_describe, | |||
LiteTensor* tensor); | |||
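/**
 * \brief Example: build a host tensor description and create the tensor.
 * Illustrative sketch; the shape and the LITE_FLOAT / LITE_CPU enum values are
 * assumptions.
 *
 * \code
 * LiteTensorDesc desc = default_desc;  // start from the default description
 * desc.layout.ndim = 2;
 * desc.layout.shapes[0] = 1;
 * desc.layout.shapes[1] = 1000;
 * desc.layout.data_type = LITE_FLOAT;
 * desc.device_type = LITE_CPU;
 *
 * LiteTensor tensor;
 * if (LITE_make_tensor(desc, &tensor) != 0) {
 *     // the error message is available through LITE_get_last_error()
 * }
 * \endcode
 */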
/** | |||
* \brief Destroy a lite tensor object. | |||
* \param[in] tensor The Tensor pointer | |||
* \return int if the return is not zero, error happened, the error message | |||
* can get by LITE_get_last_error | |||
*/ | |||
LITE_API int LITE_destroy_tensor(LiteTensor tensor); | |||
/** | |||
* \brief change the layout of a Tensor object. | |||
* \param[in] tensor The Tensor | |||
* \param[in] layout The Layout to be set to the tensor
*/ | |||
LITE_API int LITE_set_tensor_layout(LiteTensor tensor, const LiteLayout layout); | |||
/** | |||
* \brief use the user-allocated data to reset the memory of the tensor; the
* memory will not be managed by lite, so the user should free it later.
* \param[in] tensor The Tensor
* \param[in] prepared_data The allocated memory which satisfies the Tensor
* layout
* \param[in] data_length_in_byte The length of the allocated memory
*/ | |||
LITE_API int LITE_reset_tensor_memory(LiteTensor tensor, void* prepared_data, | |||
size_t data_length_in_byte); | |||
/** | |||
* \brief use the user-allocated data and the corresponding layout to reset the
* data and layout of the tensor; the memory will not be managed by lite, so
* the user should free it later.
* \param[in] tensor The Tensor
* \param[in] layout The Layout to be set to the tensor
* \param[in] prepared_data The allocated memory which satisfies the layout to
* be set
*/ | |||
LITE_API int LITE_reset_tensor(LiteTensor tensor, const LiteLayout layout, | |||
void* prepared_data); | |||
/** | |||
* \brief reshape a tensor without changing the memory; the total number of
* elements in the reshaped tensor must equal that of the origin tensor; the
* input shape may contain at most one -1 to flag that the dimension can be
* deduced automatically.
* \param[in] tensor The Tensor to be reshaped
* \param[in] shape the user input shape
* \param[in] size the number of elements in shape
*/ | |||
LITE_API int LITE_tensor_reshape(LiteTensor tensor, const int* shape, int size); | |||
/** | |||
* \brief slice a tensor with the given parameters
* \param[in] tensor The Tensor to be sliced
* \param[in] start start index of every axis to be sliced
* \param[in] end end index of every axis to be sliced
* \param[in] step step of every axis to be sliced; if nullptr, the step will
* be 1
* \param[in] size the number of axes to be sliced
* \param[out] slice_tensor the result tensor sliced from the origin tensor
*/ | |||
LITE_API int LITE_tensor_slice(const LiteTensor tensor, const size_t* start, | |||
const size_t* end, const size_t* step, | |||
size_t size, LiteTensor* slice_tensor); | |||
/** | |||
* \brief fill zero to the tensor | |||
* \param[in] tensor The Tensor to be memset | |||
*/ | |||
LITE_API int LITE_tensor_fill_zero(LiteTensor tensor); | |||
/** | |||
* \brief copy tensor from another tensor
* \param[out] dst_tensor The Tensor to copy into | |||
* \param[in] src_tensor The Tensor to copy from | |||
*/ | |||
LITE_API int LITE_tensor_copy(LiteTensor dst_tensor, | |||
const LiteTensor src_tensor); | |||
/** | |||
* \brief share memory from another tensor
* \param[out] dst_tensor The Tensor to share into | |||
* \param[in] src_tensor The Tensor to be shared | |||
*/ | |||
LITE_API int LITE_tensor_share_memory_with(LiteTensor dst_tensor, | |||
const LiteTensor src_tensor); | |||
/** | |||
* \brief get the memory pointer of a Tensor object. | |||
* \param[in] tensor The input Tensor | |||
* \param[out] data a pointer to void pointer | |||
*/ | |||
LITE_API int LITE_get_tensor_memory(const LiteTensor tensor, void** data); | |||
/** | |||
* \brief get the memory pointer of a Tensor object. | |||
* \param[in] tensor The input Tensor | |||
* \param[in] index The coordinate in the tensor | |||
* \param[in] size The length of the coordinate
* \param[out] data a pointer to void pointer | |||
*/ | |||
LITE_API int LITE_get_tensor_memory_with_index(const LiteTensor tensor, | |||
const size_t* index, size_t size, | |||
void** data); | |||
/** | |||
* \brief get the tensor capacity in bytes of a Tensor object.
* \param[in] tensor The input Tensor
* \param[out] size a pointer to the returned size
*/ | |||
LITE_API int LITE_get_tensor_total_size_in_byte(const LiteTensor tensor, | |||
size_t* size); | |||
/** | |||
* \brief get the tensor layout of a Tensor object. | |||
* \param[in] tensor The input Tensor | |||
* \param[out] layout a pointer that will be written with the layout of the
* tensor
*/ | |||
LITE_API int LITE_get_tensor_layout(const LiteTensor tensor, | |||
LiteLayout* layout); | |||
/** | |||
* \brief get the tensor device of a Tensor object. | |||
* \param[in] tensor The input Tensor | |||
* \param[out] device_type a pointer that will be written with the device type
* of the tensor
*/ | |||
LITE_API int LITE_get_tensor_device_type(const LiteTensor tensor, | |||
LiteDeviceType* device_type); | |||
/** | |||
* \brief get the tensor device id of a Tensor object. | |||
* \param[in] tensor The input Tensor | |||
* \param[out] device_id a pointer that will be written with the device id of
* the tensor
*/ | |||
LITE_API int LITE_get_tensor_device_id(const LiteTensor tensor, int* device_id); | |||
/** | |||
* \brief whether the tensor is pinned host.
* \param[in] tensor The input Tensor
* \param[out] is_pinned_host an int pointer that will be written with whether
* the tensor is pinned host
*/ | |||
LITE_API int LITE_is_pinned_host(const LiteTensor tensor, int* is_pinned_host); | |||
/** | |||
* \brief whether the tensor memory is contiguous.
* \param[in] tensor The input Tensor
* \param[out] is_continue an int pointer that will be written with whether the
* tensor memory is contiguous
*/ | |||
LITE_API int LITE_is_memory_continue(const LiteTensor tensor, int* is_continue); | |||
/** | |||
* \brief concat the input tensors into one big tensor
* \param[in] tensors The input Tensors
* \param[in] nr_tensor The number of input Tensors
* \param[in] dim the dim the concat acts on
* \param[in] dst_device the device type of the result tensor; when
* LITE_DEVICE_DEFAULT, the result tensor device type will be taken from the
* first tensor
* \param[in] device_id the device id of the result tensor; when -1, the result
* tensor device id will be taken from the first tensor
* \param[out] result_tensor the result tensor after concat
*/ | |||
LITE_API int LITE_tensor_concat(LiteTensor* tensors, int nr_tensor, int dim, | |||
LiteDeviceType dst_device, int device_id, | |||
LiteTensor* result_tensor); | |||
#ifdef __cplusplus | |||
} | |||
#endif | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,73 @@ | |||
/** | |||
* \file lite-c/src/common.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#ifndef LITE_C_COMMON_H_ | |||
#define LITE_C_COMMON_H_ | |||
#include "../src/misc.h" | |||
#include "lite-c/network_c.h" | |||
#include "lite-c/tensor_c.h" | |||
#include "lite/network.h" | |||
#include <exception> | |||
#include <stdexcept> | |||
//! convert c Layout to lite::Layout | |||
lite::Layout convert_to_layout(const LiteLayout& layout); | |||
//! convert lite::Layout to C Layout | |||
LiteLayout convert_to_clayout(const lite::Layout& layout); | |||
//! convert c config to lite::config | |||
lite::Config convert_to_lite_config(const LiteConfig c_config); | |||
//! convert C NetworkIO io to lite::NetworkIO | |||
lite::NetworkIO convert_to_lite_io(const LiteNetworkIO c_network_io); | |||
/*! | |||
* \brief handle exception | |||
* \param e the exception | |||
* \return the return value of the error | |||
*/ | |||
int LiteHandleException(const std::exception& e); | |||
#if LITE_ENABLE_EXCEPTION | |||
/*! \brief macro to guard a function */ | |||
#define LITE_CAPI_BEGIN() try { | |||
/*! \brief every function starts with LITE_CAPI_BEGIN(); | |||
* ends with LITE_CAPI_END or LITE_CAPI_END_WITH_STMS | |||
*/ | |||
#define LITE_CAPI_END() \ | |||
} \ | |||
catch (std::exception & _except_) { \ | |||
return LiteHandleException(_except_); \ | |||
} \ | |||
return 0; | |||
#else | |||
/*! \brief macro to guard a function */ | |||
#define LITE_CAPI_BEGIN() { | |||
/*! \brief every function starts with LITE_CAPI_BEGIN(); | |||
* ends with LITE_CAPI_END or LITE_CAPI_END_WITH_STMS | |||
*/ | |||
#define LITE_CAPI_END() \ | |||
} \ | |||
return 0; | |||
#endif | |||
/*! | |||
* \brief catch the exception and execute the given statements (_stms) before
* returning
*/ | |||
#define LITE_CAPI_END_WITH_STMS(_stms) \ | |||
} \ | |||
catch (std::exception & _except_) { \ | |||
_stms; \ | |||
return LiteHandleException(_except_); \ | |||
} \ | |||
return 0; | |||
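/*!
 * \brief Example of how a C entry point is typically guarded with these
 * macros; LITE_some_api and its argument are hypothetical.
 *
 * \code
 * int LITE_some_api(LiteTensor tensor) {
 *     LITE_CAPI_BEGIN();
 *     LITE_ASSERT(tensor, "The ptr pass to LITE api is null");
 *     // ... call into the C++ implementation ...
 *     LITE_CAPI_END();
 * }
 * \endcode
 */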
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,192 @@ | |||
/** | |||
* \file lite-c/src/tensor.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "lite/global.h" | |||
#include "common.h" | |||
#include "lite-c/global_c.h" | |||
#include <exception> | |||
#include <mutex> | |||
namespace { | |||
class ErrorMsg { | |||
public: | |||
std::string& get_error_msg() { return error_msg; } | |||
void set_error_msg(const std::string& msg) { error_msg = msg; } | |||
private: | |||
std::string error_msg; | |||
}; | |||
ErrorMsg& get_global_error() { | |||
static thread_local ErrorMsg error_msg; | |||
return error_msg; | |||
} | |||
} // namespace | |||
int LiteHandleException(const std::exception& e) { | |||
get_global_error().set_error_msg(e.what()); | |||
return -1; | |||
} | |||
const char* LITE_get_last_error() { | |||
return get_global_error().get_error_msg().c_str(); | |||
} | |||
int LITE_get_version(int* major, int* minor, int* patch) { | |||
LITE_ASSERT(major && minor && patch, "The ptr pass to LITE api is null"); | |||
lite::get_version(*major, *minor, *patch); | |||
return 0; | |||
} | |||
int LITE_get_device_count(LiteDeviceType device_type, size_t* count) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(count, "The ptr pass to LITE api is null"); | |||
*count = lite::get_device_count(device_type); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_try_coalesce_all_free_memory() {
LITE_CAPI_BEGIN(); | |||
lite::try_coalesce_all_free_memory(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_register_decryption_and_key(const char* decrypt_name, | |||
const LiteDecryptionFunc func, | |||
const uint8_t* key_data, size_t key_size) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(decrypt_name && key_data && func, | |||
"The ptr pass to LITE api is null"); | |||
std::vector<uint8_t> key; | |||
for (size_t i = 0; i < key_size; i++) { | |||
key.push_back(key_data[i]); | |||
} | |||
auto decrypt_func = [func](const void* input_data, size_t input_size, | |||
const std::vector<uint8_t>& key) { | |||
auto size = | |||
func(input_data, input_size, key.data(), key.size(), nullptr); | |||
std::vector<uint8_t> output(size, 0); | |||
func(input_data, input_size, key.data(), key.size(), output.data()); | |||
return output; | |||
}; | |||
lite::register_decryption_and_key(decrypt_name, decrypt_func, key); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_update_decryption_or_key(const char* decrypt_name, | |||
const LiteDecryptionFunc func, | |||
const uint8_t* key_data, size_t key_size) { | |||
LITE_CAPI_BEGIN(); | |||
std::vector<uint8_t> key; | |||
for (size_t i = 0; i < key_size; i++) { | |||
key.push_back(key_data[i]); | |||
} | |||
lite::DecryptionFunc decrypt_func = nullptr; | |||
if (func) { | |||
decrypt_func = [func](const void* input_data, size_t input_size, | |||
const std::vector<uint8_t>& key) { | |||
auto size = func(input_data, input_size, key.data(), key.size(), | |||
nullptr); | |||
std::vector<uint8_t> output(size, 0); | |||
func(input_data, input_size, key.data(), key.size(), output.data()); | |||
return output; | |||
}; | |||
} | |||
lite::update_decryption_or_key(decrypt_name, decrypt_func, key); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_register_parse_info_func(const char* info_type, | |||
const LiteParseInfoFunc parse_func) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(info_type && parse_func, "The ptr pass to LITE api is null"); | |||
auto lite_func = [parse_func]( | |||
const void* info_data, size_t info_size, | |||
const std::string model_name, lite::Config& config, | |||
lite::NetworkIO& network_io, | |||
std::unordered_map<std::string, lite::LiteAny>& | |||
separate_config_map, | |||
std::string& extra_info) { | |||
LITE_MARK_USED_VAR(extra_info); | |||
size_t nr_threads = 1; | |||
int device_id = 0, is_cpu_inplace_mode = false, use_tensorrt = false; | |||
LiteNetworkIO c_io; | |||
LiteConfig c_config; | |||
auto ret = parse_func(info_data, info_size, model_name.c_str(), | |||
&c_config, &c_io, &device_id, &nr_threads, | |||
&is_cpu_inplace_mode, &use_tensorrt); | |||
config = convert_to_lite_config(c_config); | |||
network_io = convert_to_lite_io(c_io); | |||
if (device_id != 0) { | |||
separate_config_map["device_id"] = device_id; | |||
} | |||
if (nr_threads != 1) { | |||
separate_config_map["nr_threads"] = nr_threads; | |||
} | |||
if (is_cpu_inplace_mode != false) { | |||
separate_config_map["is_inplace_mode"] = is_cpu_inplace_mode; | |||
} | |||
if (use_tensorrt != false) { | |||
separate_config_map["use_tensorrt"] = use_tensorrt; | |||
} | |||
return ret; | |||
}; | |||
lite::register_parse_info_func(info_type, lite_func); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_set_loader_lib_path(const char* loader_path) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(loader_path, "The ptr pass to LITE api is null"); | |||
lite::set_loader_lib_path(loader_path); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_set_persistent_cache(const char* cache_path, int always_sync) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(cache_path, "The ptr pass to LITE api is null"); | |||
lite::set_persistent_cache(cache_path, always_sync); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_set_tensor_rt_cache(const char* cache_path) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(cache_path, "The ptr pass to LITE api is null"); | |||
lite::set_tensor_rt_cache(cache_path); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_set_log_level(LiteLogLevel level) { | |||
LITE_CAPI_BEGIN(); | |||
lite::set_log_level(level); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_get_log_level(LiteLogLevel* level) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(level, "The ptr pass to LITE api is null"); | |||
*level = lite::get_log_level(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_dump_persistent_cache(const char* cache_path) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(cache_path, "The ptr pass to LITE api is null"); | |||
lite::dump_persistent_cache(cache_path); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_dump_tensor_rt_cache() { | |||
LITE_CAPI_BEGIN(); | |||
lite::dump_tensor_rt_cache(); | |||
LITE_CAPI_END(); | |||
} | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,580 @@ | |||
/** | |||
* \file lite-c/src/network.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "lite/network.h" | |||
#include "common.h" | |||
#include "lite-c/network_c.h" | |||
#include "../../src/network_impl_base.h" | |||
#include <memory> | |||
#include <mutex> | |||
#include <unordered_map> | |||
#include <string.h> | |||
//! define a default Options | |||
const LiteOptions default_option = { | |||
.weight_preprocess = false, | |||
.fuse_preprocess = false, | |||
.fake_next_exec = false, | |||
.var_sanity_check_first_run = true, | |||
.const_shape = false, | |||
.force_dynamic_alloc = false, | |||
.force_output_dynamic_alloc = false, | |||
.no_profiling_on_shape_change = false, | |||
.jit_level = 0, | |||
.comp_node_seq_record_level = 0, | |||
.graph_opt_level = 2, | |||
.async_exec_level = 1, | |||
//! layout transform options | |||
.enable_nchw44 = 0, | |||
.enable_nchw44_dot = 0, | |||
.enable_nchw88 = 0, | |||
.enable_nhwcd4 = 0, | |||
.enable_nchw4 = 0, | |||
.enable_nchw32 = 0, | |||
.enable_nchw64 = 0, | |||
}; | |||
//! define a default config | |||
LiteConfig default_config_t = {.has_compression = false, | |||
.device_id = -1, | |||
.device_type = LiteDeviceType::LITE_CPU, | |||
.backend = LiteBackend::LITE_DEFAULT, | |||
.bare_model_cryption_name = nullptr, | |||
.options = default_option}; | |||
LiteConfig* default_config() { | |||
return &default_config_t; | |||
} | |||
//! define a default IO | |||
const LiteIO default_io = {.name = nullptr, | |||
.is_host = true, | |||
.io_type = LiteIOType::LITE_IO_VALUE, | |||
.config_layout = default_layout}; | |||
//! define a default NetworkIO | |||
LiteNetworkIO default_network_io_t = {.inputs = nullptr, | |||
.outputs = nullptr, | |||
.input_size = 0, | |||
.output_size = 0}; | |||
LiteNetworkIO* default_network_io() { | |||
return &default_network_io_t; | |||
} | |||
namespace { | |||
std::unordered_map<void*, std::shared_ptr<lite::Network>>& | |||
get_global_network_holder() {
static thread_local std::unordered_map<void*, | |||
std::shared_ptr<lite::Network>> | |||
network_holder; | |||
return network_holder; | |||
} | |||
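//! NOTE: the network holder above is thread_local, so it only keeps alive the
//! networks created through the C API on the current thread; as a consequence,
//! LITE_destroy_network should be called from the same thread that created the
//! handle, otherwise the erase performed there finds nothing to release.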
/*! | |||
* \brief A user-implemented allocator interface | |||
*/ | |||
class UserAllocator : public lite::Allocator { | |||
public: | |||
UserAllocator(LiteAllocate allocate_func, LiteFree free_func) | |||
: m_allocator(allocate_func), m_free(free_func) { | |||
LITE_ASSERT(m_allocator && m_free); | |||
} | |||
//! allocate memory of size in the given device with the given align | |||
void* allocate(LiteDeviceType device_type, int device_id, size_t size, | |||
size_t align) override { | |||
return m_allocator(device_type, device_id, size, align); | |||
} | |||
//! free the memory pointed by ptr in the given device | |||
void free(LiteDeviceType device_type, int device_id, void* ptr) override { | |||
m_free(device_type, device_id, ptr); | |||
} | |||
private: | |||
LiteAllocate m_allocator; | |||
LiteFree m_free; | |||
}; | |||
} // namespace | |||
//! convert c config to lite::config | |||
lite::Config convert_to_lite_config(const LiteConfig c_config) { | |||
lite::Config lite_config; | |||
lite_config.device_type = c_config.device_type; | |||
if (c_config.bare_model_cryption_name) { | |||
lite_config.bare_model_cryption_name = | |||
c_config.bare_model_cryption_name; | |||
} | |||
lite_config.backend = c_config.backend; | |||
lite_config.has_compression = c_config.has_compression; | |||
lite_config.device_id = c_config.device_id; | |||
lite_config.options.weight_preprocess = c_config.options.weight_preprocess; | |||
lite_config.options.fuse_preprocess = c_config.options.fuse_preprocess; | |||
lite_config.options.fake_next_exec = c_config.options.fake_next_exec; | |||
lite_config.options.var_sanity_check_first_run = | |||
c_config.options.var_sanity_check_first_run; | |||
lite_config.options.const_shape = c_config.options.const_shape; | |||
    lite_config.options.force_dynamic_alloc =
            c_config.options.force_dynamic_alloc;
lite_config.options.force_output_dynamic_alloc = | |||
c_config.options.force_output_dynamic_alloc; | |||
lite_config.options.no_profiling_on_shape_change = | |||
c_config.options.no_profiling_on_shape_change; | |||
lite_config.options.jit_level = c_config.options.jit_level; | |||
lite_config.options.comp_node_seq_record_level = | |||
c_config.options.comp_node_seq_record_level; | |||
lite_config.options.graph_opt_level = c_config.options.graph_opt_level; | |||
lite_config.options.async_exec_level = c_config.options.async_exec_level; | |||
lite_config.options.enable_nchw44 = c_config.options.enable_nchw44; | |||
lite_config.options.enable_nchw44_dot = c_config.options.enable_nchw44_dot; | |||
lite_config.options.enable_nchw88 = c_config.options.enable_nchw88; | |||
lite_config.options.enable_nchw4 = c_config.options.enable_nchw4; | |||
lite_config.options.enable_nhwcd4 = c_config.options.enable_nhwcd4; | |||
lite_config.options.enable_nchw32 = c_config.options.enable_nchw32; | |||
lite_config.options.enable_nchw64 = c_config.options.enable_nchw64; | |||
return lite_config; | |||
} | |||
//! convert C NetworkIO io to lite::NetworkIO | |||
lite::NetworkIO convert_to_lite_io(const LiteNetworkIO c_network_io) { | |||
lite::NetworkIO network_io; | |||
for (size_t i = 0; i < c_network_io.input_size; i++) { | |||
LiteIO* c_io = c_network_io.inputs + i; | |||
LITE_ASSERT(c_io->name, "input name of io tensor must set."); | |||
network_io.inputs.push_back( | |||
{c_io->name, static_cast<bool>(c_io->is_host), c_io->io_type, | |||
convert_to_layout(c_io->config_layout)}); | |||
} | |||
for (size_t i = 0; i < c_network_io.output_size; i++) { | |||
LiteIO* c_io = c_network_io.outputs + i; | |||
LITE_ASSERT(c_io->name, "output name of io tensor must set."); | |||
network_io.outputs.push_back( | |||
{c_io->name, static_cast<bool>(c_io->is_host), c_io->io_type, | |||
convert_to_layout(c_io->config_layout)}); | |||
} | |||
return network_io; | |||
} | |||
int LITE_make_default_network(LiteNetwork* network) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
auto lite_network = std::make_shared<lite::Network>(); | |||
    get_global_network_holder()[lite_network.get()] = lite_network;
*network = lite_network.get(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_make_network(LiteNetwork* network, const LiteConfig config, | |||
const LiteNetworkIO network_io) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
auto lite_network = std::make_shared<lite::Network>( | |||
convert_to_lite_config(config), convert_to_lite_io(network_io)); | |||
    get_global_network_holder()[lite_network.get()] = lite_network;
*network = lite_network.get(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_make_network_config(LiteNetwork* network, const LiteConfig config) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
auto lite_network = | |||
std::make_shared<lite::Network>(convert_to_lite_config(config)); | |||
    get_global_network_holder()[lite_network.get()] = lite_network;
*network = lite_network.get(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_load_model_from_mem(LiteNetwork network, void* model_mem, | |||
size_t size) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
LITE_ASSERT(model_mem, "The model memory pass to LITE api is null"); | |||
static_cast<lite::Network*>(network)->load_model(model_mem, size); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_load_model_from_path(LiteNetwork network, const char* model_path) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
LITE_ASSERT(model_path, "The model path pass to LITE api is null"); | |||
static_cast<lite::Network*>(network)->load_model(model_path); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_destroy_network(LiteNetwork network) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
    get_global_network_holder().erase(network);
LITE_CAPI_END(); | |||
} | |||
int LITE_forward(const LiteNetwork network) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
static_cast<lite::Network*>(network)->forward(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_wait(const LiteNetwork network) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
static_cast<lite::Network*>(network)->wait(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_get_io_tensor(LiteNetwork network, const char* io_name, | |||
LiteTensorPhase phase, LiteTensor* tensor) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
auto io_tensor = | |||
static_cast<lite::Network*>(network)->get_io_tensor(io_name, phase); | |||
*tensor = io_tensor.get(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_get_input_name(const LiteNetwork network, size_t index, | |||
const char** name) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network && name, "The network pass to LITE api is null"); | |||
*name = lite::NetworkHelper::implement(static_cast<lite::Network*>(network)) | |||
->get_input_name(index); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_get_output_name(const LiteNetwork network, size_t index, | |||
const char** name) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
LITE_ASSERT(name, "The name ptr pass to LITE api is null"); | |||
*name = lite::NetworkHelper::implement(static_cast<lite::Network*>(network)) | |||
->get_output_name(index); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_get_all_input_name(const LiteNetwork network, size_t* size, | |||
const char** name) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
auto&& names = | |||
lite::NetworkHelper::implement(static_cast<lite::Network*>(network)) | |||
->get_all_input_name(); | |||
if (size) | |||
*size = names.size(); | |||
if (name) { | |||
for (auto in_name : names) { | |||
*name = in_name; | |||
name++; | |||
} | |||
} | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_get_all_output_name(const LiteNetwork network, size_t* size, | |||
const char** name) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
auto&& names = | |||
lite::NetworkHelper::implement(static_cast<lite::Network*>(network)) | |||
->get_all_output_name(); | |||
if (size) | |||
*size = names.size(); | |||
if (name) { | |||
for (auto in_name : names) { | |||
*name = in_name; | |||
name++; | |||
} | |||
} | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_set_device_id(LiteNetwork network, int device_id) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
static_cast<lite::Network*>(network)->set_device_id(device_id); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_get_device_id(const LiteNetwork network, int* device_id) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
LITE_ASSERT(device_id, "The device_id pass to LITE api is null"); | |||
*device_id = static_cast<lite::Network*>(network)->get_device_id(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_set_stream_id(LiteNetwork network, int stream_id) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
static_cast<lite::Network*>(network)->set_stream_id(stream_id); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_get_stream_id(const LiteNetwork network, int* stream_id) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
LITE_ASSERT(stream_id, "The stream_id pass to LITE api is null"); | |||
*stream_id = static_cast<lite::Network*>(network)->get_stream_id(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_get_model_extra_info(const LiteNetwork network, const char** info, | |||
int* info_size) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
    LITE_ASSERT(info && info_size,
                "The info and info_size pass to LITE api are null");
auto& extra_info = | |||
static_cast<lite::Network*>(network)->get_model_extra_info(); | |||
*info_size = extra_info.size(); | |||
*info = extra_info.c_str(); | |||
LITE_MARK_USED_VAR(info); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_get_device_type(const LiteNetwork network, | |||
LiteDeviceType* device_type) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
LITE_ASSERT(device_type, "The device_type pass to LITE api is null"); | |||
*device_type = static_cast<lite::Network*>(network)->get_device_type(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_set_async_callback(LiteNetwork network, | |||
const LiteAsyncCallback async_callback) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
LITE_ASSERT(async_callback, "The ptr pass to LITE api is null"); | |||
static_cast<lite::Network*>(network)->set_async_callback( | |||
std::move(async_callback)); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_set_start_callback(LiteNetwork network, | |||
const LiteStartCallback start_callback) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
auto lite_start_callback = | |||
[start_callback]( | |||
const std::unordered_map< | |||
std::string, | |||
std::pair<lite::IO, std::shared_ptr<lite::Tensor>>>& | |||
inputs_map) -> void { | |||
std::vector<LiteIO> ios; | |||
std::vector<LiteTensor> io_tensors; | |||
size_t nr_io = 0; | |||
for (const auto& io : inputs_map) { | |||
nr_io++; | |||
auto&& lite_io = io.second.first; | |||
ios.push_back({lite_io.name.c_str(), lite_io.is_host, | |||
lite_io.io_type, | |||
convert_to_clayout(lite_io.config_layout)}); | |||
io_tensors.push_back(io.second.second.get()); | |||
} | |||
start_callback(ios.data(), io_tensors.data(), nr_io); | |||
}; | |||
static_cast<lite::Network*>(network)->set_start_callback( | |||
lite_start_callback); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_set_finish_callback(LiteNetwork network, | |||
const LiteFinishCallback finish_callback) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
auto lite_finish_callback = | |||
[finish_callback]( | |||
const std::unordered_map< | |||
std::string, | |||
std::pair<lite::IO, std::shared_ptr<lite::Tensor>>>& | |||
outputs_map) -> void { | |||
std::vector<LiteIO> ios; | |||
std::vector<LiteTensor> io_tensors; | |||
size_t nr_io = 0; | |||
for (const auto& io : outputs_map) { | |||
nr_io++; | |||
auto&& lite_io = io.second.first; | |||
ios.push_back({lite_io.name.c_str(), lite_io.is_host, | |||
lite_io.io_type, | |||
convert_to_clayout(lite_io.config_layout)}); | |||
io_tensors.push_back(io.second.second.get()); | |||
} | |||
finish_callback(ios.data(), io_tensors.data(), nr_io); | |||
}; | |||
static_cast<lite::Network*>(network)->set_finish_callback( | |||
lite_finish_callback); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_enable_profile_performance(LiteNetwork network, | |||
const char* profile_json_file_path) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
static_cast<lite::Network*>(network)->enable_profile_performance( | |||
profile_json_file_path); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_is_cpu_inplace_mode(const LiteNetwork network, | |||
int* is_cpu_inplace_mode) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network && is_cpu_inplace_mode, | |||
"The network pass to LITE api is null"); | |||
std::shared_ptr<lite::Network> network_shared{ | |||
static_cast<lite::Network*>(network), [](void*) {}}; | |||
*is_cpu_inplace_mode = lite::Runtime::is_cpu_inplace_mode(network_shared); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_get_cpu_threads_number(const LiteNetwork network, size_t* nr_threads) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
LITE_ASSERT(nr_threads, "The ptr pass to LITE api is null"); | |||
std::shared_ptr<lite::Network> network_shared{ | |||
static_cast<lite::Network*>(network), [](void*) {}}; | |||
*nr_threads = lite::Runtime::get_cpu_threads_number(network_shared); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_set_cpu_inplace_mode(LiteNetwork network) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
std::shared_ptr<lite::Network> network_shared{ | |||
static_cast<lite::Network*>(network), [](void*) {}}; | |||
lite::Runtime::set_cpu_inplace_mode(network_shared); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_use_tensorrt(LiteNetwork network){ | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
std::shared_ptr<lite::Network> network_shared{ | |||
static_cast<lite::Network*>(network), [](void*) {}}; | |||
lite::Runtime::use_tensorrt(network_shared); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_set_cpu_threads_number(LiteNetwork network, size_t nr_threads) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
std::shared_ptr<lite::Network> network_shared{ | |||
static_cast<lite::Network*>(network), [](void*) {}}; | |||
lite::Runtime::set_cpu_threads_number(network_shared, nr_threads); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_set_network_algo_policy(LiteNetwork network, | |||
LiteAlgoSelectStrategy strategy) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
std::shared_ptr<lite::Network> network_shared{ | |||
static_cast<lite::Network*>(network), [](void*) {}}; | |||
lite::Runtime::set_network_algo_policy(network_shared, strategy); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_set_network_algo_fastrun_config(LiteNetwork network, | |||
unsigned int shared_batch_size, | |||
int binary_equal_between_batch) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
std::shared_ptr<lite::Network> network_shared{ | |||
static_cast<lite::Network*>(network), [](void*) {}}; | |||
lite::Runtime::set_network_algo_policy( | |||
network_shared, LiteAlgoSelectStrategy(0), shared_batch_size, | |||
binary_equal_between_batch); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_set_network_algo_workspace_limit(LiteNetwork network, | |||
size_t workspace_limit) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
std::shared_ptr<lite::Network> network_shared{ | |||
static_cast<lite::Network*>(network), [](void*) {}}; | |||
lite::Runtime::set_network_algo_workspace_limit(network_shared, | |||
workspace_limit); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_set_runtime_thread_affinity( | |||
LiteNetwork network, | |||
const LiteThreadAffinityCallback thread_affinity_callback) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
std::shared_ptr<lite::Network> network_shared{ | |||
static_cast<lite::Network*>(network), [](void*) {}}; | |||
lite::Runtime::set_runtime_thread_affinity( | |||
network_shared, std::move(thread_affinity_callback)); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_set_memory_allocator(LiteNetwork network, | |||
const LiteAllocate allocate_fun, | |||
const LiteFree free_fun) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network && allocate_fun && free_fun, | |||
"The ptr pass to LITE api is null"); | |||
std::shared_ptr<lite::Network> network_shared{ | |||
static_cast<lite::Network*>(network), [](void*) {}}; | |||
lite::Runtime::set_memory_allocator( | |||
network_shared, | |||
std::make_shared<UserAllocator>(allocate_fun, free_fun)); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_enable_io_txt_dump(LiteNetwork network, const char* io_txt_out_file) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
std::shared_ptr<lite::Network> network_shared{ | |||
static_cast<lite::Network*>(network), [](void*) {}}; | |||
lite::Runtime::enable_io_txt_dump(network_shared, io_txt_out_file); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_enable_io_bin_dump(LiteNetwork network, const char* io_bin_out_dir) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(network, "The network pass to LITE api is null"); | |||
std::shared_ptr<lite::Network> network_shared{ | |||
static_cast<lite::Network*>(network), [](void*) {}}; | |||
lite::Runtime::enable_io_bin_dump(network_shared, io_bin_out_dir); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_shared_weight_with_network(LiteNetwork dst_network, | |||
const LiteNetwork src_network) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(dst_network && src_network, | |||
"The network pass to LITE api is null"); | |||
const std::shared_ptr<lite::Network> src_shared_net{ | |||
static_cast<lite::Network*>(src_network), [](void*) {}}; | |||
std::shared_ptr<lite::Network> dst_shared_net{ | |||
static_cast<lite::Network*>(dst_network), [](void*) {}}; | |||
lite::Runtime::shared_weight_with_network(dst_shared_net, src_shared_net); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_share_runtime_memroy(LiteNetwork dst_network, | |||
LiteNetwork src_network) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(src_network && dst_network, | |||
"The network pass to LITE api is null"); | |||
std::shared_ptr<lite::Network> src_shared{ | |||
static_cast<lite::Network*>(src_network), [](void*) {}}; | |||
std::shared_ptr<lite::Network> dst_shared{ | |||
static_cast<lite::Network*>(dst_network), [](void*) {}}; | |||
lite::Runtime::share_runtime_memory_with(dst_shared, src_shared); | |||
LITE_CAPI_END(); | |||
} | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,257 @@ | |||
/** | |||
* \file lite-c/src/tensor.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "lite/tensor.h" | |||
#include "../../src/tensor_impl_base.h" | |||
#include "common.h" | |||
#include "lite-c/tensor_c.h" | |||
#include <set> | |||
#include <string> | |||
#include <unordered_map> | |||
const LiteLayout default_layout = {.shapes = {0, 0, 0, 0, 0}, | |||
.ndim = 0, | |||
.data_type = LiteDataType::LITE_FLOAT}; | |||
const LiteTensorDesc default_desc = {.is_pinned_host = false, | |||
.layout = default_layout, | |||
.device_type = LiteDeviceType::LITE_CPU, | |||
.device_id = 0}; | |||
namespace { | |||
std::unordered_map<void*, std::shared_ptr<lite::Tensor>>& | |||
get_global_tensor_holder() { | |||
static thread_local std::unordered_map<void*, std::shared_ptr<lite::Tensor>> | |||
global_holder; | |||
return global_holder; | |||
} | |||
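//! NOTE: the tensor holder above keeps alive the tensors created through
//! LITE_make_tensor, LITE_tensor_slice and LITE_tensor_concat on this thread;
//! such tensors stay valid until LITE_destroy_tensor is called with the same
//! handle from the same thread.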
std::unordered_map<std::string, lite::LiteAny>& | |||
get_global_tensor_attr_holder() { | |||
static thread_local std::unordered_map<std::string, lite::LiteAny> | |||
global_holder; | |||
return global_holder; | |||
} | |||
} // namespace | |||
//! convert the lite::Layout to Layout | |||
LiteLayout convert_to_clayout(const lite::Layout& layout) { | |||
LiteLayout clayout; | |||
clayout.ndim = layout.ndim; | |||
LITE_ASSERT(layout.ndim < LAYOUT_MAX_DIM, "layout ndim is to large"); | |||
for (size_t i = 0; i < layout.ndim; i++) { | |||
clayout.shapes[i] = layout.shapes[i]; | |||
} | |||
clayout.data_type = layout.data_type; | |||
return clayout; | |||
} | |||
//! convert the C Layout to lite::Layout | |||
lite::Layout convert_to_layout(const LiteLayout& clayout) { | |||
lite::Layout layout; | |||
layout.ndim = clayout.ndim; | |||
LITE_ASSERT(layout.ndim < LAYOUT_MAX_DIM, "clayout ndim is to large"); | |||
for (size_t i = 0; i < layout.ndim; i++) { | |||
layout.shapes[i] = clayout.shapes[i]; | |||
} | |||
layout.data_type = clayout.data_type; | |||
return layout; | |||
} | |||
int LITE_make_tensor(const LiteTensorDesc tensor_describe, LiteTensor* tensor) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(tensor, "The tensor pass to LITE_make_tensor is null"); | |||
lite::Layout layout = convert_to_layout(tensor_describe.layout); | |||
auto lite_tensor = std::make_shared<lite::Tensor>( | |||
tensor_describe.device_id, tensor_describe.device_type, layout, | |||
tensor_describe.is_pinned_host); | |||
get_global_tensor_holder()[lite_tensor.get()] = lite_tensor; | |||
*tensor = lite_tensor.get(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_destroy_tensor(LiteTensor tensor) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
get_global_tensor_holder().erase(tensor); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_set_tensor_layout(LiteTensor tensor, const LiteLayout layout) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
auto tensor_ptr = static_cast<lite::Tensor*>(tensor); | |||
tensor_ptr->set_layout(convert_to_layout(layout)); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_reset_tensor_memory(LiteTensor tensor, void* prepared_data, | |||
size_t data_length_in_byte) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
LITE_ASSERT(prepared_data, "The prepared_data pass to LITE c_api is null"); | |||
static_cast<lite::Tensor*>(tensor)->reset(prepared_data, | |||
data_length_in_byte); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_reset_tensor(LiteTensor tensor, const LiteLayout layout, | |||
void* prepared_data) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
LITE_ASSERT(prepared_data, "The prepared_data pass to LITE c_api is null"); | |||
static_cast<lite::Tensor*>(tensor)->reset(prepared_data, | |||
convert_to_layout(layout)); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_tensor_reshape(LiteTensor tensor, const int* shape, int size) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(tensor && shape, "The tensor pass to LITE c_api is null"); | |||
std::vector<int> shapes; | |||
for (int i = 0; i < size; i++) { | |||
shapes.push_back(shape[i]); | |||
} | |||
static_cast<lite::Tensor*>(tensor)->reshape(shapes); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_tensor_slice(const LiteTensor tensor, const size_t* start, | |||
const size_t* end, const size_t* step, size_t size, | |||
LiteTensor* slice_tensor) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(tensor && start && end && slice_tensor, | |||
"The tensor pass to LITE c_api is null"); | |||
std::vector<size_t> starts, ends, steps; | |||
for (size_t i = 0; i < size; i++) { | |||
starts.push_back(start[i]); | |||
ends.push_back(end[i]); | |||
if (step) { | |||
steps.push_back(step[i]); | |||
} | |||
} | |||
auto ret_tensor = | |||
static_cast<lite::Tensor*>(tensor)->slice(starts, ends, steps); | |||
get_global_tensor_holder()[ret_tensor.get()] = ret_tensor; | |||
*slice_tensor = ret_tensor.get(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_tensor_fill_zero(LiteTensor tensor) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
static_cast<lite::Tensor*>(tensor)->fill_zero(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_tensor_copy(LiteTensor dst_tensor, const LiteTensor src_tensor) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(dst_tensor && src_tensor, | |||
"The tensor pass to LITE c_api is null"); | |||
static_cast<lite::Tensor*>(dst_tensor) | |||
->copy_from(*static_cast<lite::Tensor*>(src_tensor)); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_tensor_share_memory_with(LiteTensor dst_tensor, | |||
const LiteTensor src_tensor) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(dst_tensor && src_tensor, | |||
"The tensor pass to LITE c_api is null"); | |||
static_cast<lite::Tensor*>(dst_tensor) | |||
->share_memory_with(*static_cast<lite::Tensor*>(src_tensor)); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_get_tensor_memory(const LiteTensor tensor, void** data) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
LITE_ASSERT(data, "The data ptr pass to LITE c_api is null"); | |||
*data = static_cast<lite::Tensor*>(tensor)->get_memory_ptr(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_get_tensor_memory_with_index(const LiteTensor tensor, | |||
const size_t* index, size_t size, | |||
void** data) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(tensor && index && data, | |||
"The tensor pass to LITE c_api is null"); | |||
std::vector<size_t> index_v; | |||
for (size_t i = 0; i < size; i++) { | |||
index_v.push_back(index[i]); | |||
} | |||
*data = static_cast<lite::Tensor*>(tensor)->get_memory_ptr(index_v); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_get_tensor_total_size_in_byte(const LiteTensor tensor, size_t* size) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
LITE_ASSERT(size, "The size ptr pass to LITE c_api is null"); | |||
*size = static_cast<lite::Tensor*>(tensor)->get_tensor_total_size_in_byte(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_get_tensor_layout(const LiteTensor tensor, LiteLayout* layout) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
LITE_ASSERT(layout, "The layout ptr pass to LITE c_api is null"); | |||
*layout = convert_to_clayout( | |||
static_cast<lite::Tensor*>(tensor)->get_layout()); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_get_tensor_device_type(const LiteTensor tensor, | |||
LiteDeviceType* device_type) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
LITE_ASSERT(device_type, "The device ptr pass to LITE c_api is null"); | |||
*device_type = static_cast<lite::Tensor*>(tensor)->get_device_type(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_get_tensor_device_id(const LiteTensor tensor, int* device_id) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(tensor && device_id, "The tensor pass to LITE c_api is null"); | |||
*device_id = static_cast<lite::Tensor*>(tensor)->get_device_id(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_is_pinned_host(const LiteTensor tensor, int* is_pinned_host) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
LITE_ASSERT(is_pinned_host, | |||
"The is_pinned_host ptr pass to LITE c_api is null"); | |||
*is_pinned_host = static_cast<lite::Tensor*>(tensor)->is_pinned_host(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_is_memory_continue(const LiteTensor tensor, int* is_continue) { | |||
LITE_CAPI_BEGIN(); | |||
LITE_ASSERT(tensor, "The tensor pass to LITE c_api is null"); | |||
LITE_ASSERT(is_continue, "The is_continue ptr pass to LITE c_api is null"); | |||
*is_continue = static_cast<lite::Tensor*>(tensor)->is_continue_memory(); | |||
LITE_CAPI_END(); | |||
} | |||
int LITE_tensor_concat(LiteTensor* tensors, int nr_tensor, int dim, | |||
LiteDeviceType dst_device, int device_id, | |||
LiteTensor* result_tensor) { | |||
LITE_CAPI_BEGIN(); | |||
std::vector<lite::Tensor> v_tensors; | |||
for (int i = 0; i < nr_tensor; i++) { | |||
v_tensors.push_back(*static_cast<lite::Tensor*>(tensors[i])); | |||
} | |||
auto tensor = | |||
lite::TensorUtils::concat(v_tensors, dim, dst_device, device_id); | |||
get_global_tensor_holder()[tensor.get()] = tensor; | |||
*result_tensor = tensor.get(); | |||
LITE_CAPI_END() | |||
} | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,12 @@ | |||
# -*- coding: utf-8 -*- | |||
# This file is part of MegEngine, a deep learning framework developed by | |||
# Megvii. | |||
# | |||
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
from .base import * | |||
from .global_setting import * | |||
from .network import * | |||
from .struct import * | |||
from .tensor import * | |||
from .utils import * |
@@ -0,0 +1,152 @@ | |||
# -*- coding: utf-8 -*- | |||
# This file is part of MegEngine, a deep learning framework developed by | |||
# Megvii. | |||
# | |||
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
import ctypes | |||
import glob | |||
import logging | |||
import os | |||
import sys | |||
from ctypes import * | |||
if sys.platform == "win32": | |||
lib_path = os.path.join(os.path.dirname(__file__), "libs") | |||
dll_paths = list(filter(os.path.exists, [lib_path,])) | |||
assert len(dll_paths) > 0 | |||
kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True) | |||
has_load_library_attr = hasattr(kernel32, "AddDllDirectory") | |||
old_error_mode = kernel32.SetErrorMode(0x0001) | |||
kernel32.LoadLibraryW.restype = ctypes.c_void_p | |||
if has_load_library_attr: | |||
kernel32.AddDllDirectory.restype = ctypes.c_void_p | |||
kernel32.LoadLibraryExW.restype = ctypes.c_void_p | |||
for dll_path in dll_paths: | |||
if sys.version_info >= (3, 8): | |||
os.add_dll_directory(dll_path) | |||
elif has_load_library_attr: | |||
res = kernel32.AddDllDirectory(dll_path) | |||
if res is None: | |||
err = ctypes.WinError(ctypes.get_last_error()) | |||
err.strerror += ' Error adding "{}" to the DLL search PATH.'.format( | |||
dll_path | |||
) | |||
raise err | |||
else: | |||
print("WARN: python or OS env have some issue, may load DLL failed!!!") | |||
dlls = glob.glob(os.path.join(lib_path, "*.dll")) | |||
path_patched = False | |||
for dll in dlls: | |||
is_loaded = False | |||
if has_load_library_attr: | |||
res = kernel32.LoadLibraryExW(dll, None, 0x00001100) | |||
last_error = ctypes.get_last_error() | |||
if res is None and last_error != 126: | |||
err = ctypes.WinError(last_error) | |||
err.strerror += ' Error loading "{}" or one of its dependencies.'.format( | |||
dll | |||
) | |||
raise err | |||
elif res is not None: | |||
is_loaded = True | |||
if not is_loaded: | |||
if not path_patched: | |||
os.environ["PATH"] = ";".join(dll_paths + [os.environ["PATH"]]) | |||
path_patched = True | |||
res = kernel32.LoadLibraryW(dll) | |||
if res is None: | |||
err = ctypes.WinError(ctypes.get_last_error()) | |||
err.strerror += ' Error loading "{}" or one of its dependencies.'.format( | |||
dll | |||
) | |||
raise err | |||
kernel32.SetErrorMode(old_error_mode) | |||
class _LiteCLib: | |||
def __init__(self): | |||
cwd = os.getcwd() | |||
package_dir = os.path.dirname(os.path.realpath(__file__)) | |||
debug_path = os.getenv("LITE_LIB_PATH") | |||
os.chdir(package_dir) | |||
lite_libs = glob.glob("libs/liblite*") | |||
os.chdir(cwd) | |||
if debug_path is None: | |||
assert len(lite_libs) == 1 | |||
self._lib = CDLL(os.path.join(package_dir, lite_libs[0])) | |||
else: | |||
self._lib = CDLL(debug_path) | |||
self._register_api( | |||
"LITE_get_version", [POINTER(c_int), POINTER(c_int), POINTER(c_int)] | |||
) | |||
self.lib.LITE_get_version.restype = None | |||
self._register_api("LITE_set_log_level", [c_int]) | |||
self._register_api("LITE_get_log_level", []) | |||
self._register_api("LITE_get_last_error", [], False) | |||
self.lib.LITE_get_last_error.restype = c_char_p | |||
def _errcheck(self, result, func, args): | |||
if result: | |||
error = self.lib.LITE_get_last_error() | |||
msg = error.decode("utf-8") | |||
logging.error("{}".format(msg)) | |||
raise RuntimeError("{}".format(msg)) | |||
return result | |||
def _register_api(self, api_name, arg_types, error_check=True): | |||
func = getattr(self.lib, api_name) | |||
func.argtypes = arg_types | |||
func.restype = c_int | |||
if error_check: | |||
func.errcheck = self._errcheck | |||
@property | |||
def lib(self): | |||
return self._lib | |||
@property | |||
def version(self): | |||
major = c_int() | |||
minor = c_int() | |||
patch = c_int() | |||
self.lib.LITE_get_version(byref(major), byref(minor), byref(patch)) | |||
return "{}.{}.{}".format(major.value, minor.value, patch.value) | |||
def set_log_level(self, level): | |||
self.lib.LITE_set_log_level(level) | |||
    def get_log_level(self):
        level = c_int()
        self.lib.LITE_get_log_level(byref(level))
        return level.value
_lib = _LiteCLib() | |||
version = _lib.version | |||
set_log_level = _lib.set_log_level | |||
get_log_level = _lib.get_log_level | |||
_Cnetwork = c_void_p | |||
_Ctensor = c_void_p | |||
class _LiteCObjMetaClass(type): | |||
"""metaclass for lite object""" | |||
def __new__(cls, name, bases, attrs): | |||
for api in attrs["_api_"]: | |||
_lib._register_api(*api) | |||
del attrs["_api_"] | |||
attrs["_lib"] = _lib.lib | |||
return super().__new__(cls, name, bases, attrs) | |||
class _LiteCObjBase(metaclass=_LiteCObjMetaClass): | |||
_api_ = [] |
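# A minimal sketch of how the metaclass above is meant to be used (illustrative
# only; "_FooAPI" is a made-up name, the real API tables live in
# global_setting.py, network.py and tensor.py):
#
#     class _FooAPI(_LiteCObjBase):
#         # each entry is registered on the shared CDLL at class-creation time
#         _api_ = [("LITE_set_log_level", [c_int])]
#
#     _FooAPI()._lib.LITE_set_log_level(0)    # call through the shared CDLL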
@@ -0,0 +1,120 @@ | |||
# -*- coding: utf-8 -*- | |||
# This file is part of MegEngine, a deep learning framework developed by | |||
# Megvii. | |||
# | |||
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
from ctypes import * | |||
import numpy as np | |||
from .base import _Ctensor, _lib, _LiteCObjBase | |||
from .network import * | |||
from .struct import LiteDataType, LiteDeviceType, LiteIOType, Structure | |||
from .tensor import * | |||
LiteDecryptionFunc = CFUNCTYPE( | |||
c_size_t, c_void_p, c_size_t, POINTER(c_uint8), c_size_t, c_void_p | |||
) | |||
class _GlobalAPI(_LiteCObjBase): | |||
""" | |||
get the api from the lib | |||
""" | |||
_api_ = [ | |||
("LITE_get_device_count", [c_int, POINTER(c_size_t)]), | |||
("LITE_try_coalesce_all_free_memory", []), | |||
( | |||
"LITE_register_decryption_and_key", | |||
[c_char_p, LiteDecryptionFunc, POINTER(c_uint8), c_size_t], | |||
), | |||
( | |||
"LITE_update_decryption_or_key", | |||
[c_char_p, c_void_p, POINTER(c_uint8), c_size_t], | |||
), | |||
("LITE_set_loader_lib_path", [c_char_p]), | |||
("LITE_set_persistent_cache", [c_char_p, c_int]), | |||
        ("LITE_set_tensor_rt_cache", [c_char_p]),
("LITE_dump_persistent_cache", [c_char_p]), | |||
("LITE_dump_tensor_rt_cache", [c_char_p]), | |||
] | |||
def decryption_func(func): | |||
"""the decryption function decorator | |||
    :type func: a function accepting three arrays, in_arr, key_arr and out_arr;
        if out_arr is None, it should just return the output length in bytes
        (see the commented sketch after this decorator)
""" | |||
@CFUNCTYPE(c_size_t, c_void_p, c_size_t, POINTER(c_uint8), c_size_t, c_void_p) | |||
def wrapper(c_in_data, in_length, c_key_data, key_length, c_out_data): | |||
in_arr = np.frombuffer(c_in_data, dtype=np.uint8, count=in_length) | |||
key_arr = np.frombuffer(c_key_data, dtype=np.uint8, count=key_length) | |||
if c_out_data: | |||
            out_length = func(in_arr, key_arr, None)
out_arr = np.frombuffer(c_out_data, dtype=np.uint8, count=out_length) | |||
return func(in_arr, key_arr, out_arr) | |||
# just query the output length | |||
else: | |||
return func(in_arr, key_arr, None) | |||
return wrapper | |||
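# A minimal sketch of a user decryption hook built with the decorator above
# (illustrative only; "xor_decrypt" and the single-byte XOR scheme are made up):
#
#     @decryption_func
#     def xor_decrypt(in_arr, key_arr, out_arr):
#         if out_arr is None:                  # length-query pass
#             return in_arr.size
#         out_arr[:] = np.bitwise_xor(in_arr, key_arr[0])
#         return out_arr.size
#
# The resulting callback is registered through
# LiteGlobal.register_decryption_and_key(); see the sketch at the end of this
# file.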
class LiteGlobal(object): | |||
""" | |||
some global config in lite | |||
""" | |||
_api = _GlobalAPI()._lib | |||
@staticmethod | |||
def register_decryption_and_key(decryption_name, decryption_func, key): | |||
c_name = c_char_p(decryption_name.encode("utf-8")) | |||
key_length = len(key) | |||
c_key = (c_uint8 * key_length)(*key) | |||
LiteGlobal._api.LITE_register_decryption_and_key( | |||
c_name, decryption_func, c_key, key_length | |||
) | |||
@staticmethod | |||
def update_decryption_key(decryption_name, key): | |||
c_name = c_char_p(decryption_name.encode("utf-8")) | |||
key_length = len(key) | |||
c_key = (c_uint8 * key_length)(*key) | |||
LiteGlobal._api.LITE_update_decryption_or_key(c_name, None, c_key, key_length) | |||
@staticmethod | |||
def set_loader_lib_path(path): | |||
c_path = c_char_p(path.encode("utf-8")) | |||
LiteGlobal._api.LITE_set_loader_lib_path(c_path) | |||
@staticmethod | |||
def set_persistent_cache(path, always_sync=False): | |||
c_path = c_char_p(path.encode("utf-8")) | |||
LiteGlobal._api.LITE_set_persistent_cache(c_path, always_sync) | |||
@staticmethod | |||
def set_tensorrt_cache(path): | |||
c_path = c_char_p(path.encode("utf-8")) | |||
        LiteGlobal._api.LITE_set_tensor_rt_cache(c_path)
@staticmethod | |||
def dump_persistent_cache(path): | |||
c_path = c_char_p(path.encode("utf-8")) | |||
LiteGlobal._api.LITE_dump_persistent_cache(c_path) | |||
@staticmethod | |||
def dump_tensorrt_cache(): | |||
        LiteGlobal._api.LITE_dump_tensor_rt_cache()
@staticmethod | |||
def get_device_count(device_type): | |||
count = c_size_t() | |||
LiteGlobal._api.LITE_get_device_count(device_type, byref(count)) | |||
return count.value | |||
@staticmethod | |||
def try_coalesce_all_free_memory(): | |||
LiteGlobal._api.LITE_try_coalesce_all_free_memory() |
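# A minimal sketch of the global configuration calls above (illustrative only;
# the cryption name, key bytes and cache path are placeholders chosen by the
# caller, and "xor_decrypt" refers to the commented sketch next to
# decryption_func):
#
#     LiteGlobal.register_decryption_and_key("xor-demo", xor_decrypt, [0x5A])
#     LiteGlobal.set_persistent_cache("algo_cache.bin")
#     print(LiteGlobal.get_device_count(LiteDeviceType.LITE_CPU))
#     LiteGlobal.dump_persistent_cache("algo_cache.bin")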
@@ -0,0 +1,531 @@ | |||
# -*- coding: utf-8 -*- | |||
# This file is part of MegEngine, a deep learning framework developed by | |||
# Megvii. | |||
# | |||
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
from ctypes import * | |||
import numpy as np | |||
from .base import _Cnetwork, _Ctensor, _lib, _LiteCObjBase | |||
from .struct import * | |||
from .tensor import * | |||
class LiteOptions(Structure): | |||
""" | |||
the inference options will be used to config a network | |||
""" | |||
_fields_ = [ | |||
("weight_preprocess", c_int), | |||
("fuse_preprocess", c_int), | |||
("fake_next_exec", c_int), | |||
("var_sanity_check_first_run", c_int), | |||
("const_shape", c_int), | |||
("force_dynamic_alloc", c_int), | |||
("force_output_dynamic_alloc", c_int), | |||
("no_profiling_on_shape_change", c_int), | |||
("jit_level", c_int), | |||
("comp_node_seq_record_level", c_int), | |||
("graph_opt_level", c_int), | |||
("async_exec_level", c_int), | |||
# layout transform options | |||
("enable_nchw44", c_int), | |||
("enable_nchw44_dot", c_int), | |||
("enable_nchw88", c_int), | |||
("enable_nhwcd4", c_int), | |||
("enable_nchw4", c_int), | |||
("enable_nchw32", c_int), | |||
("enable_nchw64", c_int), | |||
] | |||
def __init__(self): | |||
self.weight_preprocess = False | |||
self.fuse_preprocess = False | |||
self.fake_next_exec = False | |||
self.var_sanity_check_first_run = True | |||
self.const_shape = False | |||
self.force_dynamic_alloc = False | |||
self.force_output_dynamic_alloc = False | |||
self.no_profiling_on_shape_change = False | |||
self.jit_level = 0 | |||
self.comp_node_seq_record_level = 0 | |||
self.graph_opt_level = 2 | |||
self.async_exec_level = 1 | |||
def __repr__(self): | |||
data = { | |||
"weight_preprocess": bool(self.weight_preprocess), | |||
"fuse_preprocess": bool(self.fuse_preprocess), | |||
"fake_next_exec": bool(self.fake_next_exec), | |||
"var_sanity_check_first_run": bool(self.var_sanity_check_first_run), | |||
"const_shape": bool(self.const_shape), | |||
"force_dynamic_alloc": bool(self.force_dynamic_alloc), | |||
"force_output_dynamic_alloc": bool(self.force_output_dynamic_alloc), | |||
"no_profiling_on_shape_change": bool(self.no_profiling_on_shape_change), | |||
"jit_level": self.jit_level, | |||
"comp_node_seq_record_level": self.comp_node_seq_record_level, | |||
"graph_opt_level": self.graph_opt_level, | |||
"async_exec_level": self.async_exec_level, | |||
} | |||
return data.__repr__() | |||
class LiteConfig(Structure): | |||
""" | |||
    Configuration used when loading and compiling the graph
    bare_model_cryption_name: the name of the cryption method used by a bare
    model; a bare model is one without extra model info packed inside
    use_loader_dynamic_param: when the model forwards with an NPU device
    loader, this flag marks whether the loader uses device input or output;
    set it to non-zero for device input or output, otherwise zero
    has_compression: flag whether the model is compressed; the compression
    method will be used when reading the model
""" | |||
_fields_ = [ | |||
("has_compression", c_int), | |||
("device_id", c_int), | |||
("device_type", c_int), | |||
("backend", c_int), | |||
("bare_model_cryption_name", c_char_p), | |||
("options", LiteOptions), | |||
] | |||
def __init__(self, device_type=LiteDeviceType.LITE_CPU, option=None): | |||
self.device_type = device_type | |||
if option: | |||
self.options = option | |||
else: | |||
self.options = LiteOptions() | |||
self.bare_model_cryption_name = c_char_p(b"") | |||
self.use_loader_dynamic_param = 0 | |||
self.has_compression = 0 | |||
self.backend = LiteBackend.LITE_DEFAULT | |||
def __repr__(self): | |||
data = { | |||
"has_compression": bool(self.has_compression), | |||
"device_id": LiteDeviceType(self.device_id), | |||
"device_type": LiteDeviceType(self.device_type), | |||
"backend": LiteBackend(self.backend), | |||
"bare_model_cryption_name": self.bare_model_cryption_name.decode("utf-8"), | |||
"options": self.options, | |||
} | |||
return data.__repr__() | |||
class LiteIO(Structure): | |||
""" | |||
    config an input or output item of the network
    name: the tensor name in the graph corresponding to the IO
    is_host: marks where the input tensor comes from and where the output is
    copied to; if is_host is true, the input comes from the host and the
    output is copied to the host, otherwise the device. Sometimes the input
    comes from the device and the output need not be copied to the host.
    Default is true.
    io_type: the IO type, either SHAPE or VALUE; when SHAPE is set, the input
    or output tensor value is invalid and only the shape is set. Default is
    VALUE.
    config_layout: the layout configured by the user; if another layout is set
    before forward or got after forward, this layout is bypassed. If no other
    layout is set before forward, this layout takes effect. If this layout is
    not set, the model forwards with its original layout. For outputs, it is
    used for checking.
""" | |||
_fields_ = [ | |||
("name", c_char_p), | |||
("is_host", c_int), | |||
("io_type", c_int), | |||
("config_layout", LiteLayout), | |||
] | |||
def __init__( | |||
self, name, is_host=True, io_type=LiteIOType.LITE_IO_VALUE, layout=None | |||
): | |||
if type(name) == str: | |||
self.name = c_char_p(name.encode("utf-8")) | |||
else: | |||
self.name = c_char_p(name) | |||
if layout: | |||
self.config_layout = layout | |||
else: | |||
self.config_layout = LiteLayout() | |||
self.is_host = is_host | |||
self.io_type = io_type | |||
def __repr__(self): | |||
data = { | |||
"name": self.name, | |||
"is_host": bool(self.is_host), | |||
"io_type": LiteIOType(self.io_type), | |||
"config_layout": self.config_layout, | |||
} | |||
return data.__repr__() | |||
def __hash__(self): | |||
return hash(self.name) | |||
class _LiteNetworkIO(Structure): | |||
""" | |||
    the input and output information used when loading the network
""" | |||
_fields_ = [ | |||
("inputs", POINTER(LiteIO)), | |||
("outputs", POINTER(LiteIO)), | |||
("input_size", c_size_t), | |||
("output_size", c_size_t), | |||
] | |||
def __init__(self): | |||
self.inputs = POINTER(LiteIO)() | |||
self.outputs = POINTER(LiteIO)() | |||
self.input_size = 0 | |||
self.output_size = 0 | |||
class LiteNetworkIO(object): | |||
""" | |||
    the input and output information for the user, used to construct a
    _LiteNetworkIO (see the sketch after this class)
""" | |||
def __init__(self): | |||
self.inputs = [] | |||
self.outputs = [] | |||
def add_input(self, input_io): | |||
assert isinstance(input_io, LiteIO) | |||
self.inputs.append(input_io) | |||
def add_output(self, output_io): | |||
assert isinstance(output_io, LiteIO) | |||
self.outputs.append(output_io) | |||
def _create_network_io(self): | |||
network_io = _LiteNetworkIO() | |||
length = 1 if len(self.inputs) == 0 else len(self.inputs) | |||
self.c_inputs = (LiteIO * length)(*self.inputs) | |||
length = 1 if len(self.outputs) == 0 else len(self.outputs) | |||
self.c_outputs = (LiteIO * length)(*self.outputs) | |||
network_io.inputs = pointer(self.c_inputs[0]) | |||
network_io.outputs = pointer(self.c_outputs[0]) | |||
network_io.input_size = len(self.inputs) | |||
network_io.output_size = len(self.outputs) | |||
return network_io | |||
def __repr__(self): | |||
data = {"inputs": list(self.inputs), "outputs": list(self.outputs)} | |||
return data.__repr__() | |||
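# A minimal sketch of building a LiteNetworkIO (illustrative only; "data" is an
# assumed input tensor name of the model, adjust it to the real graph):
#
#     io = LiteNetworkIO()
#     io.add_input(LiteIO("data", is_host=True))
#     net = LiteNetwork(config=LiteConfig(), io=io)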
LiteAsyncCallback = CFUNCTYPE(c_int) | |||
def start_finish_callback(func): | |||
@CFUNCTYPE(c_int, POINTER(LiteIO), POINTER(_Ctensor), c_size_t) | |||
def wrapper(c_ios, c_tensors, size): | |||
ios = {} | |||
for i in range(size): | |||
tensor = LiteTensor() | |||
tensor._tensor = c_tensors[i] | |||
tensor.update() | |||
io = c_ios[i] | |||
ios[io] = tensor | |||
return func(ios) | |||
return wrapper | |||
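# A minimal sketch of a start callback built with the decorator above
# (illustrative only; the callback receives a {LiteIO: LiteTensor} mapping and
# must return an int):
#
#     @start_finish_callback
#     def on_start(ios):
#         for io, tensor in ios.items():
#             print("input:", io.name)
#         return 0
#
#     # later, after the network is created: network.set_start_callback(on_start)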
class _NetworkAPI(_LiteCObjBase): | |||
""" | |||
get the network api from the lib | |||
""" | |||
_api_ = [ | |||
("LITE_make_default_network", [POINTER(_Cnetwork)]), | |||
("LITE_make_network", [POINTER(_Cnetwork), LiteConfig, _LiteNetworkIO]), | |||
("LITE_load_model_from_mem", [_Cnetwork, c_void_p, c_size_t]), | |||
("LITE_load_model_from_path", [_Cnetwork, c_char_p]), | |||
("LITE_shared_weight_with_network", [_Cnetwork, _Ctensor]), | |||
("LITE_destroy_network", [_Cnetwork]), | |||
("LITE_forward", [_Cnetwork]), | |||
("LITE_wait", [_Cnetwork]), | |||
("LITE_get_io_tensor", [_Cnetwork, c_char_p, c_int, POINTER(_Ctensor)]), | |||
("LITE_get_input_name", [_Cnetwork, c_size_t, POINTER(c_char_p)]), | |||
("LITE_get_output_name", [_Cnetwork, c_size_t, POINTER(c_char_p)]), | |||
("LITE_get_all_input_name", [_Cnetwork, POINTER(c_size_t), POINTER(c_char_p)]), | |||
("LITE_get_all_output_name", [_Cnetwork, POINTER(c_size_t), POINTER(c_char_p)]), | |||
("LITE_is_cpu_inplace_mode", [_Cnetwork, POINTER(c_int)]), | |||
("LITE_get_cpu_threads_number", [_Cnetwork, POINTER(c_size_t)]), | |||
("LITE_get_device_id", [_Cnetwork, POINTER(c_int)]), | |||
("LITE_set_device_id", [_Cnetwork, c_int]), | |||
("LITE_set_cpu_inplace_mode", [_Cnetwork]), | |||
("LITE_use_tensorrt", [_Cnetwork]), | |||
("LITE_set_cpu_threads_number", [_Cnetwork, c_size_t]), | |||
("LITE_set_stream_id", [_Cnetwork, c_int]), | |||
("LITE_get_stream_id", [_Cnetwork, POINTER(c_int)]), | |||
("LITE_set_network_algo_policy", [_Cnetwork, c_int]), | |||
("LITE_set_network_algo_fastrun_config", [_Cnetwork, c_int, c_int]), | |||
("LITE_set_network_algo_workspace_limit", [_Cnetwork, c_size_t]), | |||
("LITE_share_runtime_memroy", [_Cnetwork, _Cnetwork]), | |||
("LITE_enable_profile_performance", [_Cnetwork, c_char_p]), | |||
("LITE_enable_io_txt_dump", [_Cnetwork, c_char_p]), | |||
("LITE_enable_io_bin_dump", [_Cnetwork, c_char_p]), | |||
("LITE_set_async_callback", [_Cnetwork, LiteAsyncCallback]), | |||
("LITE_set_start_callback", [_Cnetwork]), | |||
("LITE_set_finish_callback", [_Cnetwork]), | |||
] | |||
class LiteNetwork(object): | |||
""" | |||
the network to load a model and forward | |||
""" | |||
_api = _NetworkAPI()._lib | |||
def __init__(self, config=None, io=None): | |||
""" | |||
create a network with config and networkio | |||
""" | |||
self._network = _Cnetwork() | |||
if config: | |||
self.config = config | |||
else: | |||
self.config = LiteConfig() | |||
if io: | |||
self.network_io = io | |||
else: | |||
self.network_io = LiteNetworkIO() | |||
c_network_io = self.network_io._create_network_io() | |||
self._api.LITE_make_network(byref(self._network), self.config, c_network_io) | |||
def __repr__(self): | |||
data = {"config": self.config, "IOs": self.network_io} | |||
return data.__repr__() | |||
def __del__(self): | |||
self._api.LITE_destroy_network(self._network) | |||
def load(self, path): | |||
c_path = c_char_p(path.encode("utf-8")) | |||
self._api.LITE_load_model_from_path(self._network, c_path) | |||
def forward(self): | |||
self._api.LITE_forward(self._network) | |||
def wait(self): | |||
self._api.LITE_wait(self._network) | |||
def is_cpu_inplace_mode(self): | |||
""" | |||
        whether the network runs in cpu inplace mode
""" | |||
inplace = c_int() | |||
self._api.LITE_is_cpu_inplace_mode(self._network, byref(inplace)) | |||
return bool(inplace.value) | |||
def enable_cpu_inplace_mode(self): | |||
""" | |||
        set cpu forward in inplace mode, in which cpu forward creates only one
        thread
Note: this must be set before the network loaded | |||
""" | |||
self._api.LITE_set_cpu_inplace_mode(self._network) | |||
def use_tensorrt(self): | |||
""" | |||
Note: this must be set before the network loaded | |||
""" | |||
self._api.LITE_use_tensorrt(self._network) | |||
@property | |||
def device_id(self): | |||
""" | |||
get the device id | |||
""" | |||
device_id = c_int() | |||
self._api.LITE_get_device_id(self._network, byref(device_id)) | |||
return device_id.value | |||
@device_id.setter | |||
def device_id(self, device_id): | |||
""" | |||
set the device id | |||
Note: this must be set before the network loaded | |||
""" | |||
self._api.LITE_set_device_id(self._network, device_id) | |||
@property | |||
def stream_id(self): | |||
""" | |||
get the stream id | |||
""" | |||
stream_id = c_int() | |||
self._api.LITE_get_stream_id(self._network, byref(stream_id)) | |||
return stream_id.value | |||
@stream_id.setter | |||
def stream_id(self, stream_id): | |||
""" | |||
set the stream id | |||
Note: this must be set before the network loaded | |||
""" | |||
self._api.LITE_set_stream_id(self._network, stream_id) | |||
@property | |||
def threads_number(self): | |||
""" | |||
        get the thread number of the network
""" | |||
nr_thread = c_size_t() | |||
self._api.LITE_get_cpu_threads_number(self._network, byref(nr_thread)) | |||
return nr_thread.value | |||
@threads_number.setter | |||
def threads_number(self, nr_threads): | |||
""" | |||
set the network forward in multithread mode, and the thread number | |||
Note: this must be set before the network loaded | |||
""" | |||
self._api.LITE_set_cpu_threads_number(self._network, nr_threads) | |||
def get_io_tensor(self, name, phase=LiteTensorPhase.LITE_IO): | |||
""" | |||
get input or output tensor by its name | |||
""" | |||
if type(name) == str: | |||
c_name = c_char_p(name.encode("utf-8")) | |||
else: | |||
c_name = c_char_p(name) | |||
tensor = LiteTensor() | |||
self._api.LITE_get_io_tensor( | |||
self._network, c_name, phase, byref(tensor._tensor) | |||
) | |||
tensor.update() | |||
return tensor | |||
def get_input_name(self, index): | |||
""" | |||
get the input name by the index in the network | |||
""" | |||
c_name = c_char_p() | |||
self._api.LITE_get_input_name(self._network, index, byref(c_name)) | |||
return c_name.value.decode("utf-8") | |||
def get_output_name(self, index): | |||
""" | |||
get the output name by the index in the network | |||
""" | |||
c_name = c_char_p() | |||
self._api.LITE_get_output_name(self._network, index, byref(c_name)) | |||
return c_name.value.decode("utf-8") | |||
def get_all_input_name(self): | |||
""" | |||
get all the input tensor name in the network | |||
""" | |||
nr_input = c_size_t() | |||
self._api.LITE_get_all_input_name(self._network, byref(nr_input), None) | |||
if nr_input.value > 0: | |||
names = (c_char_p * nr_input.value)() | |||
self._api.LITE_get_all_input_name(self._network, None, names) | |||
ret_name = [names[i].decode("utf-8") for i in range(nr_input.value)] | |||
return ret_name | |||
def get_all_output_name(self): | |||
""" | |||
get all the output tensor name in the network | |||
""" | |||
nr_output = c_size_t() | |||
self._api.LITE_get_all_output_name(self._network, byref(nr_output), None) | |||
if nr_output.value > 0: | |||
names = (c_char_p * nr_output.value)() | |||
self._api.LITE_get_all_output_name(self._network, None, names) | |||
ret_name = [names[i].decode("utf-8") for i in range(nr_output.value)] | |||
return ret_name | |||
def share_weights_with(self, src_network): | |||
""" | |||
share weights with the loaded network | |||
""" | |||
assert isinstance(src_network, LiteNetwork) | |||
self._api.LITE_shared_weight_with_network(self._network, src_network._network) | |||
def share_runtime_memroy(self, src_network): | |||
""" | |||
        share runtime memory with the source network
""" | |||
assert isinstance(src_network, LiteNetwork) | |||
self._api.LITE_share_runtime_memroy(self._network, src_network._network) | |||
def async_with_callback(self, async_callback): | |||
async_callback = LiteAsyncCallback(async_callback) | |||
self._api.LITE_set_async_callback(self._network, async_callback) | |||
def set_start_callback(self, start_callback): | |||
""" | |||
        when the network starts to forward, the start_callback will be called
        with a parameter mapping from LiteIO to the corresponding LiteTensor
""" | |||
self._api.LITE_set_start_callback(self._network, start_callback) | |||
def set_finish_callback(self, finish_callback): | |||
""" | |||
        when the network finishes forwarding, the finish_callback will be
        called with a parameter mapping from LiteIO to the corresponding
        LiteTensor
""" | |||
self._api.LITE_set_finish_callback(self._network, finish_callback) | |||
def enable_profile_performance(self, profile_file): | |||
c_file = profile_file.encode("utf-8") | |||
self._api.LITE_enable_profile_performance(self._network, c_file) | |||
def set_network_algo_workspace_limit(self, size_limit): | |||
self._api.LITE_set_network_algo_workspace_limit(self._network, size_limit) | |||
def set_network_algo_policy( | |||
self, policy, shared_batch_size=0, binary_equal_between_batch=False | |||
): | |||
""" | |||
        shared_batch_size: the batch size used by fastrun; a non-zero value
            means that fastrun uses this batch size regardless of the batch
            size of the model, while zero means fastrun uses the batch size
            of the model
        binary_equal_between_batch: if the content of each input batch is
            binary equal, whether the content of each output batch is
            promised to be equal as well
""" | |||
self._api.LITE_set_network_algo_policy(self._network, policy) | |||
self._api.LITE_set_network_algo_fastrun_config( | |||
self._network, shared_batch_size, binary_equal_between_batch | |||
) | |||
def io_txt_dump(self, txt_file): | |||
c_file = txt_file.encode("utf-8") | |||
self._api.LITE_enable_io_txt_dump(self._network, c_file) | |||
def io_bin_dump(self, bin_dir): | |||
c_dir = bin_dir.encode("utf-8") | |||
self._api.LITE_enable_io_bin_dump(self._network, c_dir) |
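# A minimal end-to-end sketch of the class above (illustrative only; the model
# path is a placeholder for a MegEngine Lite model supplied by the caller, and
# input data is expected to be filled through the LiteTensor API before a real
# forward):
if __name__ == "__main__":
    demo_net = LiteNetwork()
    demo_net.load("./model_placeholder.mge")
    print("inputs:", demo_net.get_all_input_name())
    print("outputs:", demo_net.get_all_output_name())
    demo_net.forward()
    demo_net.wait()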
@@ -0,0 +1,90 @@ | |||
# -*- coding: utf-8 -*- | |||
# This file is part of MegEngine, a deep learning framework developed by | |||
# Megvii. | |||
# | |||
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
import logging | |||
from ctypes import * | |||
from enum import Enum, IntEnum | |||
class LiteBackend(IntEnum): | |||
LITE_DEFAULT = 0 | |||
class LiteDeviceType(IntEnum): | |||
LITE_CPU = 0 | |||
LITE_CUDA = 1 | |||
LITE_ATLAS = 3 | |||
LITE_NPU = 4 | |||
LITE_DEVICE_DEFAULT = 5 | |||
class LiteDataType(IntEnum): | |||
LITE_FLOAT = 0 | |||
LITE_HALF = 1 | |||
LITE_INT = 2 | |||
LITE_INT16 = 3 | |||
LITE_INT8 = 4 | |||
LITE_UINT8 = 5 | |||
class LiteTensorPhase(IntEnum): | |||
LITE_IO = 0 | |||
LITE_INPUT = 1 | |||
LITE_OUTPUT = 2 | |||
class LiteIOType(IntEnum): | |||
""" | |||
the input and output type, including SHAPE and VALUE;
sometimes the user only needs the shape of the output tensor
""" | |||
LITE_IO_VALUE = 0 | |||
LITE_IO_SHAPE = 1 | |||
class LiteAlgoSelectStrategy(IntEnum): | |||
""" | |||
operation algorithm selection strategy type. Some operations have
multiple algorithms; different algorithms have different attributes, and according to
the strategy, the best algorithm will be selected.
Note: these strategies can be combined
LITE_ALGO_HEURISTIC | LITE_ALGO_PROFILE means: if the profile cache is not valid,
use heuristic instead
LITE_ALGO_HEURISTIC | LITE_ALGO_REPRODUCIBLE means: heuristic chooses the
reproducible algo
LITE_ALGO_PROFILE | LITE_ALGO_REPRODUCIBLE means: profile the best
algorithm from the reproducible algorithms set
LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED means: profile the best
algorithm from the optimized algorithms, thus profiling runs faster
LITE_ALGO_PROFILE | LITE_ALGO_OPTIMIZED | LITE_ALGO_REPRODUCIBLE means:
profile the best algorithm from the optimized and reproducible algorithms
""" | |||
LITE_ALGO_HEURISTIC = 1 | |||
LITE_ALGO_PROFILE = 2 | |||
LITE_ALGO_REPRODUCIBLE = 4 | |||
LITE_ALGO_OPTIMIZED = 8 | |||
class LiteLogLevel(IntEnum): | |||
""" | |||
DEBUG: The most verbose level, printing debugging info | |||
INFO: The default level | |||
WARN: Printing warnings | |||
ERROR: The least verbose level, printing errors only | |||
""" | |||
DEBUG = 0 | |||
INFO = 1 | |||
WARN = 2 | |||
ERROR = 3 |
@@ -0,0 +1,471 @@ | |||
# -*- coding: utf-8 -*- | |||
# This file is part of MegEngine, a deep learning framework developed by | |||
# Megvii. | |||
# | |||
# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
from ctypes import * | |||
import numpy as np | |||
from .base import _Ctensor, _lib, _LiteCObjBase | |||
from .struct import LiteDataType, LiteDeviceType, LiteIOType, Structure | |||
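# max number of dims a LiteLayout can describe (see the assert in LiteLayout.__init__)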
MAX_DIM = 7 | |||
_lite_type_to_nptypes = { | |||
LiteDataType.LITE_INT: np.int32, | |||
LiteDataType.LITE_FLOAT: np.float32, | |||
LiteDataType.LITE_UINT8: np.uint8, | |||
LiteDataType.LITE_INT8: np.int8, | |||
LiteDataType.LITE_INT16: np.int16, | |||
LiteDataType.LITE_HALF: np.float16, | |||
} | |||
_nptype_to_lite_type = {val: key for key, val in _lite_type_to_nptypes.items()} | |||
_str_nptypes_to_lite_nptypes = { | |||
np.dtype("int32"): LiteDataType.LITE_INT, | |||
np.dtype("float32"): LiteDataType.LITE_FLOAT, | |||
np.dtype("uint8"): LiteDataType.LITE_UINT8, | |||
np.dtype("int8"): LiteDataType.LITE_INT8, | |||
np.dtype("int16"): LiteDataType.LITE_INT16, | |||
np.dtype("float16"): LiteDataType.LITE_HALF, | |||
} | |||
ctype_to_lite_dtypes = { | |||
c_int: LiteDataType.LITE_INT, | |||
c_uint: LiteDataType.LITE_INT, | |||
c_float: LiteDataType.LITE_FLOAT, | |||
c_ubyte: LiteDataType.LITE_UINT8, | |||
c_byte: LiteDataType.LITE_INT8, | |||
c_short: LiteDataType.LITE_INT16, | |||
c_ushort: LiteDataType.LITE_INT16, | |||
} | |||
class LiteLayout(Structure): | |||
""" | |||
the simple layout description | |||
""" | |||
_fields_ = [ | |||
("shapes", c_size_t * MAX_DIM), | |||
("ndim", c_size_t), | |||
("data_type", c_int), | |||
] | |||
def __init__(self, shape=None, dtype=None): | |||
if shape: | |||
shape = list(shape) | |||
assert len(shape) <= MAX_DIM, "Layout max dim is 7." | |||
self.shapes = (c_size_t * MAX_DIM)(*shape) | |||
self.ndim = len(shape) | |||
else: | |||
self.shapes = (c_size_t * MAX_DIM)() | |||
self.ndim = 0 | |||
if not dtype: | |||
self.data_type = LiteDataType.LITE_FLOAT | |||
elif isinstance(dtype, LiteDataType): | |||
self.data_type = dtype | |||
elif type(dtype) == str: | |||
self.data_type = _str_nptypes_to_lite_nptypes[np.dtype(dtype)] | |||
elif isinstance(dtype, np.dtype): | |||
ctype = np.ctypeslib.as_ctypes_type(dtype) | |||
self.data_type = ctype_to_lite_dtypes[ctype] | |||
elif isinstance(dtype, type): | |||
self.data_type = _nptype_to_lite_type[dtype] | |||
else: | |||
raise RuntimeError("unknown data type")
def __repr__(self): | |||
data = { | |||
"shapes": list(self.shapes), | |||
"ndim": self.ndim, | |||
"data_type": _lite_type_to_nptypes[LiteDataType(self.data_type)], | |||
} | |||
return data.__repr__() | |||
class _LiteTensorDesc(Structure): | |||
""" | |||
wrapper of the MegEngine Tensor
:is_pinned_host: when set, the storage memory of the tensor is pinned memory;
this is used to optimize the H2D or D2H memory copy. If the device or layout
is not set, when copying from another device (CUDA) tensor, this tensor
will automatically be set to a pinned tensor
""" | |||
_fields_ = [ | |||
("is_pinned_host", c_int), | |||
("layout", LiteLayout), | |||
("device_type", c_int), | |||
("device_id", c_int), | |||
] | |||
def __init__(self): | |||
self.layout = LiteLayout() | |||
self.device_type = LiteDeviceType.LITE_CPU | |||
self.is_pinned_host = False | |||
self.device_id = 0 | |||
def __repr__(self): | |||
data = { | |||
"is_pinned_host": self.is_pinned_host, | |||
"layout": LiteLayout(self.layout), | |||
"device_type": LiteDeviceType(self.device_type.value), | |||
"device_id": self.device_id, | |||
} | |||
return data.__repr__() | |||
class _TensorAPI(_LiteCObjBase): | |||
""" | |||
get the api from the lib | |||
""" | |||
_api_ = [ | |||
("LITE_make_tensor", [_LiteTensorDesc, POINTER(_Ctensor)]), | |||
("LITE_set_tensor_layout", [_Ctensor, LiteLayout]), | |||
("LITE_reset_tensor_memory", [_Ctensor, c_void_p, c_size_t]), | |||
("LITE_reset_tensor", [_Ctensor, LiteLayout, c_void_p]), | |||
("LITE_tensor_reshape", [_Ctensor, POINTER(c_int), c_int]), | |||
( | |||
"LITE_tensor_slice", | |||
[ | |||
_Ctensor, | |||
POINTER(c_size_t), | |||
POINTER(c_size_t), | |||
POINTER(c_size_t), | |||
c_size_t, | |||
POINTER(_Ctensor), | |||
], | |||
), | |||
( | |||
"LITE_tensor_concat", | |||
[POINTER(_Ctensor), c_int, c_int, c_int, c_int, POINTER(_Ctensor),], | |||
), | |||
("LITE_tensor_fill_zero", [_Ctensor]), | |||
("LITE_tensor_copy", [_Ctensor, _Ctensor]), | |||
("LITE_tensor_share_memory_with", [_Ctensor, _Ctensor]), | |||
("LITE_get_tensor_memory", [_Ctensor, POINTER(c_void_p)]), | |||
("LITE_get_tensor_total_size_in_byte", [_Ctensor, POINTER(c_size_t)]), | |||
("LITE_get_tensor_layout", [_Ctensor, POINTER(LiteLayout)]), | |||
("LITE_get_tensor_device_type", [_Ctensor, POINTER(c_int)]), | |||
("LITE_get_tensor_device_id", [_Ctensor, POINTER(c_int)]), | |||
("LITE_destroy_tensor", [_Ctensor]), | |||
("LITE_is_pinned_host", [_Ctensor, POINTER(c_int)]), | |||
] | |||
class LiteTensor(object): | |||
""" | |||
the tensor to hold a block of data | |||
""" | |||
_api = _TensorAPI()._lib | |||
def __init__( | |||
self, | |||
layout=None, | |||
device_type=LiteDeviceType.LITE_CPU, | |||
device_id=0, | |||
is_pinned_host=False, | |||
): | |||
""" | |||
create a Tensor with layout, device, is_pinned_host param | |||
""" | |||
self._tensor = _Ctensor() | |||
if layout: | |||
self._layout = layout | |||
else: | |||
self._layout = LiteLayout() | |||
self._device_type = device_type | |||
self._device_id = device_id | |||
self._is_pinned_host = is_pinned_host | |||
tensor_desc = _LiteTensorDesc() | |||
tensor_desc.layout = self._layout | |||
tensor_desc.device_type = device_type | |||
tensor_desc.device_id = device_id | |||
tensor_desc.is_pinned_host = is_pinned_host | |||
self._api.LITE_make_tensor(tensor_desc, byref(self._tensor)) | |||
def __del__(self): | |||
self._api.LITE_destroy_tensor(self._tensor) | |||
def fill_zero(self): | |||
""" | |||
fill the buffer memory with zero | |||
""" | |||
self._api.LITE_tensor_fill_zero(self._tensor) | |||
self.update() | |||
def share_memory_with(self, src_tensor): | |||
""" | |||
share the same memory with the src_tensor; the original memory of this tensor will be freed
""" | |||
assert isinstance(src_tensor, LiteTensor) | |||
self._api.LITE_tensor_share_memory_with(self._tensor, src_tensor._tensor) | |||
self.update() | |||
@property | |||
def layout(self): | |||
self._api.LITE_get_tensor_layout(self._tensor, byref(self._layout)) | |||
return self._layout | |||
@layout.setter | |||
def layout(self, layout): | |||
assert isinstance(layout, LiteLayout) | |||
self._layout = layout | |||
self._api.LITE_set_tensor_layout(self._tensor, layout) | |||
@property | |||
def is_pinned_host(self): | |||
""" | |||
whether the tensor is a pinned host tensor
""" | |||
pinned = c_int() | |||
self._api.LITE_is_pinned_host(self._tensor, byref(pinned)) | |||
self._is_pinned_host = pinned | |||
return bool(self._is_pinned_host) | |||
@property | |||
def device_type(self): | |||
""" | |||
get device of the tensor | |||
""" | |||
device_type = c_int() | |||
self._api.LITE_get_tensor_device_type(self._tensor, byref(device_type)) | |||
self._device_type = device_type | |||
return LiteDeviceType(device_type.value) | |||
@property | |||
def device_id(self): | |||
""" | |||
get device id of the tensor | |||
""" | |||
device_id = c_int() | |||
self._api.LITE_get_tensor_device_id(self._tensor, byref(device_id)) | |||
self._device_id = device_id.value | |||
return device_id.value | |||
@property | |||
def is_continue(self): | |||
""" | |||
whether the tensor memory is contiguous
""" | |||
is_continue = c_int() | |||
self._api.LITE_is_memory_continue(self._tensor, byref(is_continue)) | |||
return bool(is_continue.value) | |||
@property | |||
def nbytes(self): | |||
""" | |||
get the length of the memory in bytes
""" | |||
self.update() | |||
length = c_size_t() | |||
self._api.LITE_get_tensor_total_size_in_byte(self._tensor, byref(length)) | |||
return length.value | |||
def update(self): | |||
""" | |||
update the members from C; this is used automatically after slice, share, etc.
""" | |||
pinned = c_int() | |||
self._api.LITE_is_pinned_host(self._tensor, byref(pinned)) | |||
self._is_pinned_host = pinned | |||
device_type = c_int() | |||
self._api.LITE_get_tensor_device_type(self._tensor, byref(device_type)) | |||
self._device_type = device_type | |||
self._api.LITE_get_tensor_layout(self._tensor, byref(self._layout)) | |||
def copy_from(self, src_tensor): | |||
""" | |||
copy memory from the src_tensor
""" | |||
assert isinstance(src_tensor, LiteTensor) | |||
self._api.LITE_tensor_copy(self._tensor, src_tensor._tensor) | |||
self.update() | |||
def reshape(self, shape): | |||
""" | |||
reshape the tensor without changing the data, only the shape is changed
:param shape: int array of the destination shape
""" | |||
shape = list(shape) | |||
length = len(shape) | |||
c_shape = (c_int * length)(*shape) | |||
self._api.LITE_tensor_reshape(self._tensor, c_shape, length) | |||
self.update() | |||
def slice(self, start, end, step=None): | |||
""" | |||
slice the tensor with the given start, end, step
:param start: slice begin index of each dim
:param end: slice end index of each dim
:param step: slice step of each dim
""" | |||
start = list(start) | |||
end = list(end) | |||
length = len(start) | |||
assert length == len(end), "slice with different length of start and end." | |||
if step: | |||
assert length == len(step), "slice with different length of start and step." | |||
step = list(step) | |||
else: | |||
step = [1 for i in range(length)] | |||
c_start = (c_size_t * length)(*start) | |||
c_end = (c_size_t * length)(*end) | |||
c_step = (c_size_t * length)(*step) | |||
slice_tensor = LiteTensor() | |||
self._api.LITE_tensor_slice( | |||
self._tensor, c_start, c_end, c_step, length, byref(slice_tensor._tensor) | |||
) | |||
slice_tensor.update() | |||
return slice_tensor | |||
def get_ctypes_memory(self): | |||
""" | |||
get the memory of the tensor, return c_void_p of the tensor memory | |||
""" | |||
self.update() | |||
mem = c_void_p() | |||
self._api.LITE_get_tensor_memory(self._tensor, byref(mem)) | |||
return mem | |||
def set_data_by_share(self, data, length=0, layout=None): | |||
""" | |||
share the data with the tensor
:param data: the data to be shared with the tensor, it should be a
numpy.ndarray or ctypes data
""" | |||
self.update() | |||
if isinstance(data, np.ndarray): | |||
assert ( | |||
self.is_continue | |||
), "set_data_by_share can only apply in continue tensor." | |||
assert ( | |||
self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU | |||
), "set_data_by_share can only apply in cpu tensor or pinned tensor." | |||
np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)] | |||
c_type = np.ctypeslib.as_ctypes_type(np_type) | |||
if self.nbytes != data.nbytes: | |||
self.layout = LiteLayout(data.shape, ctype_to_lite_dtypes[c_type]) | |||
self._shared_data = data | |||
data = data.ctypes.data_as(POINTER(c_type)) | |||
if layout is not None: | |||
self.layout = layout | |||
else: | |||
assert length == 0 or length == self.nbytes, "the data length is not match." | |||
self._api.LITE_reset_tensor_memory(self._tensor, data, self.nbytes) | |||
def set_data_by_copy(self, data, data_length=0, layout=None): | |||
""" | |||
copy the data to the tensor | |||
:param data: the data to copy to the tensor, it should be a list,
a numpy.ndarray or ctypes data with a length
""" | |||
self.update() | |||
if layout is not None: | |||
self.layout = layout | |||
assert self.is_continue, "set_data_by_copy can only be applied to a contiguous tensor."
assert ( | |||
self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU | |||
), "set_data_by_copy can only apply in cpu tensor or pinned tensor." | |||
np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)] | |||
c_type = np.ctypeslib.as_ctypes_type(np_type) | |||
tensor_memory = c_void_p() | |||
if type(data) == list: | |||
length = len(data) | |||
self._api.LITE_get_tensor_memory(self._tensor, byref(tensor_memory)) | |||
tensor_length = self.nbytes | |||
assert ( | |||
length * sizeof(c_type) <= tensor_length | |||
), "the length of input data to set to the tensor is too large." | |||
arr = (c_type * length)(*data) | |||
memmove(tensor_memory, arr, sizeof(c_type) * length) | |||
elif type(data) == np.ndarray: | |||
if self.nbytes != data.nbytes: | |||
self.layout = LiteLayout(data.shape, data.dtype) | |||
arr = data.ctypes.data_as(POINTER(c_type)) | |||
self._api.LITE_get_tensor_memory(self._tensor, byref(tensor_memory)) | |||
assert self.nbytes == data.nbytes | |||
memmove(tensor_memory, arr, self.nbytes) | |||
else: | |||
assert ( | |||
data_length == self.nbytes or layout is not None | |||
), "when input data is ctypes, the length of input data or layout must set" | |||
self._api.LITE_get_tensor_memory(self._tensor, byref(tensor_memory)) | |||
memmove(tensor_memory, data, data_length) | |||
def to_numpy(self): | |||
""" | |||
copy the tensor data into a numpy array and return it
""" | |||
self.update() | |||
if self.nbytes <= 0: | |||
return np.array([]) | |||
if self.is_continue and ( | |||
self.is_pinned_host or self.device_type == LiteDeviceType.LITE_CPU | |||
): | |||
ptr = c_void_p() | |||
self._api.LITE_get_tensor_memory(self._tensor, byref(ptr)) | |||
np_type = _lite_type_to_nptypes[LiteDataType(self._layout.data_type)] | |||
shape = [self._layout.shapes[i] for i in range(self._layout.ndim)] | |||
np_arr = np.zeros(shape, np_type) | |||
if np_arr.nbytes: | |||
memmove(np_arr.ctypes.data_as(c_void_p), ptr, np_arr.nbytes) | |||
return np_arr | |||
else: | |||
tmp_tensor = LiteTensor(self.layout) | |||
tmp_tensor.copy_from(self) | |||
return tmp_tensor.to_numpy() | |||
def __repr__(self): | |||
self.update() | |||
data = { | |||
"layout": self._layout, | |||
"device_type": LiteDeviceType(self._device_type.value), | |||
"device_id": int(self.device_id), | |||
"is_pinned_host": bool(self._is_pinned_host), | |||
} | |||
return data.__repr__() | |||
def LiteTensorConcat( | |||
tensors, dim, device_type=LiteDeviceType.LITE_DEVICE_DEFAULT, device_id=-1 | |||
): | |||
""" | |||
concatenate the input tensors along the given dim into one tensor
dim : the dim along which to concatenate
device_type: the result tensor device type | |||
device_id: the result tensor device id | |||
""" | |||
api = _TensorAPI()._lib | |||
length = len(tensors) | |||
c_tensors = [t._tensor for t in tensors] | |||
c_tensors = (_Ctensor * length)(*c_tensors) | |||
result_tensor = LiteTensor() | |||
api.LITE_tensor_concat( | |||
cast(byref(c_tensors), POINTER(c_void_p)), | |||
length, | |||
dim, | |||
device_type, | |||
device_id, | |||
byref(result_tensor._tensor), | |||
) | |||
result_tensor.update() | |||
return result_tensor |
@@ -0,0 +1,122 @@ | |||
# -*- coding: utf-8 -*- | |||
# This file is part of MegEngine, a deep learning framework developed by | |||
# Megvii. | |||
# | |||
# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
import threading | |||
import numpy as np | |||
from .base import * | |||
from .struct import * | |||
from .tensor import * | |||
class TensorBatchCollector: | |||
""" | |||
a tensor utility that collects sub-tensors into one batch-contiguous tensor
""" | |||
def __init__( | |||
self, | |||
shape, | |||
dtype=LiteDataType.LITE_INT8, | |||
device_type=LiteDeviceType.LITE_CUDA, | |||
device_id=0, | |||
is_pinned_host=False, | |||
tensor=None, | |||
): | |||
self._mutex = threading.Lock() | |||
self.dev_type = device_type | |||
self.is_pinned_host = is_pinned_host | |||
self.dev_id = device_id
self.shape = shape | |||
self.dtype = LiteLayout(dtype=dtype).data_type | |||
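# indices of batches that have not been filled yet; collect() pops from
# this list and free() returns indices back to it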
self._free_list = list(range(self.shape[0])) | |||
if tensor is not None: | |||
assert ( | |||
tensor.layout.shapes[0 : tensor.layout.ndim] == shape | |||
), "The tensor set to TensorBatchCollector is not right." | |||
self._tensor = tensor | |||
self.dtype = tensor.layout.data_type | |||
self.dev_type = tensor.device_type
self.dev_id = tensor.device_id
else: | |||
self._tensor = LiteTensor( | |||
LiteLayout(shape, dtype), device_type, device_id, is_pinned_host | |||
) | |||
def collect_id(self, array, batch_id): | |||
if isinstance(array, np.ndarray): | |||
shape = array.shape | |||
assert list(shape) == self.shape[1:] | |||
in_dtype = ctype_to_lite_dtypes[np.ctypeslib.as_ctypes_type(array.dtype)] | |||
assert in_dtype == self.dtype | |||
# get the batch index | |||
with self._mutex: | |||
if batch_id in self._free_list: | |||
self._free_list.remove(batch_id) | |||
# get the subtensor | |||
subtensor = self._tensor.slice([batch_id], [batch_id + 1]) | |||
if subtensor.device_type == LiteDeviceType.LITE_CPU: | |||
subtensor.set_data_by_copy(array) | |||
else: | |||
pinned_tensor = LiteTensor( | |||
subtensor.layout, self.dev_type, self.dev_id, True | |||
) | |||
pinned_tensor.set_data_by_share(array) | |||
subtensor.copy_from(pinned_tensor) | |||
else: | |||
assert isinstance(array, LiteTensor) | |||
ndim = array.layout.ndim | |||
shape = list(array.layout.shapes)[0:ndim] | |||
assert list(shape) == self.shape[1:] | |||
in_dtype = array.layout.data_type | |||
assert in_dtype == self.dtype | |||
# get the batch index | |||
with self._mutex: | |||
if batch_id in self._free_list: | |||
self._free_list.remove(batch_id) | |||
# get the subtensor | |||
subtensor = self._tensor.slice([batch_id], [batch_id + 1]) | |||
subtensor.copy_from(array) | |||
return batch_id | |||
def collect(self, array): | |||
with self._mutex: | |||
if len(self._free_list) == 0: | |||
return -1 | |||
idx = self._free_list.pop(0) | |||
return self.collect_id(array, idx) | |||
def collect_by_ctypes(self, data, length): | |||
""" | |||
collect with ctypes data input | |||
""" | |||
with self._mutex: | |||
if len(self._free_list) == 0: | |||
return -1 | |||
idx = self._free_list.pop(0) | |||
# get the subtensor | |||
subtensor = self._tensor.slice([idx], [idx + 1]) | |||
if subtensor.device_type == LiteDeviceType.LITE_CPU: | |||
subtensor.set_data_by_copy(data, length) | |||
else: | |||
pinned_tensor = LiteTensor( | |||
subtensor.layout, self.dev_type, self.dev_id, True | |||
) | |||
pinned_tensor.set_data_by_share(data, length) | |||
subtensor.copy_from(pinned_tensor) | |||
def free(self, indexes): | |||
with self._mutex: | |||
self._free_list.extend(indexes) | |||
def get(self): | |||
return self._tensor | |||
def to_numpy(self): | |||
return self._tensor.to_numpy() |
@@ -0,0 +1,199 @@ | |||
# PyLite | |||
The Lite Python interface provides a more convenient and flexible way to run model inference with Lite. It supports running on various platforms: X86-CUDA, X86-CPU, Arm-CPU, and Arm-CUDA.
## Installation
### Install from the whl package
The whl package of the Lite Python interface is released together with megbrain releases, and its version number is kept consistent with megbrain. The released Lite whl packages cover Linux, Windows, and macOS, and on these platforms they can be installed directly with pip3.
```shell
python3 -m pip install --upgrade pip
python3 -m pip install megenginelite -i https://pypi.megvii-inc.com/simple
```
### Develop installation
In develop mode, you can build the lite dynamic library liblite.so/liblite.dll/liblite_shared.dylib with CMake and use it for development and debugging. A pylite installed this way can only be used on the local machine and cannot be copied to other machines.
* Build liblite.so with cmake
* Clone the megbrain project to the local machine
```shell
git clone git@git-core.megvii-inc.com:brain-sdk/MegBrain.git
```
* Build with CMake. The cmake build here is the same as the megbrain cmake build, and the parameters and macros are exactly the same
* Build preparation
```shell
cd MegBrain
sh ./third_party/prepare.sh
mkdir build
cd build
```
* Build the X86-CUDA version
```shell
cmake .. -DMGE_WITH_CUDA=ON -DMGE_WITH_TEST=ON -DCMAKE_BUILD_TYPE=Release && make -j$(nproc)
```
* Build the X86 CPU-only version
```shell
cmake .. -DMGE_WITH_CUDA=OFF -DMGE_WITH_TEST=ON -DCMAKE_BUILD_TYPE=Release && make -j$(nproc)
```
* After the build finishes, liblite.so is saved in the lite folder of the build directory
* Copy liblite.so into the megenginelite python source directory, and megenginelite can then be used.
```shell
# suppose the MegBrain project directory is ${mgb_home}
cp ${mgb_home}/build/lite/liblite.so ${mgb_home}/lite/pylite/megenginelite/
cd ${mgb_home}/lite/pylite
python3 -c "import megenginelite"
```
After this, you can develop and debug the lite Python interface in the ${mgb_home}/lite/pylite directory
## Using megenginelite in python3
The Lite Python interface is a wrapper around its C/C++ interface, and they use the same model format. megenginelite provides two main interfaces: LiteTensor and LiteNetwork.
### LiteTensor
LiteTensor provides the interfaces for operating on data, including:
* fill_zero: set the tensor memory to all zeros
* share_memory_with: share memory with another LiteTensor
* copy_from: copy data from another LiteTensor into its own memory
* reshape: change the shape of this LiteTensor, the memory data stays unchanged
* slice: slice the data in this LiteTensor; the start, end, and step of the slice must be specified for each dim (see the reshape/slice example below)
* set_data_by_share: after the call, the memory of this LiteTensor is shared from the memory of the input array; the input array must be a numpy ndarray and the tensor must be on CPU
* set_data_by_copy: this LiteTensor will copy data from the input data; data can be a list or a numpy ndarray, the amount of data must not exceed the tensor capacity, and the tensor must be on CPU
* to_numpy: copy the data of this LiteTensor into a numpy array and return it to the user; if the LiteTensor is not contiguous, e.g. produced by slice, the data is copied into a contiguous numpy array. This interface is mainly for debugging and has performance issues.
#### Usage examples
* LiteTensor set data example
``` | |||
def test_tensor_set_data(): | |||
layout = LiteLayout([2, 16], "int8") | |||
tensor = LiteTensor(layout) | |||
assert tensor.nbytes == 2 * 16 | |||
data = [i for i in range(32)] | |||
tensor.set_data_by_copy(data) | |||
real_data = tensor.to_numpy() | |||
for i in range(32): | |||
assert real_data[i // 16][i % 16] == i | |||
arr = np.ones([2, 16], "int8") | |||
tensor.set_data_by_copy(arr) | |||
real_data = tensor.to_numpy() | |||
for i in range(32): | |||
assert real_data[i // 16][i % 16] == 1 | |||
for i in range(32): | |||
arr[i // 16][i % 16] = i | |||
tensor.set_data_by_share(arr) | |||
real_data = tensor.to_numpy() | |||
for i in range(32): | |||
assert real_data[i // 16][i % 16] == i | |||
arr[0][8] = 100 | |||
arr[1][3] = 20 | |||
real_data = tensor.to_numpy() | |||
assert real_data[0][8] == 100 | |||
assert real_data[1][3] == 20 | |||
``` | |||
* Tensor shared-memory example
```python | |||
def test_tensor_share_memory_with(): | |||
layout = LiteLayout([4, 32], "int16") | |||
tensor = LiteTensor(layout) | |||
assert tensor.nbytes == 4 * 32 * 2 | |||
arr = np.ones([4, 32], "int16") | |||
for i in range(128): | |||
arr[i // 32][i % 32] = i | |||
tensor.set_data_by_share(arr) | |||
real_data = tensor.to_numpy() | |||
for i in range(128): | |||
assert real_data[i // 32][i % 32] == i | |||
tensor2 = LiteTensor(layout) | |||
tensor2.share_memory_with(tensor) | |||
real_data = tensor.to_numpy() | |||
real_data2 = tensor2.to_numpy() | |||
for i in range(128): | |||
assert real_data[i // 32][i % 32] == i | |||
assert real_data2[i // 32][i % 32] == i | |||
arr[1][18] = 5 | |||
arr[3][7] = 345 | |||
real_data = tensor2.to_numpy() | |||
assert real_data[1][18] == 5 | |||
assert real_data[3][7] == 345 | |||
``` | |||
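* Reshape and slice example (a minimal sketch of the reshape/slice interfaces above, mirroring test/test_tensor.py; a default CPU tensor is assumed)
```python
def test_tensor_reshape_slice():
    # a 4x8 int32 tensor filled with 0..31
    layout = LiteLayout([4, 8], "int32")
    tensor = LiteTensor(layout)
    tensor.set_data_by_copy([i for i in range(32)])

    # reshape keeps the data, only the shape changes
    tensor.reshape([8, 4])
    data = tensor.to_numpy()
    assert data.shape == (8, 4)
    assert data[2][1] == 9

    # slice rows 1..3 and columns 4..8, the result is not contiguous
    tensor.reshape([4, 8])
    sub = tensor.slice([1, 4], [3, 8])
    assert sub.layout.shapes[0] == 2
    assert sub.layout.shapes[1] == 4
    assert sub.is_continue == False
    # to_numpy copies the non-contiguous data into a contiguous numpy array
    sub_data = sub.to_numpy()
    assert sub_data[0][0] == 12  # element at [1][4] of the original tensor
```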
For more usage, refer to test/test_tensor.py in pylite
### LiteNetwork
LiteNetwork mainly provides model loading, running, and related functionality. For the models used, see the model section in the lite readme
* Basic example of loading and running a model on CPU
``` | |||
def test_network_basic(): | |||
source_dir = os.getenv("LITE_TEST_RESOUCE") | |||
input_data_path = os.path.join(source_dir, "input_data.npy") | |||
# read input to input_data | |||
input_data = np.load(input_data_path) | |||
model_path = os.path.join(source_dir, "shufflenet.mge") | |||
network = LiteNetwork() | |||
network.load(model_path) | |||
input_name = network.get_input_name(0) | |||
input_tensor = network.get_io_tensor(input_name) | |||
output_name = network.get_output_name(0) | |||
output_tensor = network.get_io_tensor(output_name) | |||
assert input_tensor.layout.shapes[0] == 1 | |||
assert input_tensor.layout.shapes[1] == 3 | |||
assert input_tensor.layout.shapes[2] == 224 | |||
assert input_tensor.layout.shapes[3] == 224 | |||
assert input_tensor.layout.data_type == LiteDataType.LITE_FLOAT | |||
assert input_tensor.layout.ndim == 4 | |||
# copy input data to input_tensor of the network | |||
input_tensor.set_data_by_copy(input_data) | |||
for i in range(3): | |||
network.forward() | |||
network.wait() | |||
output_data = output_tensor.to_numpy() | |||
print('shufflenet output max={}, sum={}'.format(output_data.max(), output_data.sum())) | |||
``` | |||
* Using device memory on CUDA as model input; the config and IO info must be set when constructing the network
``` | |||
def test_network_device_IO(): | |||
source_dir = os.getenv("LITE_TEST_RESOUCE") | |||
input_data_path = os.path.join(source_dir, "input_data.npy") | |||
model_path = os.path.join(source_dir, "shufflenet.mge") | |||
# read input to input_data | |||
input_data = np.load(input_data_path) | |||
input_layout = LiteLayout([1, 3, 224, 224]) | |||
host_input_data = LiteTensor(layout=input_layout) | |||
host_input_data.set_data_by_share(input_data) | |||
dev_input_data = LiteTensor(layout=input_layout, device_type=LiteDeviceType.LITE_CUDA) | |||
dev_input_data.copy_from(host_input_data) | |||
# construct LiteOption | |||
options = LiteOptions() | |||
options.weight_preprocess = 1 | |||
options.var_sanity_check_first_run = 0 | |||
net_config = LiteConfig(device_type=LiteDeviceType.LITE_CUDA, option=options) | |||
# construct LiteIO, is_host=False means the input tensor will use device memory
input_io = LiteIO("data", is_host=False) | |||
ios = LiteNetworkIO() | |||
ios.add_input(input_io) | |||
network = LiteNetwork(config=net_config, io=ios) | |||
network.load(model_path) | |||
input_name = network.get_input_name(0) | |||
dev_input_tensor = network.get_io_tensor(input_name) | |||
output_name = network.get_output_name(0) | |||
output_tensor = network.get_io_tensor(output_name) | |||
# copy input data to input_tensor of the network | |||
dev_input_tensor.share_memory_with(dev_input_data) | |||
for i in range(3): | |||
network.forward() | |||
network.wait() | |||
output_data = output_tensor.to_numpy() | |||
print('shufflenet output max={}, sum={}'.format(output_data.max(), output_data.sum())) | |||
``` | |||
For more usage, refer to test/test_network.py and test/test_network_cuda.py in pylite
@@ -0,0 +1 @@ | |||
numpy>=1.18 |
@@ -0,0 +1,20 @@ | |||
#!/usr/bin/env bash | |||
set -e | |||
cd $(dirname $0)/.. | |||
ISORT_ARG="" | |||
BLACK_ARG="" | |||
while getopts 'd' OPT; do | |||
case $OPT in | |||
d) | |||
ISORT_ARG="--diff --check-only" | |||
BLACK_ARG="--diff --check" | |||
;; | |||
?) | |||
echo "Usage: `basename $0` [-d]" | |||
esac | |||
done | |||
isort $ISORT_ARG -j $(nproc) -rc megenginelite test | |||
black $BLACK_ARG --target-version=py35 -- megenginelite test |
@@ -0,0 +1,127 @@ | |||
# -*- coding: utf-8 -*- | |||
# This file is part of MegEngine, a deep learning framework developed by | |||
# Megvii. | |||
# | |||
# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
import os | |||
import re | |||
import pathlib | |||
import platform | |||
from distutils.file_util import copy_file | |||
from setuptools import setup, find_packages, Extension | |||
from setuptools.command.build_ext import build_ext as _build_ext | |||
class PrecompiledExtesion(Extension): | |||
def __init__(self, name): | |||
super().__init__(name, sources=[]) | |||
class build_ext(_build_ext): | |||
def build_extension(self, ext): | |||
if not isinstance(ext, PrecompiledExtesion): | |||
return super().build_extension(ext) | |||
if not self.inplace: | |||
fullpath = self.get_ext_fullpath(ext.name) | |||
extdir = pathlib.Path(fullpath) | |||
extdir.parent.mkdir(parents=True, exist_ok=True) | |||
modpath = self.get_ext_fullname(ext.name).split('.') | |||
if platform.system() == 'Windows': | |||
modpath[-1] += '.dll' | |||
elif platform.system() == 'Darwin': | |||
modpath[-1] += '.dylib' | |||
else: | |||
modpath[-1] += '.so' | |||
modpath = str(pathlib.Path(*modpath).resolve()) | |||
copy_file(modpath, fullpath, verbose=self.verbose, dry_run=self.dry_run) | |||
v = {} | |||
with open("megenginelite/version.py") as fp: | |||
exec(fp.read(), v) | |||
__version__ = v['__version__'] | |||
email = 'megengine@megvii.com' | |||
# https://www.python.org/dev/peps/pep-0440 | |||
# Public version identifiers: [N!]N(.N)*[{a|b|rc}N][.postN][.devN] | |||
# Local version identifiers: <public version identifier>[+<local version label>] | |||
# PUBLIC_VERSION_POSTFIX use to handle rc or dev info | |||
public_version_postfix = os.environ.get('PUBLIC_VERSION_POSTFIX') | |||
if public_version_postfix: | |||
__version__ = '{}{}'.format(__version__, public_version_postfix) | |||
local_version = [] | |||
strip_sdk_info = os.environ.get('STRIP_SDK_INFO', 'False').lower() | |||
sdk_name = os.environ.get('SDK_NAME', 'cpu') | |||
if 'true' == strip_sdk_info: | |||
print('wheel version strip sdk info') | |||
else: | |||
local_version.append(sdk_name) | |||
local_postfix = os.environ.get('LOCAL_VERSION') | |||
if local_postfix: | |||
local_version.append(local_postfix) | |||
if len(local_version): | |||
__version__ = '{}+{}'.format(__version__, '.'.join(local_version)) | |||
packages = find_packages() | |||
megenginelite_data = [ | |||
str(f.relative_to('megenginelite')) | |||
for f in pathlib.Path('megenginelite').glob('**/*') | |||
] | |||
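# the prebuilt liblite_shared library is handled by the PrecompiledExtesion
# ext_module below, so it is dropped from package_data here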
if platform.system() == 'Windows': | |||
megenginelite_data.remove('libs\\liblite_shared.dll') | |||
elif platform.system() == 'Darwin': | |||
megenginelite_data.remove('libs/liblite_shared.dylib') | |||
else: | |||
megenginelite_data.remove('libs/liblite_shared.so') | |||
with open('requires.txt') as f: | |||
requires = f.read().splitlines() | |||
prebuild_modules=[PrecompiledExtesion('megenginelite.libs.liblite_shared')] | |||
package_name = 'megenginelite'  # assumed name; package_name is not defined elsewhere in this file
setup_kwargs = dict(
name=package_name, | |||
version=__version__, | |||
description='Inference Framework for MegEngine', | |||
author='Megvii Engine Team', | |||
author_email=email, | |||
packages=packages, | |||
package_data={ | |||
'megenginelite': megenginelite_data, | |||
}, | |||
ext_modules=prebuild_modules, | |||
install_requires=requires, | |||
cmdclass={'build_ext': build_ext}, | |||
) | |||
setup_kwargs.update(dict( | |||
classifiers=[ | |||
'Development Status :: 3 - Alpha', | |||
'Intended Audience :: Developers', | |||
'Intended Audience :: Education', | |||
'Intended Audience :: Science/Research', | |||
'License :: OSI Approved :: Apache Software License', | |||
'Programming Language :: C++', | |||
'Programming Language :: Python :: 3', | |||
'Programming Language :: Python :: 3.5', | |||
'Programming Language :: Python :: 3.6', | |||
'Programming Language :: Python :: 3.7', | |||
'Programming Language :: Python :: 3.8', | |||
'Topic :: Scientific/Engineering', | |||
'Topic :: Scientific/Engineering :: Mathematics', | |||
'Topic :: Scientific/Engineering :: Artificial Intelligence', | |||
'Topic :: Software Development', | |||
'Topic :: Software Development :: Libraries', | |||
'Topic :: Software Development :: Libraries :: Python Modules', | |||
], | |||
license='Apache 2.0', | |||
keywords='megengine deep learning', | |||
data_files = [("megengine", [ | |||
"../LICENSE", | |||
"../ACKNOWLEDGMENTS", | |||
])] | |||
)) | |||
setup(**setup_kwargs) |
@@ -0,0 +1,92 @@ | |||
# -*- coding: utf-8 -*- | |||
# This file is part of MegEngine, a deep learning framework developed by | |||
# Megvii. | |||
# | |||
# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
import os | |||
import unittest | |||
import numpy as np | |||
from megenginelite import * | |||
set_log_level(2) | |||
class TestShuffleNet(unittest.TestCase): | |||
source_dir = os.getenv("LITE_TEST_RESOUCE") | |||
input_data_path = os.path.join(source_dir, "input_data.npy") | |||
correct_data_path = os.path.join(source_dir, "output_data.npy") | |||
correct_data = np.load(correct_data_path).flatten() | |||
input_data = np.load(input_data_path) | |||
def check_correct(self, out_data, error=1e-4): | |||
out_data = out_data.flatten() | |||
assert np.isfinite(out_data.sum()) | |||
assert self.correct_data.size == out_data.size | |||
for i in range(out_data.size): | |||
assert abs(out_data[i] - self.correct_data[i]) < error | |||
def do_forward(self, network, times=3): | |||
input_name = network.get_input_name(0) | |||
input_tensor = network.get_io_tensor(input_name) | |||
output_name = network.get_output_name(0) | |||
output_tensor = network.get_io_tensor(output_name) | |||
input_tensor.set_data_by_copy(self.input_data) | |||
for i in range(times): | |||
network.forward() | |||
network.wait() | |||
output_data = output_tensor.to_numpy() | |||
self.check_correct(output_data) | |||
class TestGlobal(TestShuffleNet): | |||
def test_device_count(self): | |||
LiteGlobal.try_coalesce_all_free_memory() | |||
count = LiteGlobal.get_device_count(LiteDeviceType.LITE_CPU) | |||
assert count > 0 | |||
def test_register_decryption_method(self): | |||
@decryption_func | |||
def function(in_arr, key_arr, out_arr): | |||
if not out_arr: | |||
return in_arr.size | |||
else: | |||
for i in range(in_arr.size): | |||
out_arr[i] = in_arr[i] ^ key_arr[0] ^ key_arr[0] | |||
return out_arr.size | |||
LiteGlobal.register_decryption_and_key("just_for_test", function, [15]) | |||
config = LiteConfig() | |||
config.bare_model_cryption_name = "just_for_test".encode("utf-8") | |||
network = LiteNetwork() | |||
model_path = os.path.join(self.source_dir, "shufflenet.mge") | |||
network.load(model_path) | |||
self.do_forward(network) | |||
def test_update_decryption_key(self): | |||
wrong_key = [0] * 32 | |||
LiteGlobal.update_decryption_key("AES_default", wrong_key) | |||
with self.assertRaises(RuntimeError): | |||
config = LiteConfig() | |||
config.bare_model_cryption_name = "AES_default".encode("utf-8") | |||
network = LiteNetwork(config) | |||
model_path = os.path.join(self.source_dir, "shufflenet_crypt_aes.mge") | |||
network.load(model_path) | |||
right_key = [i for i in range(32)] | |||
LiteGlobal.update_decryption_key("AES_default", right_key) | |||
config = LiteConfig() | |||
config.bare_model_cryption_name = "AES_default".encode("utf-8") | |||
network = LiteNetwork(config) | |||
model_path = os.path.join(self.source_dir, "shufflenet_crypt_aes.mge") | |||
network.load(model_path) | |||
self.do_forward(network) |
@@ -0,0 +1,405 @@ | |||
# -*- coding: utf-8 -*- | |||
# This file is part of MegEngine, a deep learning framework developed by | |||
# Megvii. | |||
# | |||
# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
import os | |||
import unittest | |||
import numpy as np | |||
from megenginelite import * | |||
set_log_level(2) | |||
def test_version(): | |||
print("Lite verson: {}".format(version)) | |||
def test_network_io(): | |||
input_io1 = LiteIO("data1", is_host=False, io_type=LiteIOType.LITE_IO_VALUE) | |||
input_io2 = LiteIO( | |||
"data2", | |||
is_host=True, | |||
io_type=LiteIOType.LITE_IO_SHAPE, | |||
layout=LiteLayout([2, 4, 4]), | |||
) | |||
io = LiteNetworkIO() | |||
io.add_input(input_io1) | |||
io.add_input(input_io2) | |||
output_io1 = LiteIO("out1", is_host=False) | |||
output_io2 = LiteIO("out2", is_host=True, layout=LiteLayout([1, 1000])) | |||
io.add_output(output_io1) | |||
io.add_output(output_io2) | |||
assert len(io.inputs) == 2 | |||
assert len(io.outputs) == 2 | |||
assert io.inputs[0] == input_io1 | |||
assert io.outputs[0] == output_io1 | |||
c_io = io._create_network_io() | |||
assert c_io.input_size == 2 | |||
assert c_io.output_size == 2 | |||
class TestShuffleNet(unittest.TestCase): | |||
source_dir = os.getenv("LITE_TEST_RESOUCE") | |||
input_data_path = os.path.join(source_dir, "input_data.npy") | |||
correct_data_path = os.path.join(source_dir, "output_data.npy") | |||
model_path = os.path.join(source_dir, "shufflenet.mge") | |||
correct_data = np.load(correct_data_path).flatten() | |||
input_data = np.load(input_data_path) | |||
def check_correct(self, out_data, error=1e-4): | |||
out_data = out_data.flatten() | |||
assert np.isfinite(out_data.sum()) | |||
assert self.correct_data.size == out_data.size | |||
for i in range(out_data.size): | |||
assert abs(out_data[i] - self.correct_data[i]) < error | |||
def do_forward(self, network, times=3): | |||
input_name = network.get_input_name(0) | |||
input_tensor = network.get_io_tensor(input_name) | |||
output_name = network.get_output_name(0) | |||
output_tensor = network.get_io_tensor(output_name) | |||
input_tensor.set_data_by_copy(self.input_data) | |||
for i in range(times): | |||
network.forward() | |||
network.wait() | |||
output_data = output_tensor.to_numpy() | |||
self.check_correct(output_data) | |||
class TestNetwork(TestShuffleNet): | |||
def test_decryption(self): | |||
model_path = os.path.join(self.source_dir, "shufflenet_crypt_aes.mge") | |||
config = LiteConfig() | |||
config.bare_model_cryption_name = "AES_default".encode("utf-8") | |||
network = LiteNetwork(config) | |||
network.load(model_path) | |||
self.do_forward(network) | |||
def test_pack_model(self): | |||
model_path = os.path.join(self.source_dir, "test_packed_model_rc4.lite") | |||
network = LiteNetwork() | |||
network.load(model_path) | |||
self.do_forward(network) | |||
def test_network_basic(self): | |||
network = LiteNetwork() | |||
network.load(self.model_path) | |||
input_name = network.get_input_name(0) | |||
input_tensor = network.get_io_tensor(input_name) | |||
output_name = network.get_output_name(0) | |||
output_tensor = network.get_io_tensor(output_name) | |||
assert input_tensor.layout.shapes[0] == 1 | |||
assert input_tensor.layout.shapes[1] == 3 | |||
assert input_tensor.layout.shapes[2] == 224 | |||
assert input_tensor.layout.shapes[3] == 224 | |||
assert input_tensor.layout.data_type == LiteDataType.LITE_FLOAT | |||
assert input_tensor.layout.ndim == 4 | |||
self.do_forward(network) | |||
def test_network_shared_data(self): | |||
network = LiteNetwork() | |||
network.load(self.model_path) | |||
input_name = network.get_input_name(0) | |||
input_tensor = network.get_io_tensor(input_name) | |||
output_name = network.get_output_name(0) | |||
output_tensor = network.get_io_tensor(output_name) | |||
input_tensor.set_data_by_share(self.input_data) | |||
for i in range(3): | |||
network.forward() | |||
network.wait() | |||
output_data = output_tensor.to_numpy() | |||
self.check_correct(output_data) | |||
def test_network_get_name(self): | |||
network = LiteNetwork() | |||
network.load(self.model_path) | |||
input_names = network.get_all_input_name() | |||
assert input_names[0] == "data" | |||
output_names = network.get_all_output_name() | |||
assert output_names[0] == network.get_output_name(0) | |||
self.do_forward(network) | |||
def test_network_set_device_id(self): | |||
network = LiteNetwork() | |||
assert network.device_id == 0 | |||
network.device_id = 1 | |||
network.load(self.model_path) | |||
assert network.device_id == 1 | |||
with self.assertRaises(RuntimeError): | |||
network.device_id = 1 | |||
self.do_forward(network) | |||
def test_network_set_stream_id(self): | |||
network = LiteNetwork() | |||
assert network.stream_id == 0 | |||
network.stream_id = 1 | |||
network.load(self.model_path) | |||
assert network.stream_id == 1 | |||
with self.assertRaises(RuntimeError): | |||
network.stream_id = 1 | |||
self.do_forward(network) | |||
def test_network_set_thread_number(self): | |||
network = LiteNetwork() | |||
assert network.threads_number == 1 | |||
network.threads_number = 2 | |||
network.load(self.model_path) | |||
assert network.threads_number == 2 | |||
with self.assertRaises(RuntimeError): | |||
network.threads_number = 2 | |||
self.do_forward(network) | |||
def test_network_cpu_inplace(self): | |||
network = LiteNetwork() | |||
assert network.is_cpu_inplace_mode() == False | |||
network.enable_cpu_inplace_mode() | |||
network.load(self.model_path) | |||
assert network.is_cpu_inplace_mode() == True | |||
with self.assertRaises(RuntimeError): | |||
network.enable_cpu_inplace_mode() | |||
self.do_forward(network) | |||
def test_network_option(self): | |||
option = LiteOptions() | |||
option.weight_preprocess = 1 | |||
option.var_sanity_check_first_run = 0 | |||
config = LiteConfig(option=option) | |||
network = LiteNetwork(config=config) | |||
network.load(self.model_path) | |||
self.do_forward(network) | |||
def test_network_reset_io(self): | |||
option = LiteOptions() | |||
option.var_sanity_check_first_run = 0 | |||
config = LiteConfig(option=option) | |||
input_io = LiteIO("data") | |||
ios = LiteNetworkIO() | |||
ios.add_input(input_io) | |||
network = LiteNetwork(config=config, io=ios) | |||
network.load(self.model_path) | |||
input_tensor = network.get_io_tensor("data") | |||
assert input_tensor.device_type == LiteDeviceType.LITE_CPU | |||
self.do_forward(network) | |||
def test_network_by_share(self): | |||
network = LiteNetwork() | |||
network.load(self.model_path) | |||
input_name = network.get_input_name(0) | |||
input_tensor = network.get_io_tensor(input_name) | |||
output_name = network.get_output_name(0) | |||
output_tensor = network.get_io_tensor(output_name) | |||
assert input_tensor.device_type == LiteDeviceType.LITE_CPU | |||
layout = LiteLayout(self.input_data.shape, self.input_data.dtype) | |||
tensor_tmp = LiteTensor(layout=layout) | |||
tensor_tmp.set_data_by_share(self.input_data) | |||
input_tensor.share_memory_with(tensor_tmp) | |||
for i in range(3): | |||
network.forward() | |||
network.wait() | |||
output_data = output_tensor.to_numpy() | |||
self.check_correct(output_data) | |||
def test_network_share_weights(self): | |||
option = LiteOptions() | |||
option.var_sanity_check_first_run = 0 | |||
config = LiteConfig(option=option) | |||
src_network = LiteNetwork(config=config) | |||
src_network.load(self.model_path) | |||
new_network = LiteNetwork() | |||
new_network.enable_cpu_inplace_mode() | |||
new_network.share_weights_with(src_network) | |||
self.do_forward(src_network) | |||
self.do_forward(new_network) | |||
def test_network_share_runtime_memory(self): | |||
option = LiteOptions() | |||
option.var_sanity_check_first_run = 0 | |||
config = LiteConfig(option=option) | |||
src_network = LiteNetwork(config=config) | |||
src_network.load(self.model_path) | |||
new_network = LiteNetwork() | |||
new_network.enable_cpu_inplace_mode() | |||
new_network.share_runtime_memroy(src_network) | |||
new_network.load(self.model_path) | |||
self.do_forward(src_network) | |||
self.do_forward(new_network) | |||
# def test_network_async(self): | |||
# count = 0 | |||
# finished = False | |||
# | |||
# def async_callback(): | |||
# nonlocal finished | |||
# finished = True | |||
# return 0 | |||
# | |||
# option = LiteOptions() | |||
# option.var_sanity_check_first_run = 0 | |||
# config = LiteConfig(option=option) | |||
# | |||
# network = LiteNetwork(config=config) | |||
# network.load(self.model_path) | |||
# | |||
# network.async_with_callback(async_callback) | |||
# | |||
# input_tensor = network.get_io_tensor(network.get_input_name(0)) | |||
# output_tensor = network.get_io_tensor(network.get_output_name(0)) | |||
# | |||
# input_tensor.set_data_by_share(self.input_data) | |||
# network.forward() | |||
# | |||
# while not finished: | |||
# count += 1 | |||
# | |||
# assert count > 0 | |||
# output_data = output_tensor.to_numpy() | |||
# self.check_correct(output_data) | |||
# | |||
# def test_network_start_callback(self): | |||
# network = LiteNetwork() | |||
# network.load(self.model_path) | |||
# start_checked = False | |||
# | |||
# @start_finish_callback | |||
# def start_callback(ios): | |||
# nonlocal start_checked | |||
# start_checked = True | |||
# assert len(ios) == 1 | |||
# for key in ios: | |||
# io = key | |||
# data = ios[key].to_numpy().flatten() | |||
# input_data = self.input_data.flatten() | |||
# assert data.size == input_data.size | |||
# assert io.name.decode("utf-8") == "data" | |||
# for i in range(data.size): | |||
# assert data[i] == input_data[i] | |||
# return 0 | |||
# | |||
# network.set_start_callback(start_callback) | |||
# self.do_forward(network, 1) | |||
# assert start_checked == True | |||
# | |||
# def test_network_finish_callback(self): | |||
# network = LiteNetwork() | |||
# network.load(self.model_path) | |||
# finish_checked = False | |||
# | |||
# @start_finish_callback | |||
# def finish_callback(ios): | |||
# nonlocal finish_checked | |||
# finish_checked = True | |||
# assert len(ios) == 1 | |||
# for key in ios: | |||
# io = key | |||
# data = ios[key].to_numpy().flatten() | |||
# output_data = self.correct_data.flatten() | |||
# assert data.size == output_data.size | |||
# for i in range(data.size): | |||
# assert data[i] == output_data[i] | |||
# return 0 | |||
# | |||
# network.set_finish_callback(finish_callback) | |||
# self.do_forward(network, 1) | |||
# assert finish_checked == True | |||
def test_enable_profile(self): | |||
network = LiteNetwork() | |||
network.load(self.model_path) | |||
network.enable_profile_performance("./profile.json") | |||
self.do_forward(network) | |||
fi = open("./profile.json", "r") | |||
fi.close() | |||
os.remove("./profile.json") | |||
def test_io_txt_dump(self): | |||
network = LiteNetwork() | |||
network.load(self.model_path) | |||
network.io_txt_dump("./io_txt.txt") | |||
self.do_forward(network) | |||
def test_io_bin_dump(self): | |||
import shutil | |||
folder = "./out" | |||
network = LiteNetwork() | |||
network.load(self.model_path) | |||
if not os.path.exists(folder): | |||
os.mkdir(folder) | |||
network.io_bin_dump(folder) | |||
self.do_forward(network) | |||
shutil.rmtree(folder) | |||
def test_algo_workspace_limit(self): | |||
network = LiteNetwork() | |||
network.load(self.model_path) | |||
print("modify the workspace limit.") | |||
network.set_network_algo_workspace_limit(10000) | |||
self.do_forward(network) | |||
def test_network_algo_policy(self): | |||
network = LiteNetwork() | |||
network.load(self.model_path) | |||
network.set_network_algo_policy( | |||
LiteAlgoSelectStrategy.LITE_ALGO_PROFILE | |||
| LiteAlgoSelectStrategy.LITE_ALGO_REPRODUCIBLE | |||
) | |||
self.do_forward(network) | |||
def test_network_algo_policy_ignore_batch(self): | |||
network = LiteNetwork() | |||
network.load(self.model_path) | |||
network.set_network_algo_policy( | |||
LiteAlgoSelectStrategy.LITE_ALGO_PROFILE, | |||
shared_batch_size=1, | |||
binary_equal_between_batch=True, | |||
) | |||
self.do_forward(network) |
@@ -0,0 +1,220 @@ | |||
# -*- coding: utf-8 -*- | |||
# This file is part of MegEngine, a deep learning framework developed by | |||
# Megvii. | |||
# | |||
# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
import functools | |||
import os | |||
import unittest | |||
import numpy as np | |||
from megenginelite import * | |||
set_log_level(2) | |||
def require_cuda(ngpu=1): | |||
"""a decorator that disables a testcase if cuda is not enabled""" | |||
def dector(func): | |||
@functools.wraps(func) | |||
def wrapped(*args, **kwargs): | |||
if LiteGlobal.get_device_count(LiteDeviceType.LITE_CUDA) >= ngpu: | |||
return func(*args, **kwargs) | |||
return wrapped | |||
return dector | |||
class TestShuffleNetCuda(unittest.TestCase): | |||
source_dir = os.getenv("LITE_TEST_RESOUCE") | |||
input_data_path = os.path.join(source_dir, "input_data.npy") | |||
correct_data_path = os.path.join(source_dir, "output_data.npy") | |||
model_path = os.path.join(source_dir, "shufflenet.mge") | |||
correct_data = np.load(correct_data_path).flatten() | |||
input_data = np.load(input_data_path) | |||
def check_correct(self, out_data, error=1e-4): | |||
out_data = out_data.flatten() | |||
assert np.isfinite(out_data.sum()) | |||
assert self.correct_data.size == out_data.size | |||
for i in range(out_data.size): | |||
assert abs(out_data[i] - self.correct_data[i]) < error | |||
def do_forward(self, network, times=3): | |||
input_name = network.get_input_name(0) | |||
input_tensor = network.get_io_tensor(input_name) | |||
output_name = network.get_output_name(0) | |||
output_tensor = network.get_io_tensor(output_name) | |||
input_tensor.set_data_by_copy(self.input_data) | |||
for i in range(times): | |||
network.forward() | |||
network.wait() | |||
output_data = output_tensor.to_numpy() | |||
self.check_correct(output_data) | |||
class TestNetwork(TestShuffleNetCuda): | |||
@require_cuda() | |||
def test_network_basic(self): | |||
config = LiteConfig() | |||
config.device_type = LiteDeviceType.LITE_CUDA | |||
network = LiteNetwork(config) | |||
network.load(self.model_path) | |||
input_name = network.get_input_name(0) | |||
input_tensor = network.get_io_tensor(input_name) | |||
output_name = network.get_output_name(0) | |||
output_tensor = network.get_io_tensor(output_name) | |||
assert input_tensor.layout.shapes[0] == 1 | |||
assert input_tensor.layout.shapes[1] == 3 | |||
assert input_tensor.layout.shapes[2] == 224 | |||
assert input_tensor.layout.shapes[3] == 224 | |||
assert input_tensor.layout.data_type == LiteDataType.LITE_FLOAT | |||
assert input_tensor.layout.ndim == 4 | |||
self.do_forward(network) | |||
@require_cuda() | |||
def test_network_shared_data(self): | |||
config = LiteConfig() | |||
config.device_type = LiteDeviceType.LITE_CUDA | |||
network = LiteNetwork(config) | |||
network.load(self.model_path) | |||
input_name = network.get_input_name(0) | |||
input_tensor = network.get_io_tensor(input_name) | |||
output_name = network.get_output_name(0) | |||
output_tensor = network.get_io_tensor(output_name) | |||
input_tensor.set_data_by_share(self.input_data) | |||
for i in range(3): | |||
network.forward() | |||
network.wait() | |||
output_data = output_tensor.to_numpy() | |||
self.check_correct(output_data) | |||
@require_cuda(2) | |||
def test_network_set_device_id(self): | |||
config = LiteConfig() | |||
config.device_type = LiteDeviceType.LITE_CUDA | |||
network = LiteNetwork(config) | |||
assert network.device_id == 0 | |||
network.device_id = 1 | |||
network.load(self.model_path) | |||
assert network.device_id == 1 | |||
with self.assertRaises(RuntimeError): | |||
network.device_id = 1 | |||
self.do_forward(network) | |||
@require_cuda() | |||
def test_network_option(self): | |||
option = LiteOptions() | |||
option.weight_preprocess = 1 | |||
option.var_sanity_check_first_run = 0 | |||
config = LiteConfig(option=option) | |||
config.device_type = LiteDeviceType.LITE_CUDA | |||
network = LiteNetwork(config=config) | |||
network.load(self.model_path) | |||
self.do_forward(network) | |||
@require_cuda() | |||
def test_network_reset_io(self): | |||
option = LiteOptions() | |||
option.var_sanity_check_first_run = 0 | |||
config = LiteConfig(option=option) | |||
config.device_type = LiteDeviceType.LITE_CUDA | |||
input_io = LiteIO("data") | |||
ios = LiteNetworkIO() | |||
ios.add_input(input_io) | |||
network = LiteNetwork(config=config, io=ios) | |||
network.load(self.model_path) | |||
input_tensor = network.get_io_tensor("data") | |||
assert input_tensor.device_type == LiteDeviceType.LITE_CPU | |||
self.do_forward(network) | |||
@require_cuda() | |||
def test_network_share_weights(self): | |||
option = LiteOptions() | |||
option.var_sanity_check_first_run = 0 | |||
config = LiteConfig(option=option) | |||
config.device_type = LiteDeviceType.LITE_CUDA | |||
src_network = LiteNetwork(config=config) | |||
src_network.load(self.model_path) | |||
new_network = LiteNetwork() | |||
new_network.enable_cpu_inplace_mode() | |||
new_network.share_weights_with(src_network) | |||
self.do_forward(src_network) | |||
self.do_forward(new_network) | |||
@require_cuda() | |||
def test_network_share_runtime_memory(self): | |||
option = LiteOptions() | |||
option.var_sanity_check_first_run = 0 | |||
config = LiteConfig(option=option) | |||
config.device_type = LiteDeviceType.LITE_CUDA | |||
src_network = LiteNetwork(config=config) | |||
src_network.load(self.model_path) | |||
new_network = LiteNetwork() | |||
new_network.enable_cpu_inplace_mode() | |||
new_network.share_runtime_memroy(src_network) | |||
new_network.load(self.model_path) | |||
self.do_forward(src_network) | |||
self.do_forward(new_network) | |||
@require_cuda() | |||
def test_enable_profile(self): | |||
config = LiteConfig() | |||
config.device_type = LiteDeviceType.LITE_CUDA | |||
network = LiteNetwork(config) | |||
network.load(self.model_path) | |||
network.enable_profile_performance("./profile.json") | |||
self.do_forward(network) | |||
fi = open("./profile.json", "r") | |||
fi.close() | |||
os.remove("./profile.json") | |||
@require_cuda() | |||
def test_algo_workspace_limit(self): | |||
config = LiteConfig() | |||
config.device_type = LiteDeviceType.LITE_CUDA | |||
network = LiteNetwork(config) | |||
network.load(self.model_path) | |||
print("modify the workspace limit.") | |||
network.set_network_algo_workspace_limit(10000) | |||
self.do_forward(network) | |||
@require_cuda() | |||
def test_network_algo_policy(self): | |||
config = LiteConfig() | |||
config.device_type = LiteDeviceType.LITE_CUDA | |||
network = LiteNetwork(config) | |||
network.load(self.model_path) | |||
network.set_network_algo_policy( | |||
LiteAlgoSelectStrategy.LITE_ALGO_PROFILE | |||
| LiteAlgoSelectStrategy.LITE_ALGO_REPRODUCIBLE | |||
) | |||
self.do_forward(network) |
@@ -0,0 +1,291 @@ | |||
# -*- coding: utf-8 -*- | |||
# This file is part of MegEngine, a deep learning framework developed by | |||
# Megvii. | |||
# | |||
# Copyright (c) Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
import functools | |||
import numpy as np | |||
from megenginelite import * | |||
def require_cuda(func): | |||
"""a decorator that disables a testcase if cuda is not enabled""" | |||
@functools.wraps(func) | |||
def wrapped(*args, **kwargs): | |||
if LiteGlobal.get_device_count(LiteDeviceType.LITE_CUDA): | |||
return func(*args, **kwargs) | |||
return wrapped | |||
def test_tensor_make(): | |||
empty_layout = LiteLayout() | |||
assert empty_layout.ndim == 0 | |||
assert empty_layout.data_type == int(LiteDataType.LITE_FLOAT) | |||
empty_tensor = LiteTensor() | |||
assert empty_tensor.layout.ndim == empty_layout.ndim | |||
assert empty_tensor.layout.data_type == empty_layout.data_type | |||
layout = LiteLayout([4, 16]) | |||
layout = LiteLayout(dtype="float32") | |||
layout = LiteLayout([4, 16], "float32") | |||
layout = LiteLayout([4, 16], "float16") | |||
layout = LiteLayout([4, 16], np.float32) | |||
layout = LiteLayout([4, 16], np.int8) | |||
layout = LiteLayout([4, 16], LiteDataType.LITE_FLOAT) | |||
tensor = LiteTensor(layout) | |||
tensor = LiteTensor(layout, LiteDeviceType.LITE_CPU) | |||
assert tensor.layout == layout | |||
assert tensor.device_type == LiteDeviceType.LITE_CPU | |||
assert tensor.is_continue == True | |||
assert tensor.is_pinned_host == False | |||
assert tensor.nbytes == 4 * 16 * 4 | |||
assert tensor.device_id == 0 | |||
tensor = LiteTensor(layout, device_id=1) | |||
assert tensor.device_id == 1 | |||
def test_tensor_set_data(): | |||
layout = LiteLayout([2, 16], "int8") | |||
tensor = LiteTensor(layout) | |||
assert tensor.nbytes == 2 * 16 | |||
data = [i for i in range(32)] | |||
tensor.set_data_by_copy(data) | |||
real_data = tensor.to_numpy() | |||
for i in range(32): | |||
assert real_data[i // 16][i % 16] == i | |||
arr = np.ones([2, 16], "int8") | |||
tensor.set_data_by_copy(arr) | |||
real_data = tensor.to_numpy() | |||
for i in range(32): | |||
assert real_data[i // 16][i % 16] == 1 | |||
for i in range(32): | |||
arr[i // 16][i % 16] = i | |||
tensor.set_data_by_share(arr) | |||
real_data = tensor.to_numpy() | |||
for i in range(32): | |||
assert real_data[i // 16][i % 16] == i | |||
arr[0][8] = 100 | |||
arr[1][3] = 20 | |||
real_data = tensor.to_numpy() | |||
assert real_data[0][8] == 100 | |||
assert real_data[1][3] == 20 | |||
def test_fill_zero(): | |||
layout = LiteLayout([4, 8], "int16") | |||
tensor1 = LiteTensor(layout) | |||
assert tensor1.nbytes == 4 * 8 * 2 | |||
tensor1.set_data_by_copy([i for i in range(32)]) | |||
real_data = tensor1.to_numpy() | |||
for i in range(32): | |||
assert real_data[i // 8][i % 8] == i | |||
tensor1.fill_zero() | |||
real_data = tensor1.to_numpy() | |||
for i in range(32): | |||
assert real_data[i // 8][i % 8] == 0 | |||
def test_copy_from(): | |||
layout = LiteLayout([4, 8], "int16") | |||
tensor1 = LiteTensor(layout) | |||
tensor2 = LiteTensor(layout) | |||
assert tensor1.nbytes == 4 * 8 * 2 | |||
assert tensor2.nbytes == 4 * 8 * 2 | |||
tensor1.set_data_by_copy([i for i in range(32)]) | |||
tensor2.copy_from(tensor1) | |||
real_data = tensor2.to_numpy() | |||
for i in range(32): | |||
assert real_data[i // 8][i % 8] == i | |||
tensor1.set_data_by_copy([i + 5 for i in range(32)]) | |||
tensor2.copy_from(tensor1) | |||
real_data = tensor2.to_numpy() | |||
for i in range(32): | |||
assert real_data[i // 8][i % 8] == i + 5 | |||
def test_reshape(): | |||
layout = LiteLayout([4, 8], "int16") | |||
tensor1 = LiteTensor(layout) | |||
assert tensor1.nbytes == 4 * 8 * 2 | |||
tensor1.set_data_by_copy([i for i in range(32)]) | |||
real_data = tensor1.to_numpy() | |||
for i in range(32): | |||
assert real_data[i // 8][i % 8] == i | |||
tensor1.reshape([8, 4]) | |||
real_data = tensor1.to_numpy() | |||
for i in range(32): | |||
assert real_data[i // 4][i % 4] == i | |||
def test_slice(): | |||
layout = LiteLayout([4, 8], "int32") | |||
tensor1 = LiteTensor(layout) | |||
assert tensor1.nbytes == 4 * 8 * 4 | |||
tensor1.set_data_by_copy([i for i in range(32)]) | |||
real_data_org = tensor1.to_numpy() | |||
for i in range(32): | |||
assert real_data_org[i // 8][i % 8] == i | |||
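    # slice() takes per-dimension start and end indices (end exclusive, judging
    # by the shape checks below): rows 1..2 and columns 4..7 of the original.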
tensor2 = tensor1.slice([1, 4], [3, 8]) | |||
assert tensor2.layout.shapes[0] == 2 | |||
assert tensor2.layout.shapes[1] == 4 | |||
assert tensor2.is_continue == False | |||
real_data = tensor2.to_numpy() | |||
for i in range(8): | |||
row = i // 4 | |||
col = i % 4 | |||
assert real_data[row][col] == real_data_org[row + 1][col + 4] | |||
def test_tensor_share_memory(): | |||
layout = LiteLayout([4, 8], "int16") | |||
tensor1 = LiteTensor(layout) | |||
tensor2 = LiteTensor(layout) | |||
assert tensor1.nbytes == 4 * 8 * 2 | |||
assert tensor2.nbytes == 4 * 8 * 2 | |||
tensor1.set_data_by_copy([i for i in range(32)]) | |||
tensor2.share_memory_with(tensor1) | |||
real_data = tensor2.to_numpy() | |||
for i in range(32): | |||
assert real_data[i // 8][i % 8] == i | |||
tensor1.set_data_by_copy([i + 5 for i in range(32)]) | |||
real_data = tensor2.to_numpy() | |||
for i in range(32): | |||
assert real_data[i // 8][i % 8] == i + 5 | |||
def test_tensor_share_ctype_memory(): | |||
layout = LiteLayout([4, 8], "int16") | |||
tensor1 = LiteTensor(layout) | |||
assert tensor1.nbytes == 4 * 8 * 2 | |||
arr = np.ones([4, 8], "int16") | |||
for i in range(32): | |||
arr[i // 8][i % 8] = i | |||
tensor1.set_data_by_share(arr.ctypes.data, 4 * 8 * 2) | |||
real_data = tensor1.to_numpy() | |||
for i in range(32): | |||
assert real_data[i // 8][i % 8] == i | |||
@require_cuda | |||
def test_tensor_share_ctype_memory_device(): | |||
layout = LiteLayout([4, 8], "int16") | |||
tensor_cpu = LiteTensor( | |||
layout=layout, device_type=LiteDeviceType.LITE_CUDA, is_pinned_host=True | |||
) | |||
tensor_cuda1 = LiteTensor(layout=layout, device_type=LiteDeviceType.LITE_CUDA) | |||
tensor_cuda2 = LiteTensor(layout=layout, device_type=LiteDeviceType.LITE_CUDA) | |||
assert tensor_cpu.nbytes == 4 * 8 * 2 | |||
assert tensor_cuda1.nbytes == 4 * 8 * 2 | |||
assert tensor_cuda2.nbytes == 4 * 8 * 2 | |||
arr = np.ones([4, 8], "int16") | |||
for i in range(32): | |||
arr[i // 8][i % 8] = i | |||
tensor_cpu.set_data_by_share(arr.ctypes.data, 4 * 8 * 2) | |||
tensor_cuda1.copy_from(tensor_cpu) | |||
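    # get_ctypes_memory() exposes the raw device pointer of tensor_cuda1, which
    # set_data_by_share() can adopt directly, so both CUDA tensors should end up
    # referring to the same device memory.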
device_mem = tensor_cuda1.get_ctypes_memory() | |||
tensor_cuda2.set_data_by_share(device_mem, tensor_cuda1.nbytes) | |||
real_data1 = tensor_cuda1.to_numpy() | |||
real_data2 = tensor_cuda2.to_numpy() | |||
for i in range(32): | |||
assert real_data1[i // 8][i % 8] == i | |||
assert real_data2[i // 8][i % 8] == i | |||
def test_tensor_share_memory_with(): | |||
layout = LiteLayout([4, 32], "int16") | |||
tensor = LiteTensor(layout) | |||
assert tensor.nbytes == 4 * 32 * 2 | |||
arr = np.ones([4, 32], "int16") | |||
for i in range(128): | |||
arr[i // 32][i % 32] = i | |||
tensor.set_data_by_share(arr) | |||
real_data = tensor.to_numpy() | |||
for i in range(128): | |||
assert real_data[i // 32][i % 32] == i | |||
tensor2 = LiteTensor(layout) | |||
tensor2.share_memory_with(tensor) | |||
real_data = tensor.to_numpy() | |||
real_data2 = tensor2.to_numpy() | |||
for i in range(128): | |||
assert real_data[i // 32][i % 32] == i | |||
assert real_data2[i // 32][i % 32] == i | |||
arr[1][18] = 5 | |||
arr[3][7] = 345 | |||
real_data = tensor2.to_numpy() | |||
assert real_data[1][18] == 5 | |||
assert real_data[3][7] == 345 | |||
def test_empty_tensor(): | |||
empty_tensor = LiteTensor() | |||
assert empty_tensor.layout.ndim == 0 | |||
assert empty_tensor.layout.data_type == int(LiteDataType.LITE_FLOAT) | |||
# check empty tensor to numpy | |||
data = empty_tensor.to_numpy() | |||
def test_tensor_by_set_copy_with_new_layout(): | |||
layout = LiteLayout([4, 32], "int16") | |||
tensor = LiteTensor(layout) | |||
assert tensor.nbytes == 4 * 32 * 2 | |||
arr = np.ones([8, 64], "int32") | |||
tensor.set_data_by_copy(arr) | |||
new_layout = tensor.layout | |||
assert new_layout.ndim == 2 | |||
assert new_layout.shapes[0] == 8 | |||
assert new_layout.shapes[1] == 64 | |||
tensor = LiteTensor(layout) | |||
tensor.set_data_by_share(arr) | |||
new_layout = tensor.layout | |||
assert new_layout.ndim == 2 | |||
assert new_layout.shapes[0] == 8 | |||
assert new_layout.shapes[1] == 64 | |||
def test_tensor_concat(): | |||
layout = LiteLayout([4, 32], "int16") | |||
tensors = [] | |||
arr = np.ones([4, 32], "int16") | |||
for j in range(4): | |||
for i in range(128): | |||
arr[i // 32][i % 32] = j | |||
tensor = LiteTensor(layout) | |||
tensor.set_data_by_copy(arr) | |||
tensors.append(tensor) | |||
new_tensor = LiteTensorConcat(tensors, 0) | |||
real_data = new_tensor.to_numpy() | |||
for j in range(4): | |||
for i in range(128): | |||
index = j * 128 + i | |||
assert real_data[index // 32][index % 32] == j |
@@ -0,0 +1,199 @@
# -*- coding: utf-8 -*- | |||
# This file is part of MegEngine, a deep learning framework developed by | |||
# Megvii. | |||
# | |||
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
import functools | |||
import numpy as np | |||
from megenginelite import * | |||
def require_cuda(func): | |||
"""a decorator that disables a testcase if cuda is not enabled""" | |||
@functools.wraps(func) | |||
def wrapped(*args, **kwargs): | |||
if LiteGlobal.get_device_count(LiteDeviceType.LITE_CUDA): | |||
return func(*args, **kwargs) | |||
return wrapped | |||
@require_cuda | |||
def test_tensor_collect_batch(): | |||
batch_tensor = TensorBatchCollector( | |||
[4, 8, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA | |||
) | |||
arr = np.ones([8, 8], "int32") | |||
for i in range(4): | |||
batch_tensor.collect(arr) | |||
arr += 1 | |||
data = batch_tensor.to_numpy() | |||
assert data.shape[0] == 4 | |||
assert data.shape[1] == 8 | |||
assert data.shape[2] == 8 | |||
for i in range(4): | |||
for j in range(64): | |||
assert data[i][j // 8][j % 8] == i + 1 | |||
def test_tensor_collect_batch_cpu(): | |||
batch_tensor = TensorBatchCollector( | |||
[4, 8, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CPU | |||
) | |||
arr = np.ones([8, 8], "int32") | |||
for i in range(4): | |||
batch_tensor.collect(arr) | |||
arr += 1 | |||
data = batch_tensor.to_numpy() | |||
assert data.shape[0] == 4 | |||
assert data.shape[1] == 8 | |||
assert data.shape[2] == 8 | |||
for i in range(4): | |||
for j in range(64): | |||
assert data[i][j // 8][j % 8] == i + 1 | |||
@require_cuda | |||
def test_tensor_collect_batch_by_index(): | |||
batch_tensor = TensorBatchCollector( | |||
[4, 8, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA | |||
) | |||
arr = np.ones([8, 8], "int32") | |||
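    # collect_id(data, i) writes a sample into batch slot i explicitly, whereas
    # collect() used elsewhere appends to the next free slot.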
arr += 1 # ==2 | |||
batch_tensor.collect_id(arr, 1) | |||
arr -= 1 # ==1 | |||
batch_tensor.collect_id(arr, 0) | |||
arr += 2 # ==3 | |||
batch_tensor.collect_id(arr, 2) | |||
arr += 1 # ==4 | |||
batch_tensor.collect_id(arr, 3) | |||
data = batch_tensor.to_numpy() | |||
assert data.shape[0] == 4 | |||
assert data.shape[1] == 8 | |||
assert data.shape[2] == 8 | |||
for i in range(4): | |||
for j in range(64): | |||
assert data[i][j // 8][j % 8] == i + 1 | |||
@require_cuda | |||
def test_tensor_collect_batch_tensor(): | |||
batch_tensor = TensorBatchCollector( | |||
[4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA | |||
) | |||
nparr = np.ones([6, 8], "int32") | |||
tensor = LiteTensor(LiteLayout([6, 8], LiteDataType.LITE_INT)) | |||
for i in range(4): | |||
tensor.set_data_by_share(nparr) | |||
batch_tensor.collect(tensor) | |||
nparr += 1 | |||
data = batch_tensor.to_numpy() | |||
assert data.shape[0] == 4 | |||
assert data.shape[1] == 6 | |||
assert data.shape[2] == 8 | |||
for i in range(4): | |||
for j in range(48): | |||
assert data[i][j // 8][j % 8] == i + 1 | |||
def test_tensor_collect_batch_tensor_cpu(): | |||
batch_tensor = TensorBatchCollector( | |||
[4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CPU | |||
) | |||
nparr = np.ones([6, 8], "int32") | |||
tensor = LiteTensor(LiteLayout([6, 8], LiteDataType.LITE_INT)) | |||
for i in range(4): | |||
tensor.set_data_by_share(nparr) | |||
batch_tensor.collect(tensor) | |||
nparr += 1 | |||
data = batch_tensor.to_numpy() | |||
assert data.shape[0] == 4 | |||
assert data.shape[1] == 6 | |||
assert data.shape[2] == 8 | |||
for i in range(4): | |||
for j in range(48): | |||
assert data[i][j // 8][j % 8] == i + 1 | |||
@require_cuda | |||
def test_tensor_collect_batch_ctypes(): | |||
batch_tensor = TensorBatchCollector( | |||
[4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CUDA | |||
) | |||
nparr = np.ones([6, 8], "int32") | |||
for i in range(4): | |||
in_data = nparr.ctypes.data | |||
batch_tensor.collect_by_ctypes(in_data, nparr.nbytes) | |||
nparr += 1 | |||
data = batch_tensor.to_numpy() | |||
assert data.shape[0] == 4 | |||
assert data.shape[1] == 6 | |||
assert data.shape[2] == 8 | |||
for i in range(4): | |||
for j in range(48): | |||
assert data[i][j // 8][j % 8] == i + 1 | |||
def test_tensor_collect_batch_ctypes_cpu(): | |||
batch_tensor = TensorBatchCollector( | |||
[4, 6, 8], dtype=LiteDataType.LITE_INT, device_type=LiteDeviceType.LITE_CPU | |||
) | |||
nparr = np.ones([6, 8], "int32") | |||
for i in range(4): | |||
in_data = nparr.ctypes.data | |||
batch_tensor.collect_by_ctypes(in_data, nparr.nbytes) | |||
nparr += 1 | |||
data = batch_tensor.to_numpy() | |||
assert data.shape[0] == 4 | |||
assert data.shape[1] == 6 | |||
assert data.shape[2] == 8 | |||
for i in range(4): | |||
for j in range(48): | |||
assert data[i][j // 8][j % 8] == i + 1 | |||
@require_cuda | |||
def test_tensor_collect_batch_device_tensor(): | |||
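    # A pre-allocated device tensor can back the collector via the `tensor`
    # argument; collected samples are then copied into its batch slots.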
all_tensor = LiteTensor( | |||
LiteLayout([4, 6, 8], dtype=LiteDataType.LITE_INT), | |||
device_type=LiteDeviceType.LITE_CUDA, | |||
) | |||
batch_tensor = TensorBatchCollector([4, 6, 8], tensor=all_tensor) | |||
nparr = np.ones([6, 8], "int32") | |||
tensor = LiteTensor(LiteLayout([6, 8], LiteDataType.LITE_INT)) | |||
for i in range(4): | |||
tensor.set_data_by_share(nparr) | |||
batch_tensor.collect(tensor) | |||
nparr += 1 | |||
data = batch_tensor.to_numpy() | |||
assert data.shape[0] == 4 | |||
assert data.shape[1] == 6 | |||
assert data.shape[2] == 8 | |||
for i in range(4): | |||
for j in range(48): | |||
assert data[i][j // 8][j % 8] == i + 1 | |||
@require_cuda | |||
def test_tensor_collect_batch_device_numpy(): | |||
all_tensor = LiteTensor( | |||
LiteLayout([4, 6, 8], dtype=LiteDataType.LITE_INT), | |||
device_type=LiteDeviceType.LITE_CUDA, | |||
) | |||
batch_tensor = TensorBatchCollector([4, 6, 8], tensor=all_tensor) | |||
nparr = np.ones([6, 8], "int32") | |||
for i in range(4): | |||
batch_tensor.collect(nparr) | |||
nparr += 1 | |||
data = batch_tensor.to_numpy() | |||
assert data.shape[0] == 4 | |||
assert data.shape[1] == 6 | |||
assert data.shape[2] == 8 | |||
for i in range(4): | |||
for j in range(48): | |||
assert data[i][j // 8][j % 8] == i + 1 |
@@ -0,0 +1,53 @@
/** | |||
* \file src/decryption/aes_decrypt.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "./mbedtls/aes.h" | |||
#include "decrypt_base.h" | |||
namespace lite { | |||
class AESDcryption { | |||
public: | |||
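    //! Decrypt an AES-256-CBC encrypted model. The assumed container layout,
    //! inferred from the parsing below, is:
    //!   [16-byte IV][CBC ciphertext][8-byte big-endian plaintext length]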
static std::vector<uint8_t> decrypt_model(const void* model_mem, | |||
size_t size, | |||
const std::vector<uint8_t>& key) { | |||
mbedtls_aes_context ctx; | |||
mbedtls_aes_init(&ctx); | |||
mbedtls_aes_setkey_dec(&ctx, key.data(), 256); | |||
auto data = static_cast<const uint8_t*>(model_mem); | |||
        //! the first 16 bytes are the IV
        uint8_t iv[16];
        //! the last 8 bytes store the plaintext length (big-endian)
        auto length_ptr = data + size - 8;
        size_t length = 0;
        for (int i = 0; i < 8; i++) {
            length |= static_cast<size_t>(length_ptr[i]) << (8 * (7 - i));
        }
std::copy(data, data + 16, iv); | |||
auto output = std::vector<uint8_t>(size - 24); | |||
mbedtls_aes_crypt_cbc(&ctx, MBEDTLS_AES_DECRYPT, size - 24, iv, | |||
data + 16, output.data()); | |||
mbedtls_aes_free(&ctx); | |||
output.erase(output.begin() + length, output.end()); | |||
return output; | |||
} | |||
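    //! The built-in default key is just the byte sequence 0x00-0x1F; real
    //! deployments are expected to replace it, e.g. through
    //! lite::update_decryption_or_key().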
static std::vector<uint8_t> get_decrypt_key() { | |||
std::vector<uint8_t> key = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, | |||
0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, | |||
0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, | |||
0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, | |||
0x1C, 0x1D, 0x1E, 0x1F}; | |||
return key; | |||
} | |||
}; | |||
} // namespace lite | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,49 @@
/** | |||
* \file src/decryption/decrypt_base.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "lite/global.h" | |||
#include "misc.h" | |||
namespace lite { | |||
struct DecryptionStaticData { | |||
std::unordered_map< | |||
std::string, | |||
std::pair<DecryptionFunc, std::shared_ptr<std::vector<uint8_t>>>> | |||
decryption_methods; | |||
LITE_MUTEX map_mutex; | |||
}; | |||
DecryptionStaticData& decryption_static_data(); | |||
template <int count> | |||
struct DecryptionRegister; | |||
} // namespace lite | |||
#define CONCAT_IMPL(a, b) a##b | |||
#define MACRO_CONCAT(a, b) CONCAT_IMPL(a, b) | |||
#define REGIST_DECRYPTION_METHOD(name_, func_, key_) \ | |||
REGIST_DECRYPTION_METHOD_WITH_NUM(__COUNTER__, name_, func_, key_) | |||
#define REGIST_DECRYPTION_METHOD_WITH_NUM(number_, name_, func_, key_) \ | |||
template <> \ | |||
struct DecryptionRegister<number_> { \ | |||
DecryptionRegister() { \ | |||
register_decryption_and_key(name_, func_, key_); \ | |||
} \ | |||
}; \ | |||
namespace { \ | |||
DecryptionRegister<number_> MACRO_CONCAT(decryption_, number_); \ | |||
} | |||
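//! Usage sketch (hypothetical names): register a custom decryption method at
//! static-initialization time so that a model can later select it by name:
//!   REGIST_DECRYPTION_METHOD("MY_RC4", MyRC4::decrypt_model,
//!                            MyRC4::get_decrypt_key());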
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,349 @@
/** | |||
* \file aes.h | |||
* | |||
* \brief AES block cipher | |||
* | |||
* Copyright (C) 2006-2015, ARM Limited, All Rights Reserved | |||
* SPDX-License-Identifier: Apache-2.0 | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); you may | |||
* not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | |||
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* | |||
* This file is part of mbed TLS (https://tls.mbed.org) | |||
*/ | |||
/** | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#ifndef MBEDTLS_AES_H | |||
#define MBEDTLS_AES_H | |||
#if !defined(MBEDTLS_CONFIG_FILE) | |||
#include "config.h" | |||
#else | |||
#include MBEDTLS_CONFIG_FILE | |||
#endif | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
/* padlock.c and aesni.c rely on these values! */ | |||
#define MBEDTLS_AES_ENCRYPT 1 | |||
#define MBEDTLS_AES_DECRYPT 0 | |||
#define MBEDTLS_ERR_AES_INVALID_KEY_LENGTH -0x0020 /**< Invalid key length. */ | |||
#define MBEDTLS_ERR_AES_INVALID_INPUT_LENGTH \ | |||
-0x0022 /**< Invalid data input length. */ | |||
#if (defined(__ARMCC_VERSION) || defined(_MSC_VER)) && !defined(inline) && \ | |||
!defined(__cplusplus) | |||
#define inline __inline | |||
#endif | |||
#if !defined(MBEDTLS_AES_ALT) | |||
// Regular implementation | |||
// | |||
#ifdef __cplusplus | |||
extern "C" { | |||
#endif | |||
/** | |||
* \brief AES context structure | |||
* | |||
* \note buf is able to hold 32 extra bytes, which can be used: | |||
* - for alignment purposes if VIA padlock is used, and/or | |||
* - to simplify key expansion in the 256-bit case by | |||
* generating an extra round key | |||
*/ | |||
typedef struct { | |||
int nr; /*!< number of rounds */ | |||
uint32_t* rk; /*!< AES round keys */ | |||
uint32_t buf[68]; /*!< unaligned data */ | |||
} mbedtls_aes_context; | |||
/** | |||
* \brief Initialize AES context | |||
* | |||
* \param ctx AES context to be initialized | |||
*/ | |||
void mbedtls_aes_init(mbedtls_aes_context* ctx); | |||
/** | |||
* \brief Clear AES context | |||
* | |||
* \param ctx AES context to be cleared | |||
*/ | |||
void mbedtls_aes_free(mbedtls_aes_context* ctx); | |||
/** | |||
* \brief AES key schedule (encryption) | |||
* | |||
* \param ctx AES context to be initialized | |||
* \param key encryption key | |||
* \param keybits must be 128, 192 or 256 | |||
* | |||
* \return 0 if successful, or MBEDTLS_ERR_AES_INVALID_KEY_LENGTH | |||
*/ | |||
int mbedtls_aes_setkey_enc(mbedtls_aes_context* ctx, const unsigned char* key, | |||
unsigned int keybits); | |||
/** | |||
* \brief AES key schedule (decryption) | |||
* | |||
* \param ctx AES context to be initialized | |||
* \param key decryption key | |||
* \param keybits must be 128, 192 or 256 | |||
* | |||
* \return 0 if successful, or MBEDTLS_ERR_AES_INVALID_KEY_LENGTH | |||
*/ | |||
int mbedtls_aes_setkey_dec(mbedtls_aes_context* ctx, const unsigned char* key, | |||
unsigned int keybits); | |||
/** | |||
* \brief AES-ECB block encryption/decryption | |||
* | |||
* \param ctx AES context | |||
* \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT | |||
* \param input 16-byte input block | |||
* \param output 16-byte output block | |||
* | |||
* \return 0 if successful | |||
*/ | |||
int mbedtls_aes_crypt_ecb(mbedtls_aes_context* ctx, int mode, | |||
const unsigned char input[16], | |||
unsigned char output[16]); | |||
#if defined(MBEDTLS_CIPHER_MODE_CBC) | |||
/** | |||
* \brief AES-CBC buffer encryption/decryption | |||
* Length should be a multiple of the block | |||
* size (16 bytes) | |||
* | |||
* \note Upon exit, the content of the IV is updated so that you can | |||
 *         call the same function again on the following
* block(s) of data and get the same result as if it was | |||
* encrypted in one call. This allows a "streaming" usage. | |||
* If on the other hand you need to retain the contents of the | |||
* IV, you should either save it manually or use the cipher | |||
* module instead. | |||
* | |||
* \param ctx AES context | |||
* \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT | |||
* \param length length of the input data | |||
* \param iv initialization vector (updated after use) | |||
* \param input buffer holding the input data | |||
* \param output buffer holding the output data | |||
* | |||
* \return 0 if successful, or MBEDTLS_ERR_AES_INVALID_INPUT_LENGTH | |||
*/ | |||
int mbedtls_aes_crypt_cbc(mbedtls_aes_context* ctx, int mode, size_t length, | |||
unsigned char iv[16], const unsigned char* input, | |||
unsigned char* output); | |||
#endif /* MBEDTLS_CIPHER_MODE_CBC */ | |||
#if defined(MBEDTLS_CIPHER_MODE_CFB) | |||
/** | |||
* \brief AES-CFB128 buffer encryption/decryption. | |||
* | |||
* Note: Due to the nature of CFB you should use the same key schedule for | |||
* both encryption and decryption. So a context initialized with | |||
* mbedtls_aes_setkey_enc() for both MBEDTLS_AES_ENCRYPT and | |||
* MBEDTLS_AES_DECRYPT. | |||
* | |||
* \note Upon exit, the content of the IV is updated so that you can | |||
 *         call the same function again on the following
* block(s) of data and get the same result as if it was | |||
* encrypted in one call. This allows a "streaming" usage. | |||
* If on the other hand you need to retain the contents of the | |||
* IV, you should either save it manually or use the cipher | |||
* module instead. | |||
* | |||
* \param ctx AES context | |||
* \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT | |||
* \param length length of the input data | |||
* \param iv_off offset in IV (updated after use) | |||
* \param iv initialization vector (updated after use) | |||
* \param input buffer holding the input data | |||
* \param output buffer holding the output data | |||
* | |||
* \return 0 if successful | |||
*/ | |||
int mbedtls_aes_crypt_cfb128(mbedtls_aes_context* ctx, int mode, size_t length, | |||
size_t* iv_off, unsigned char iv[16], | |||
const unsigned char* input, unsigned char* output); | |||
/** | |||
* \brief AES-CFB8 buffer encryption/decryption. | |||
* | |||
* Note: Due to the nature of CFB you should use the same key schedule for | |||
* both encryption and decryption. So a context initialized with | |||
* mbedtls_aes_setkey_enc() for both MBEDTLS_AES_ENCRYPT and | |||
* MBEDTLS_AES_DECRYPT. | |||
* | |||
* \note Upon exit, the content of the IV is updated so that you can | |||
 *         call the same function again on the following
* block(s) of data and get the same result as if it was | |||
* encrypted in one call. This allows a "streaming" usage. | |||
* If on the other hand you need to retain the contents of the | |||
* IV, you should either save it manually or use the cipher | |||
* module instead. | |||
* | |||
* \param ctx AES context | |||
* \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT | |||
* \param length length of the input data | |||
* \param iv initialization vector (updated after use) | |||
* \param input buffer holding the input data | |||
* \param output buffer holding the output data | |||
* | |||
* \return 0 if successful | |||
*/ | |||
int mbedtls_aes_crypt_cfb8(mbedtls_aes_context* ctx, int mode, size_t length, | |||
unsigned char iv[16], const unsigned char* input, | |||
unsigned char* output); | |||
#endif /*MBEDTLS_CIPHER_MODE_CFB */ | |||
#if defined(MBEDTLS_CIPHER_MODE_CTR) | |||
/** | |||
* \brief AES-CTR buffer encryption/decryption | |||
* | |||
* Warning: You have to keep the maximum use of your counter in mind! | |||
* | |||
* Note: Due to the nature of CTR you should use the same key schedule for | |||
* both encryption and decryption. So a context initialized with | |||
* mbedtls_aes_setkey_enc() for both MBEDTLS_AES_ENCRYPT and | |||
* MBEDTLS_AES_DECRYPT. | |||
* | |||
* \param ctx AES context | |||
* \param length The length of the data | |||
* \param nc_off The offset in the current stream_block (for resuming | |||
* within current cipher stream). The offset pointer to | |||
* should be 0 at the start of a stream. | |||
* \param nonce_counter The 128-bit nonce and counter. | |||
* \param stream_block The saved stream-block for resuming. Is overwritten | |||
* by the function. | |||
* \param input The input data stream | |||
* \param output The output data stream | |||
* | |||
* \return 0 if successful | |||
*/ | |||
int mbedtls_aes_crypt_ctr(mbedtls_aes_context* ctx, size_t length, | |||
size_t* nc_off, unsigned char nonce_counter[16], | |||
unsigned char stream_block[16], | |||
const unsigned char* input, unsigned char* output); | |||
#endif /* MBEDTLS_CIPHER_MODE_CTR */ | |||
/** | |||
* \brief Internal AES block encryption function | |||
* (Only exposed to allow overriding it, | |||
* see MBEDTLS_AES_ENCRYPT_ALT) | |||
* | |||
* \param ctx AES context | |||
* \param input Plaintext block | |||
* \param output Output (ciphertext) block | |||
* | |||
* \return 0 if successful | |||
*/ | |||
int mbedtls_internal_aes_encrypt(mbedtls_aes_context* ctx, | |||
const unsigned char input[16], | |||
unsigned char output[16]); | |||
/** | |||
* \brief Internal AES block decryption function | |||
* (Only exposed to allow overriding it, | |||
* see MBEDTLS_AES_DECRYPT_ALT) | |||
* | |||
* \param ctx AES context | |||
* \param input Ciphertext block | |||
* \param output Output (plaintext) block | |||
* | |||
* \return 0 if successful | |||
*/ | |||
int mbedtls_internal_aes_decrypt(mbedtls_aes_context* ctx, | |||
const unsigned char input[16], | |||
unsigned char output[16]); | |||
#if !defined(MBEDTLS_DEPRECATED_REMOVED) | |||
#if defined(MBEDTLS_DEPRECATED_WARNING) | |||
#define MBEDTLS_DEPRECATED __attribute__((deprecated)) | |||
#else | |||
#define MBEDTLS_DEPRECATED | |||
#endif | |||
/** | |||
* \brief Internal AES block encryption function | |||
* (Only exposed to allow overriding it, | |||
* see MBEDTLS_AES_ENCRYPT_ALT) | |||
* | |||
* \deprecated Superseded by mbedtls_aes_encrypt_ext() in 2.5.0 | |||
* | |||
* \param ctx AES context | |||
* \param input Plaintext block | |||
* \param output Output (ciphertext) block | |||
*/ | |||
MBEDTLS_DEPRECATED static inline void mbedtls_aes_encrypt( | |||
mbedtls_aes_context* ctx, const unsigned char input[16], | |||
unsigned char output[16]) { | |||
mbedtls_internal_aes_encrypt(ctx, input, output); | |||
} | |||
/** | |||
* \brief Internal AES block decryption function | |||
* (Only exposed to allow overriding it, | |||
* see MBEDTLS_AES_DECRYPT_ALT) | |||
* | |||
* \deprecated Superseded by mbedtls_aes_decrypt_ext() in 2.5.0 | |||
* | |||
* \param ctx AES context | |||
* \param input Ciphertext block | |||
* \param output Output (plaintext) block | |||
*/ | |||
MBEDTLS_DEPRECATED static inline void mbedtls_aes_decrypt( | |||
mbedtls_aes_context* ctx, const unsigned char input[16], | |||
unsigned char output[16]) { | |||
mbedtls_internal_aes_decrypt(ctx, input, output); | |||
} | |||
#undef MBEDTLS_DEPRECATED | |||
#endif /* !MBEDTLS_DEPRECATED_REMOVED */ | |||
#ifdef __cplusplus | |||
} | |||
#endif | |||
#else /* MBEDTLS_AES_ALT */ | |||
#include "aes_alt.h" | |||
#endif /* MBEDTLS_AES_ALT */ | |||
#ifdef __cplusplus | |||
extern "C" { | |||
#endif | |||
/** | |||
* \brief Checkup routine | |||
* | |||
* \return 0 if successful, or 1 if the test failed | |||
*/ | |||
int mbedtls_aes_self_test(int verbose); | |||
#ifdef __cplusplus | |||
} | |||
#endif | |||
#endif /* aes.h */ |
@@ -0,0 +1,5 @@
#pragma once | |||
#define MBEDTLS_AES_C | |||
#define MBEDTLS_AES_ROM_TABLES | |||
#define MBEDTLS_CIPHER_MODE_CBC |
@@ -0,0 +1,156 @@
/** | |||
* \file src/decryption/rc4/rc4_cryption_base.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include <algorithm> | |||
#include <cstdint> | |||
namespace lite { | |||
namespace rc4 { | |||
#define m256(x) static_cast<uint8_t>(x) | |||
/*! \brief Pseudo-random byte stream for RC4. | |||
*/ | |||
class RC4RandStream { | |||
public: | |||
RC4RandStream() = default; | |||
RC4RandStream(uint64_t key) { reset(key); } | |||
void reset(uint64_t init_key) { | |||
i_ = j_ = 0; | |||
for (int i = 0; i < 256; i++) | |||
s_[i] = i; | |||
uint8_t j = 0; | |||
for (int i = 0; i < 256; i++) { | |||
j = j + s_[i] + m256(init_key >> ((i % 8) * 8)); | |||
std::swap(s_[i], s_[j]); | |||
} | |||
        // drop the initial keystream bytes (RC4-drop) to weaken key-schedule bias
for (int i = 0; i < 768; i++) { | |||
next8(); | |||
} | |||
for (int i = 0, t = next8(); i < t; i++) { | |||
next8(); | |||
} | |||
} | |||
uint8_t next8() { | |||
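        // A modified RC4 PRGA: in addition to the standard swap of s_[i_] and
        // s_[j_], the output byte mixes two extra S-box lookups, so the stream
        // differs from plain RC4 output.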
i_++; | |||
uint8_t a = s_[i_]; | |||
j_ += a; | |||
uint8_t b = s_[j_]; | |||
s_[i_] = b; | |||
s_[j_] = a; | |||
uint8_t c = s_[m256((i_ << 5) ^ (j_ >> 3))] + | |||
s_[m256((j_ << 5) ^ (i_ >> 3))]; | |||
return (s_[m256(a + b)] + s_[c ^ 0xAA]) ^ s_[m256(j_ + b)]; | |||
} | |||
uint64_t next64() { | |||
uint64_t rst; | |||
uint8_t* buf = reinterpret_cast<uint8_t*>(&rst); | |||
for (int i = 0; i < 8; i++) { | |||
buf[i] = next8(); | |||
} | |||
return rst; | |||
} | |||
private: | |||
uint8_t s_[256], i_ = 0, j_ = 0; | |||
}; | |||
#undef m256 | |||
/*! | |||
* \brief fast and secure 64-bit hash | |||
* see https://code.google.com/p/fast-hash/ | |||
*/ | |||
class FastHash64 { | |||
public: | |||
FastHash64(uint64_t seed) | |||
: hash_{seed}, | |||
mul0_{key_gen_hash_mul0()}, | |||
mul1_{key_gen_hash_mul1()} {} | |||
void feed(uint64_t val) { | |||
val ^= val >> 23; | |||
val *= mul0_; | |||
val ^= val >> 47; | |||
hash_ ^= val; | |||
hash_ *= mul1_; | |||
} | |||
uint64_t get() { return hash_; } | |||
private: | |||
uint64_t hash_; | |||
const uint64_t mul0_, mul1_; | |||
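    // The multiplier constants below are assembled byte-by-byte through a
    // volatile pointer, presumably so the 64-bit literals never appear
    // directly in the compiled binary (an assumption; the code states no
    // rationale).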
static uint64_t key_gen_hash_mul0() { | |||
uint64_t rst; | |||
uint8_t volatile* buf = reinterpret_cast<uint8_t*>(&rst); | |||
buf[2] = 50; | |||
buf[3] = 244; | |||
buf[6] = 39; | |||
buf[1] = 92; | |||
buf[5] = 89; | |||
buf[4] = 155; | |||
buf[0] = 55; | |||
buf[7] = 33; | |||
return rst; | |||
} | |||
static uint64_t key_gen_hash_mul1() { | |||
uint64_t rst; | |||
uint8_t volatile* buf = reinterpret_cast<uint8_t*>(&rst); | |||
buf[6] = 3; | |||
buf[2] = 109; | |||
buf[7] = 136; | |||
buf[1] = 25; | |||
buf[5] = 85; | |||
buf[0] = 101; | |||
buf[4] = 242; | |||
buf[3] = 30; | |||
return rst; | |||
} | |||
}; | |||
// The encryption keys are always inlined. | |||
static inline uint64_t key_gen_enc_key() { | |||
uint64_t rst; | |||
uint8_t volatile* buf = reinterpret_cast<uint8_t*>(&rst); | |||
buf[4] = 120; | |||
buf[3] = 121; | |||
buf[7] = 122; | |||
buf[6] = 123; | |||
buf[0] = 124; | |||
buf[5] = 125; | |||
buf[2] = 126; | |||
buf[1] = 127; | |||
return rst; | |||
} | |||
static inline uint64_t key_gen_hash_key() { | |||
uint64_t rst; | |||
uint8_t volatile* buf = reinterpret_cast<uint8_t*>(&rst); | |||
buf[2] = 101; | |||
buf[5] = 102; | |||
buf[4] = 103; | |||
buf[7] = 104; | |||
buf[1] = 105; | |||
buf[3] = 106; | |||
buf[6] = 107; | |||
buf[0] = 108; | |||
return rst; | |||
} | |||
} // namespace rc4 | |||
} // namespace lite | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,219 @@
/** | |||
* \file src/decryption/rc4/rc4_cryption_impl.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "rc4_cryption_impl.h" | |||
#include "../../misc.h" | |||
#include <cstring> | |||
using namespace lite; | |||
/*! | |||
* \brief Read the input stream once in order to initialize the decryption | |||
* state. | |||
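 *
 * Concretely, this implements decryption steps 1-3 of the scheme documented
 * in the comment preceding encrypt_model() below: the body is xor-ed with the
 * outer RC4 stream while being fed to FastHash64, and the trailing protected
 * hash is then un-xored with that hash and with the next keystream word to
 * recover the plain hash value, which seeds hash_stream for the per-byte
 * un-obfuscation in decrypt_model().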
*/ | |||
void RC4Impl::init_rc4_state() { | |||
rc4::RC4RandStream enc_stream(m_enc_key); | |||
rc4::FastHash64 dechash(m_hash_key); | |||
size_t offset = 0; | |||
std::vector<uint64_t> buffer(128); | |||
size_t remaining = m_model_length - sizeof(uint64_t); | |||
while (remaining > 0) { | |||
size_t toread = std::min(remaining, buffer.size() * sizeof(uint64_t)); | |||
memcpy(buffer.data(), static_cast<const uint8_t*>(m_model_mem) + offset, | |||
toread); | |||
offset += toread; | |||
remaining -= toread; | |||
for (size_t i = 0; i < toread / sizeof(uint64_t); ++i) { | |||
uint64_t value = buffer[i]; | |||
value ^= enc_stream.next64(); | |||
dechash.feed(value); | |||
} | |||
} | |||
uint64_t hashvalue; | |||
memcpy(&hashvalue, static_cast<const uint8_t*>(m_model_mem) + offset, | |||
sizeof(hashvalue)); | |||
offset += sizeof(hashvalue); | |||
hashvalue ^= dechash.get() ^ enc_stream.next64(); | |||
m_state.hash_stream.reset(hashvalue); | |||
m_state.enc_stream.reset(m_enc_key); | |||
} | |||
std::vector<uint8_t> RC4Impl::decrypt_model() { | |||
std::vector<uint8_t> result(m_model_length, 0); | |||
uint8_t* ptr = result.data(); | |||
for (size_t i = 0; i < m_model_length; ++i) { | |||
ptr[i] = static_cast<const uint8_t*>(m_model_mem)[i]; | |||
ptr[i] ^= m_state.hash_stream.next8() ^ m_state.enc_stream.next8(); | |||
} | |||
return result; | |||
} | |||
/*! \brief Encrypt the data in m_model_mem.
 *
 * The basic idea is to calculate a 64-bit hash of the buffer and append it to
 * the end of the buffer. The requirement is that changing any byte, including
 * the hash value, corrupts every byte of the recovered model.
* | |||
* Encryption: | |||
* | |||
* 1. First calculate a 64-bit hash, called plain hash value, from the | |||
* buffer. | |||
* 2. Initialize a RC4 stream with the plain hash value. | |||
* 3. Obfuscate the model body with the RC4 stream defined in step 2. | |||
* 4. Calculate the hash value of the obfuscated model, called hash value | |||
* after hashing. | |||
* 5. Encrypt the model body with a RC4 stream made from the encryption key. | |||
* 6. Bit-xor the hash value after hashing with the plain hash value, called | |||
* mixed hash. | |||
* 7. Encrypt the mixed hash with the RC4 stream defined in step 5, called | |||
* the protected hash. | |||
* 8. Append the protected hash to the buffer. | |||
* | |||
* Decryption: | |||
 * 1. Decrypt the model body with a RC4 stream made from the encryption key,
 *    which reverses steps 5 and 7 of encryption and yields the mixed hash.
 * 2. Calculate the hash value of the decrypted model, which equals the hash
 *    value after hashing from step 4 of encryption.
* 3. Bit-xor the hash value after hashing and the mixed hash to get the | |||
* plain hash value, which is the reverse of step 6 of encryption. | |||
* 4. Un-obfuscate the model body with the plain hash value, which is the | |||
* reverse of step 3 of encryption. | |||
* | |||
* Think: | |||
* 1. If any byte in the model body is broken, the hash value after hashing | |||
* will be broken in step 2, and hence the plain hash value in step 3 | |||
* will be also broken, and finally, the model body will be broken in | |||
* step 4. | |||
* 2. If the protected hash is broken, the plain hash value in step 3 will | |||
* be broken, and finally the model body will be broken. | |||
*/ | |||
std::vector<uint8_t> RC4Impl::encrypt_model() { | |||
size_t total_length = (m_model_length + (sizeof(size_t) - 1)) / | |||
sizeof(size_t) * sizeof(size_t); | |||
std::vector<uint8_t> pad_model(total_length, 0); | |||
memcpy(pad_model.data(), m_model_mem, m_model_length); | |||
// Calculate the hash of the model. | |||
rc4::FastHash64 plainhash(m_hash_key); | |||
uint64_t* ptr = reinterpret_cast<uint64_t*>(pad_model.data()); | |||
size_t len = pad_model.size() / sizeof(uint64_t); | |||
for (size_t i = 0; i < len; ++i) | |||
plainhash.feed(ptr[i]); | |||
uint64_t plainhash_value = plainhash.get(); | |||
// Encrypt the model. | |||
rc4::RC4RandStream hash_enc(plainhash_value); | |||
rc4::RC4RandStream outmost_enc(m_enc_key); | |||
rc4::FastHash64 afterhashenc_hash(m_hash_key); | |||
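    // Steps 3-5 of the scheme above: obfuscate each word with the
    // plain-hash-keyed stream, hash the obfuscated word, then apply the
    // outer encryption stream.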
for (size_t i = 0; i < len; ++i) { | |||
uint64_t value = ptr[i] ^ hash_enc.next64(); | |||
afterhashenc_hash.feed(value); | |||
ptr[i] = value ^ outmost_enc.next64(); | |||
} | |||
uint64_t protected_hash = | |||
plainhash_value ^ afterhashenc_hash.get() ^ outmost_enc.next64(); | |||
size_t end = pad_model.size(); | |||
pad_model.resize(pad_model.size() + sizeof(uint64_t)); | |||
ptr = reinterpret_cast<uint64_t*>(&pad_model[end]); | |||
*ptr = protected_hash; | |||
return pad_model; | |||
} | |||
/*! | |||
* \brief Read the input stream once in order to initialize the decryption | |||
* state. | |||
*/ | |||
void SimpleFastRC4Impl::init_sfrc4_state() { | |||
rc4::RC4RandStream enc_stream(m_enc_key); | |||
rc4::FastHash64 dechash(m_hash_key); | |||
size_t offset = 0; | |||
std::vector<uint64_t> buffer(128); | |||
size_t remaining = m_model_length - sizeof(uint64_t); | |||
while (remaining > 0) { | |||
size_t toread = std::min(remaining, buffer.size() * sizeof(uint64_t)); | |||
memcpy(buffer.data(), static_cast<const uint8_t*>(m_model_mem) + offset, | |||
toread); | |||
offset += toread; | |||
remaining -= toread; | |||
for (size_t i = 0; i < toread / sizeof(uint64_t); ++i) { | |||
uint64_t value = buffer[i]; | |||
dechash.feed(value); | |||
} | |||
} | |||
uint64_t hashvalue; | |||
memcpy(&hashvalue, static_cast<const uint8_t*>(m_model_mem) + offset, | |||
sizeof(hashvalue)); | |||
offset += sizeof(hashvalue); | |||
    //! verify the hash value
    if (hashvalue != dechash.get())
        LITE_THROW(
                "The checksum of the file cannot be verified. The file may "
                "have been encrypted with a different algorithm or key.");
m_state.hash_stream.reset(m_hash_key); | |||
m_state.enc_stream.reset(m_enc_key); | |||
} | |||
std::vector<uint8_t> SimpleFastRC4Impl::decrypt_model() { | |||
std::vector<uint8_t> result(m_model_length, 0); | |||
uint8_t* ptr = result.data(); | |||
for (size_t i = 0; i < m_model_length; ++i) { | |||
ptr[i] = static_cast<const uint8_t*>(m_model_mem)[i]; | |||
ptr[i] ^= m_state.enc_stream.next8(); | |||
} | |||
return result; | |||
} | |||
std::vector<uint8_t> SimpleFastRC4Impl::encrypt_model() { | |||
size_t total_length = (m_model_length + (sizeof(size_t) - 1)) / | |||
sizeof(size_t) * sizeof(size_t); | |||
std::vector<uint8_t> pad_model(total_length, 0); | |||
memcpy(pad_model.data(), m_model_mem, m_model_length); | |||
// Calculate the hash of the model. | |||
rc4::FastHash64 enchash(m_hash_key); | |||
uint64_t* ptr = reinterpret_cast<uint64_t*>(pad_model.data()); | |||
size_t len = pad_model.size() / sizeof(uint64_t); | |||
// Encrypt the model. | |||
rc4::RC4RandStream out_enc(m_enc_key); | |||
for (size_t i = 0; i < len; ++i) { | |||
ptr[i] = ptr[i] ^ out_enc.next64(); | |||
enchash.feed(ptr[i]); | |||
} | |||
uint64_t hash_value = enchash.get(); | |||
size_t end = pad_model.size(); | |||
pad_model.resize(pad_model.size() + sizeof(uint64_t)); | |||
ptr = reinterpret_cast<uint64_t*>(&pad_model[end]); | |||
*ptr = hash_value; | |||
return pad_model; | |||
} | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,79 @@
/** | |||
* \file src/decryption/rc4/rc4_cryption_impl.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "rc4_cryption_base.h" | |||
#include <memory> | |||
#include <vector> | |||
namespace lite { | |||
class RC4Impl { | |||
struct RC4State { | |||
rc4::RC4RandStream enc_stream; | |||
rc4::RC4RandStream hash_stream; | |||
} m_state; | |||
public: | |||
RC4Impl(const void* model_mem, size_t size, const std::vector<uint8_t>& key) | |||
: m_model_mem(model_mem), m_model_length(size) { | |||
const uint8_t* data = key.data(); | |||
m_hash_key = *reinterpret_cast<const uint64_t*>(data); | |||
m_enc_key = *reinterpret_cast<const uint64_t*>(data + 8); | |||
} | |||
std::vector<uint8_t> encrypt_model(); | |||
std::vector<uint8_t> decrypt_model(); | |||
/*! \brief Read the input stream once in order to initialize the decryption | |||
* state. | |||
*/ | |||
void init_rc4_state(); | |||
private: | |||
const void* m_model_mem; | |||
size_t m_model_length; | |||
uint64_t m_hash_key; | |||
uint64_t m_enc_key; | |||
}; | |||
class SimpleFastRC4Impl { | |||
struct SFRC4State { | |||
rc4::RC4RandStream enc_stream; | |||
rc4::RC4RandStream hash_stream; | |||
} m_state; | |||
public: | |||
SimpleFastRC4Impl(const void* model_mem, size_t size, | |||
const std::vector<uint8_t>& key) | |||
: m_model_mem(model_mem), m_model_length(size) { | |||
const uint8_t* data = key.data(); | |||
m_hash_key = *reinterpret_cast<const uint64_t*>(data); | |||
m_enc_key = *reinterpret_cast<const uint64_t*>(data + 8); | |||
} | |||
std::vector<uint8_t> encrypt_model(); | |||
std::vector<uint8_t> decrypt_model(); | |||
/*! \brief Read the input stream once in order to initialize the decryption | |||
* state. | |||
*/ | |||
void init_sfrc4_state(); | |||
private: | |||
const void* m_model_mem; | |||
size_t m_model_length; | |||
uint64_t m_hash_key; | |||
uint64_t m_enc_key; | |||
}; | |||
} // namespace lite | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,58 @@
/** | |||
* \file src/decryption/rc4_cryption.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "rc4_cryption.h" | |||
#include "rc4/rc4_cryption_impl.h" | |||
#include <vector> | |||
using namespace lite; | |||
std::vector<uint8_t> RC4::decrypt_model(const void* model_mem, size_t size, | |||
const std::vector<uint8_t>& key) { | |||
RC4Impl rc4_impl(model_mem, size, key); | |||
rc4_impl.init_rc4_state(); | |||
return rc4_impl.decrypt_model(); | |||
} | |||
std::vector<uint8_t> RC4::encrypt_model(const void* model_mem, size_t size, | |||
const std::vector<uint8_t>& key) { | |||
RC4Impl rc4_impl(model_mem, size, key); | |||
return rc4_impl.encrypt_model(); | |||
} | |||
std::vector<uint8_t> RC4::get_decrypt_key() { | |||
std::vector<uint8_t> keys(128, 0); | |||
uint64_t* data = reinterpret_cast<uint64_t*>(keys.data()); | |||
data[0] = rc4::key_gen_hash_key(); | |||
data[1] = rc4::key_gen_enc_key(); | |||
return keys; | |||
}; | |||
std::vector<uint8_t> SimpleFastRC4::decrypt_model( | |||
const void* model_mem, size_t size, const std::vector<uint8_t>& key) { | |||
SimpleFastRC4Impl simple_fast_rc4_impl(model_mem, size, key); | |||
simple_fast_rc4_impl.init_sfrc4_state(); | |||
return simple_fast_rc4_impl.decrypt_model(); | |||
} | |||
std::vector<uint8_t> SimpleFastRC4::encrypt_model( | |||
const void* model_mem, size_t size, const std::vector<uint8_t>& key) { | |||
SimpleFastRC4Impl simple_fast_rc4_impl(model_mem, size, key); | |||
return simple_fast_rc4_impl.encrypt_model(); | |||
} | |||
std::vector<uint8_t> SimpleFastRC4::get_decrypt_key() { | |||
std::vector<uint8_t> keys(128, 0); | |||
uint64_t* data = reinterpret_cast<uint64_t*>(keys.data()); | |||
data[0] = rc4::key_gen_hash_key(); | |||
data[1] = rc4::key_gen_enc_key(); | |||
return keys; | |||
} | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,44 @@
/** | |||
* \file src/decryption/rc4_cryption.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "rc4/rc4_cryption_base.h" | |||
#include <vector> | |||
namespace lite { | |||
class RC4 { | |||
public: | |||
static std::vector<uint8_t> decrypt_model(const void* model_mem, | |||
size_t size, | |||
const std::vector<uint8_t>& key); | |||
static std::vector<uint8_t> encrypt_model(const void* model_mem, | |||
size_t size, | |||
const std::vector<uint8_t>& key); | |||
static std::vector<uint8_t> get_decrypt_key(); | |||
}; | |||
class SimpleFastRC4 { | |||
public: | |||
static std::vector<uint8_t> decrypt_model(const void* model_mem, | |||
size_t size, | |||
const std::vector<uint8_t>& key); | |||
static std::vector<uint8_t> encrypt_model(const void* model_mem, | |||
size_t size, | |||
const std::vector<uint8_t>& key); | |||
static std::vector<uint8_t> get_decrypt_key(); | |||
}; | |||
} // namespace lite | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,53 @@
/** | |||
* \file src/function_base.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include <unordered_map> | |||
#include "misc.h" | |||
#include "type_info.h" | |||
// template <typename tensor_type, typename ...Arg> | |||
namespace lite { | |||
class TensorImplDft; | |||
class NetworkImplDft; | |||
namespace { | |||
template <typename class_type> | |||
struct class_type_name { | |||
std::string operator()() { return ""; } | |||
}; | |||
#define ADD_STATEMENT(class_name, backend_name) \ | |||
template <> \ | |||
struct class_type_name<class_name> { \ | |||
std::string operator()() { return #backend_name; } \ | |||
} | |||
ADD_STATEMENT(TensorImplDft, Dft); | |||
ADD_STATEMENT(NetworkImplDft, Dft); | |||
#undef ADD_STATEMENT | |||
} // namespace | |||
// if the function is not provided by the backend, silently ignore the call
template <typename tensor_type, typename ret_type, typename... Args> | |||
ret_type try_call_func(std::string func_name, Args... args) { | |||
mark_used_variable(func_name); | |||
mark_used_variable(args...); | |||
return nullptr; | |||
} | |||
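// Hypothetical usage sketch: callers dispatch optional backend-specific
// functions by name, e.g.
//   try_call_func<TensorImplDft, void*>("some_optional_function", args...);
// the generic template above is the silent fallback when no backend overload
// is provided, while call_func below throws instead.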
// if the function is not provided by the backend, throw an error
template <typename tensor_type, typename ret_type, typename... Args> | |||
ret_type call_func(std::string func_name, Args... args) { | |||
mark_used_variable(args...); | |||
auto backend_name = class_type_name<tensor_type>()(); | |||
auto msg_info = | |||
func_name + " is not aviliable in " + backend_name + " backend."; | |||
LITE_THROW(msg_info.c_str()); | |||
} | |||
} // namespace lite | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,256 @@
/** | |||
* \file src/global.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include <lite_build_config.h> | |||
#include "lite/global.h" | |||
#include "decryption/aes_decrypt.h" | |||
#include "decryption/decrypt_base.h" | |||
#include "decryption/rc4_cryption.h" | |||
#include "misc.h" | |||
#include "parse_info/parse_info_base.h" | |||
#include "parse_info/default_parse.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "megbrain/common.h" | |||
#include "megbrain/comp_node.h" | |||
#include "megbrain/serialization/extern_c_opr.h" | |||
#include "megbrain/version.h" | |||
#include "megcore_opencl.h" | |||
#include "mge/algo_cache/file_cache.h" | |||
#include "mge/common.h" | |||
#if MGB_ENABLE_TENSOR_RT | |||
#include "megbrain/tensorrt/tensorrt_engine_cache.h" | |||
#endif | |||
#if LITE_WITH_CUDA | |||
#include "mge/algo_cache/redis_cache.h" | |||
#endif | |||
#endif | |||
#include <mutex> | |||
#include <unordered_map> | |||
using namespace lite; | |||
lite::DecryptionStaticData& lite::decryption_static_data() { | |||
static lite::DecryptionStaticData global_map; | |||
return global_map; | |||
} | |||
void lite::get_version(int& major, int& minor, int& patch) { | |||
#if LITE_BUILD_WITH_MGE | |||
auto version = mgb::get_version(); | |||
major = version.major; | |||
minor = version.minor; | |||
patch = version.patch; | |||
#else | |||
    //! without MegEngine, report the maximum supported version
major = 8; | |||
minor = 9999; | |||
patch = 0; | |||
#endif | |||
} | |||
size_t lite::get_device_count(LiteDeviceType device_type) { | |||
#if LITE_BUILD_WITH_MGE | |||
auto mgb_device_type = to_compnode_locator(device_type).type; | |||
return mgb::CompNode::get_device_count(mgb_device_type); | |||
#else | |||
LITE_MARK_USED_VAR(device_type); | |||
LITE_THROW("no lite backend avialible, please check build macro."); | |||
#endif | |||
} | |||
bool lite::register_decryption_and_key(std::string decrypt_name, | |||
const DecryptionFunc& func, | |||
const std::vector<uint8_t>& key) { | |||
LITE_LOCK_GUARD(decryption_static_data().map_mutex); | |||
auto& global_map = decryption_static_data().decryption_methods; | |||
if (global_map.find(decrypt_name) != global_map.end()) { | |||
LITE_THROW(ssprintf("The decryption method %s is already registered.", | |||
decrypt_name.c_str())); | |||
return false; | |||
} else { | |||
auto key_pointer = std::make_shared<std::vector<uint8_t>>(key); | |||
global_map[decrypt_name] = {func, key_pointer}; | |||
LITE_LOG("Registered ecryption method %s.", decrypt_name.c_str()); | |||
return true; | |||
} | |||
} | |||
bool lite::update_decryption_or_key(std::string decrypt_name, | |||
const DecryptionFunc& func, | |||
const std::vector<uint8_t>& key) { | |||
LITE_LOCK_GUARD(decryption_static_data().map_mutex); | |||
auto& global_map = decryption_static_data().decryption_methods; | |||
if (global_map.find(decrypt_name) != global_map.end()) { | |||
std::shared_ptr<std::vector<uint8_t>> key_pointer; | |||
DecryptionFunc new_func; | |||
if (func) { | |||
new_func = func; | |||
LITE_LOG("%s decryption function is updated.", | |||
decrypt_name.c_str()); | |||
} else { | |||
new_func = global_map[decrypt_name].first; | |||
} | |||
if (key.size()) { | |||
key_pointer = std::make_shared<std::vector<uint8_t>>(key); | |||
LITE_LOG("%s decryption key is updated.", decrypt_name.c_str()); | |||
} else { | |||
key_pointer = global_map[decrypt_name].second; | |||
} | |||
global_map[decrypt_name] = {new_func, key_pointer}; | |||
return true; | |||
} else { | |||
LITE_THROW(ssprintf("The decryption method %s is not registered.", | |||
decrypt_name.c_str())); | |||
return false; | |||
} | |||
} | |||
lite::ParseInfoStaticData& lite::parse_info_static_data() { | |||
static lite::ParseInfoStaticData global_map; | |||
return global_map; | |||
} | |||
bool lite::register_parse_info_func(std::string info_type, | |||
const ParseInfoFunc& parse_func) { | |||
LITE_LOCK_GUARD(parse_info_static_data().map_mutex); | |||
auto& global_map = parse_info_static_data().parse_info_methods; | |||
if (global_map.find(info_type) != global_map.end()) { | |||
LITE_THROW(ssprintf("The parse info method %s is already registered.", | |||
info_type.c_str())); | |||
return false; | |||
} else { | |||
global_map[info_type] = parse_func; | |||
LITE_LOG("Registered infomation parser method %s.", info_type.c_str()); | |||
return true; | |||
} | |||
} | |||
#if LITE_BUILD_WITH_MGE | |||
namespace { | |||
struct CacheControl { | |||
LITE_MUTEX cache_mutex; | |||
std::string cache_type = "file"; | |||
std::atomic_size_t config_algo_times{0}; | |||
std::atomic_size_t config_trt_times{0}; | |||
}; | |||
CacheControl cache_control; | |||
} // namespace | |||
void lite::try_coalesce_all_free_memory() { | |||
mgb::CompNode::try_coalesce_all_free_memory(); | |||
} | |||
void lite::set_loader_lib_path(const std::string& loader_path) { | |||
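    // Open the extern-c-opr loader library and call its MegBrain init entry
    // (MGB_C_OPR_INIT_FUNC_STR), passing the versioned extern-c-opr API getter
    // so that the loader can register its custom operators.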
const char* lib_path = loader_path.c_str(); | |||
LITE_LOG("load a device loader of path %s.", lib_path); | |||
auto handle = dlopen(lib_path, RTLD_LAZY); | |||
LITE_ASSERT(handle, "failed to open c opr lib %s: %s", lib_path, dlerror()); | |||
const char* entry = MGB_C_OPR_INIT_FUNC_STR; | |||
auto func = dlsym(handle, entry); | |||
LITE_ASSERT(func, "can not resolve %s: %s", entry, dlerror()); | |||
typedef void (*entry_f_t)(void*); | |||
reinterpret_cast<entry_f_t>(func)( | |||
reinterpret_cast<void*>(&mgb_get_extern_c_opr_api_versioned)); | |||
} | |||
void lite::set_persistent_cache(const std::string& cache_path, | |||
bool always_sync) { | |||
LITE_LOCK_GUARD(cache_control.cache_mutex); | |||
cache_control.cache_type = "file"; | |||
if (cache_control.config_algo_times >= 1) { | |||
LITE_WARN( | |||
"The cache has been set,maybe some model is using now, change " | |||
"it now may cause unknow error!!"); | |||
} | |||
cache_control.config_algo_times++; | |||
mgb::PersistentCache::set_impl(std::make_shared<InFilePersistentCache>( | |||
cache_path.c_str(), always_sync)); | |||
} | |||
void lite::dump_persistent_cache(const std::string& cache_path) { | |||
LITE_LOCK_GUARD(cache_control.cache_mutex); | |||
LITE_ASSERT(cache_control.cache_type == "file", | |||
"now cache type is redis, it can't be dumped."); | |||
static_cast<InFilePersistentCache&>(mgb::PersistentCache::inst()) | |||
.dump_cache(cache_path.c_str()); | |||
} | |||
//! Set the TensorRT engine cache path for serialized prebuilt ICudaEngine | |||
void lite::set_tensor_rt_cache(std::string tensorrt_cache_path) { | |||
#if MGB_ENABLE_TENSOR_RT | |||
LITE_LOCK_GUARD(cache_control.cache_mutex); | |||
if (cache_control.config_trt_times >= 1) { | |||
LITE_WARN( | |||
"The trt cache has been set,maybe some model is using now, " | |||
"change it now may cause unknow error!!"); | |||
} | |||
cache_control.config_trt_times++; | |||
mgb::TensorRTEngineCache::enable_engine_cache(true); | |||
mgb::TensorRTEngineCache::set_impl( | |||
std::make_shared<mgb::TensorRTEngineCacheIO>(tensorrt_cache_path)); | |||
#else | |||
LITE_MARK_USED_VAR(tensorrt_cache_path); | |||
LITE_THROW("TensorRT is disable at compile time."); | |||
#endif | |||
} | |||
void lite::dump_tensor_rt_cache() { | |||
#if MGB_ENABLE_TENSOR_RT | |||
if (mgb::TensorRTEngineCache::enable_engine_cache()) { | |||
mgb::TensorRTEngineCache::inst().dump_cache(); | |||
} | |||
#else | |||
LITE_THROW("TensorRT is disable at compile time."); | |||
#endif | |||
} | |||
#else //LITE_BUILD_WITH_MGE | |||
void lite::try_coalesce_all_free_memory() {} | |||
void lite::set_loader_lib_path(const std::string& ) { | |||
LITE_THROW("mge is disbale at build time, please build with mge"); | |||
} | |||
void lite::set_persistent_cache(const std::string&, bool) { | |||
LITE_THROW("mge is disbale at build time, please build with mge"); | |||
} | |||
void lite::dump_persistent_cache(const std::string& ) { | |||
LITE_THROW("mge is disbale at build time, please build with mge"); | |||
} | |||
//! Set the TensorRT engine cache path for serialized prebuilt ICudaEngine | |||
void lite::set_tensor_rt_cache(std::string ) { | |||
LITE_THROW("mge is disbale at build time, please build with mge"); | |||
} | |||
void lite::dump_tensor_rt_cache() { | |||
LITE_THROW("mge is disbale at build time, please build with mge"); | |||
} | |||
#endif | |||
namespace lite { | |||
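//! The built-in decryption methods and the default model-info parser are
//! registered here at static-initialization time through the __COUNTER__-based
//! registrar structs behind REGIST_DECRYPTION_METHOD (decrypt_base.h) and,
//! presumably, REGIST_PARSE_INFO_FUNCTION (parse_info_base.h).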
REGIST_DECRYPTION_METHOD("AES_default", lite::AESDcryption::decrypt_model, | |||
lite::AESDcryption::get_decrypt_key()); | |||
REGIST_DECRYPTION_METHOD("RC4_default", lite::RC4::decrypt_model, | |||
lite::RC4::get_decrypt_key()); | |||
REGIST_DECRYPTION_METHOD("SIMPLE_FAST_RC4_default", | |||
lite::SimpleFastRC4::decrypt_model, | |||
lite::SimpleFastRC4::get_decrypt_key()); | |||
REGIST_PARSE_INFO_FUNCTION("LITE_default", lite::default_parse_info); | |||
} // namespace lite | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,37 @@
/** | |||
* \file lite/src/lite_build_config.h.in | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#ifndef _HEADER_LITE_BUILD_CONFIG | |||
#define _HEADER_LITE_BUILD_CONFIG | |||
#cmakedefine01 LITE_ENABLE_LOGGING | |||
#cmakedefine01 LITE_ENABLE_EXCEPTION | |||
#cmakedefine01 LITE_WITH_CUDA | |||
#cmakedefine01 LITE_ASSERT_LOC | |||
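// Each #cmakedefine01 above is replaced by CMake's configure_file() with
// "#define <MACRO> 0" or "#define <MACRO> 1"; the fallbacks below only take
// effect when this header is used without being configured by CMake.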
#ifndef LITE_ENABLE_LOGGING | |||
#define LITE_ENABLE_LOGGING 1 | |||
#endif | |||
#ifndef LITE_ENABLE_EXCEPTION | |||
#if __cpp_exceptions || __EXCEPTIONS || \ | |||
(defined(_MSC_VER) && defined(_CPPUNWIND)) | |||
#define LITE_ENABLE_EXCEPTION 1 | |||
#else | |||
#define LITE_ENABLE_EXCEPTION 0 | |||
#endif | |||
#endif | |||
#ifndef LITE_WITH_CUDA | |||
#define LITE_WITH_CUDA 0 | |||
#endif | |||
#ifndef LITE_ASSERT_LOC | |||
#define LITE_ASSERT_LOC 0 | |||
#endif | |||
#endif // _HEADER_LITE_BUILD_CONFIG |
@@ -0,0 +1,254 @@ | |||
/** | |||
* \file lite/src/mge/algo_cache/file_cache.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "lite_build_config.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "../common.h" | |||
#include "file_cache.h" | |||
using namespace lite; | |||
//////////////////////// InFilePersistentCache::InputMemory /////////////// | |||
class InFilePersistentCache::InputMemory { | |||
const uint8_t* m_ptr; | |||
size_t m_offset = 0; | |||
size_t m_size; | |||
public: | |||
InputMemory(const uint8_t* bin, size_t size) : m_ptr{bin}, m_size{size} {} | |||
template <typename T> | |||
void read(T& val) { | |||
static_assert(std::is_trivially_copyable<T>::value, | |||
"only support trivially copyable type"); | |||
LITE_ASSERT(m_offset + sizeof(T) <= m_size); | |||
memcpy(&val, m_ptr, sizeof(T)); | |||
m_offset += sizeof(T); | |||
m_ptr += sizeof(T); | |||
} | |||
template <typename T> | |||
void read(T* buf, size_t size) { | |||
static_assert(std::is_trivially_copyable<T>::value && sizeof(T) == 1, | |||
"only support read bytes"); | |||
LITE_ASSERT(m_offset + size <= m_size); | |||
memcpy(buf, m_ptr, size); | |||
m_offset += size; | |||
m_ptr += size; | |||
} | |||
}; | |||
//////////////////////// InFilePersistentCache::InputFile /////////////// | |||
class InFilePersistentCache::InputFile { | |||
FILE* m_fp; | |||
public: | |||
InputFile(const char* path) : m_fp{fopen(path, "rb")} { | |||
LITE_ASSERT(m_fp, "failed to open %s: %s", path, strerror(errno)); | |||
} | |||
~InputFile() { | |||
if (m_fp) { | |||
fclose(m_fp); | |||
} | |||
} | |||
template <typename T> | |||
void read(T& val) { | |||
static_assert(std::is_trivially_copyable<T>::value, | |||
"only support trivially copyable type"); | |||
auto ret = fread(&val, sizeof(T), 1, m_fp); | |||
LITE_ASSERT(ret == 1); | |||
} | |||
template <typename T> | |||
void read(T* buf, size_t size) { | |||
static_assert(std::is_trivially_copyable<T>::value && sizeof(T) == 1, | |||
"only support read bytes"); | |||
auto ret = fread(buf, size, 1, m_fp); | |||
LITE_ASSERT(ret == 1); | |||
} | |||
}; | |||
//////////////////////// InFilePersistentCache::OutputFile /////////////// | |||
class InFilePersistentCache::OutputFile { | |||
FILE* m_fp; | |||
public: | |||
OutputFile(const char* path) : m_fp{fopen(path, "wb")} { | |||
LITE_ASSERT(m_fp, "failed to open %s: %s", path, strerror(errno)); | |||
} | |||
~OutputFile() { | |||
if (m_fp) { | |||
fclose(m_fp); | |||
} | |||
} | |||
template <typename T> | |||
void write(T val) { | |||
auto ret = fwrite(&val, sizeof(T), 1, m_fp); | |||
LITE_ASSERT(ret == 1); | |||
} | |||
template <typename T> | |||
void write(const T* buf, size_t size) { | |||
static_assert(sizeof(T) == 1, "only support write bytes"); | |||
auto ret = fwrite(buf, size, 1, m_fp); | |||
LITE_ASSERT(ret == 1); | |||
} | |||
void flush() { fflush(m_fp); } | |||
void set_head() { fseek(m_fp, 0, SEEK_SET); } | |||
}; | |||
//////////////////////// InFilePersistentCache::BlobStorage /////////////// | |||
template <typename Input> | |||
InFilePersistentCache::BlobStorage& | |||
InFilePersistentCache::BlobStorage::init_from_input(Input& inp) { | |||
uint32_t data_size; | |||
inp.read(data_size); | |||
size = data_size; | |||
data_refhold = std::make_unique<uint8_t[]>(size); | |||
inp.read(data_refhold.get(), size); | |||
ptr = data_refhold.get(); | |||
return *this; | |||
} | |||
void InFilePersistentCache::BlobStorage::write_to_file( | |||
OutputFile& out_file) const { | |||
uint32_t u_size = size; | |||
out_file.write(u_size); | |||
out_file.write(data_refhold.get(), u_size); | |||
} | |||
InFilePersistentCache::BlobStorage& | |||
InFilePersistentCache::BlobStorage::init_data_ref(const Blob& b) { | |||
data_refhold = std::make_unique<uint8_t[]>(b.size + 1); | |||
memcpy(data_refhold.get(), b.ptr, b.size); | |||
data_refhold.get()[b.size] = 0; // for C-string safety | |||
ptr = data_refhold.get(); | |||
size = b.size; | |||
return *this; | |||
} | |||
//////////////////////// InFilePersistentCache ////////////////////// | |||
template <typename Input> | |||
void InFilePersistentCache::read_cache(Input& inp) { | |||
uint32_t nr_category; | |||
inp.read(nr_category); | |||
char category_buf[256]; | |||
for (uint32_t i = 0; i < nr_category; i++) { | |||
uint32_t category_size; | |||
        inp.read(category_size);
        LITE_ASSERT(category_size < sizeof(category_buf),
                    "category name in cache file is too long");
        inp.read(category_buf, category_size);
        category_buf[category_size] = '\0';
std::string category(category_buf); | |||
mgb_log_debug("load new category: %s", category_buf); | |||
        // read blobs
uint32_t nr_bobs; | |||
inp.read(nr_bobs); | |||
for (uint32_t j = 0; j < nr_bobs; j++) { | |||
BlobStorage key_storage; | |||
key_storage.init_from_input(inp).init_hash(); | |||
mgb_log_debug("read key: %zu", key_storage.hash); | |||
m_cache[category][std::move(key_storage)].init_from_input(inp); | |||
} | |||
} | |||
} | |||
InFilePersistentCache::InFilePersistentCache(const char* path, | |||
bool always_open) { | |||
if (!access(path, F_OK)) { | |||
mgb_log_debug("use fastrun cache: %s", path); | |||
InputFile inp(path); | |||
read_cache<InputFile>(inp); | |||
} | |||
if (always_open) { | |||
m_always_open_file = std::make_shared<OutputFile>(path); | |||
} | |||
} | |||
InFilePersistentCache::InFilePersistentCache(const uint8_t* bin, size_t size) { | |||
LITE_ASSERT(bin); | |||
InputMemory inp(bin, size); | |||
read_cache<InputMemory>(inp); | |||
} | |||
void InFilePersistentCache::dump_cache(const char* path) { | |||
OutputFile out_file(path); | |||
dump_cache(&out_file); | |||
} | |||
void InFilePersistentCache::dump_cache(OutputFile* out_file) { | |||
uint32_t nr_category = m_cache.size(); | |||
out_file->write(nr_category); | |||
for (const auto& cached_category : m_cache) { | |||
uint32_t category_size = cached_category.first.size(); | |||
out_file->write(category_size); | |||
out_file->write(cached_category.first.data(), category_size); | |||
mgb_log_debug("write new category: %s", cached_category.first.c_str()); | |||
uint32_t nr_bobs = cached_category.second.size(); | |||
out_file->write(nr_bobs); | |||
for (const auto& item : cached_category.second) { | |||
mgb_log_debug("dump key: %zu", item.first.hash); | |||
item.first.write_to_file(*out_file); | |||
item.second.write_to_file(*out_file); | |||
} | |||
} | |||
} | |||
mgb::Maybe<InFilePersistentCache::Blob> InFilePersistentCache::get( | |||
const std::string& category, const Blob& key) { | |||
decltype(m_cache.begin()) iter0; | |||
{ | |||
MGB_LOCK_GUARD(m_mtx); | |||
iter0 = m_cache.find(category); | |||
if (iter0 == m_cache.end()) | |||
return mgb::None; | |||
} | |||
BlobStorage key_storage; | |||
key_storage.Blob::operator=(key); | |||
key_storage.init_hash(); | |||
MGB_LOCK_GUARD(m_mtx); | |||
auto iter1 = iter0->second.find(key_storage); | |||
if (iter1 == iter0->second.end()) | |||
return mgb::None; | |||
return iter1->second; | |||
} | |||
void InFilePersistentCache::put(const std::string& category, const Blob& key, | |||
const Blob& value) { | |||
BlobStorage key_storage; | |||
key_storage.init_data_ref(key).init_hash(); | |||
MGB_LOCK_GUARD(m_mtx); | |||
auto size0 = m_cache.size(); | |||
m_cache[category][std::move(key_storage)].init_data_ref(value); | |||
if (m_cache.size() > size0) { | |||
mgb_log_debug("new cache category: %s", category.c_str()); | |||
} | |||
if (m_always_open_file) { | |||
m_always_open_file->set_head(); | |||
dump_cache(m_always_open_file.get()); | |||
m_always_open_file->flush(); | |||
} | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,85 @@ | |||
/** | |||
* \file lite/src/mge/algo_cache/file_cache.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "lite_build_config.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "megbrain/utils/persistent_cache.h" | |||
namespace lite { | |||
/**
 * Dump format (all integers are stored in the local/native endianness, which
 * is effectively little endian on the supported targets):
 *
 * <nr_category|uint32_t>
 * [<category_size|uint32_t><category|uint8_t*>
 *  <nr_blob|uint32_t>[<key_size|uint32_t><key|uint8_t*>
 *                     <data_size|uint32_t><data|uint8_t*>]*]*
 */
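/**
 * For example (illustrative only), a cache with one category "cat" holding a
 * single blob pair with a 4-byte key "key1" and a 2-byte value "ab" is stored
 * as:
 *
 *   uint32 1           // nr_category
 *   uint32 3  "cat"    // category_size, category bytes
 *   uint32 1           // nr_blob
 *   uint32 4  "key1"   // key_size, key bytes
 *   uint32 2  "ab"     // data_size, data bytes
 *
 * See read_cache()/dump_cache() in file_cache.cpp for the authoritative
 * serialization code.
 */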
//! TODO: fix the case where one thread sets the cache while other threads
//! are still using the old cache
class InFilePersistentCache final : public mgb::PersistentCache { | |||
class InputFile; | |||
class InputMemory; | |||
class OutputFile; | |||
struct BlobStorage : public Blob { | |||
std::unique_ptr<uint8_t[]> data_refhold; | |||
size_t hash = 0; | |||
template <typename Input> | |||
BlobStorage& init_from_input(Input& inp); | |||
void write_to_file(OutputFile& out_file) const; | |||
BlobStorage& init_data_ref(const Blob& b); | |||
BlobStorage& init_hash() { | |||
hash = mgb::XXHash{}.update(ptr, size).digest(); | |||
return *this; | |||
} | |||
bool operator==(const BlobStorage& rhs) const { | |||
return size == rhs.size && !memcmp(ptr, rhs.ptr, size); | |||
} | |||
struct Hash { | |||
size_t operator()(const BlobStorage& b) const { return b.hash; } | |||
}; | |||
}; | |||
std::unordered_map<std::string, std::unordered_map<BlobStorage, BlobStorage, | |||
BlobStorage::Hash>> | |||
m_cache; | |||
LITE_MUTEX m_mtx; | |||
std::shared_ptr<OutputFile> m_always_open_file; | |||
template <typename Input> | |||
void read_cache(Input& inp); | |||
public: | |||
InFilePersistentCache() = default; | |||
InFilePersistentCache(const char* path, bool always_open = false); | |||
InFilePersistentCache(const uint8_t* bin, size_t size); | |||
/** | |||
     * \warning You should invoke \c dump_cache manually to save the cache
* file. | |||
*/ | |||
void dump_cache(const char* path); | |||
void dump_cache(OutputFile* out_file); | |||
mgb::Maybe<Blob> get(const std::string& category, const Blob& key) override; | |||
void put(const std::string& category, const Blob& key, | |||
const Blob& value) override; | |||
}; | |||
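//! A minimal usage sketch (assuming a cache file name "algo_cache.bin" and a
//! hypothetical setup function, neither of which is part of the lite API):
//! the cache is installed globally through mgb::PersistentCache::set_impl()
//! and written back with dump_cache():
//!
//!     void setup_algo_cache() {
//!         auto cache = std::make_shared<lite::InFilePersistentCache>(
//!                 "algo_cache.bin", /*always_open=*/false);
//!         mgb::PersistentCache::set_impl(cache);
//!         // ... load and run networks, fastrun results get recorded ...
//!         cache->dump_cache("algo_cache.bin");
//!     }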
} // namespace lite | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,241 @@ | |||
/** | |||
* \file lite/src/mge/algo_cache/redis_cache.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "lite_build_config.h" | |||
#if !defined(WIN32) && LITE_BUILD_WITH_MGE && LITE_WITH_CUDA | |||
#include "../../misc.h" | |||
#include "redis_cache.h" | |||
#include <iostream> | |||
#include <vector> | |||
namespace { | |||
/* | |||
** Translation Table as described in RFC1113 | |||
*/ | |||
static const char cb64[] = | |||
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; | |||
/* | |||
** Translation Table to decode: | |||
*https://github.com/dgiardini/imgcalkap/blob/master/base64.c | |||
*/ | |||
static const char cd64[] = | |||
"|$$$}rstuvwxyz{$$$$$$$>?@ABCDEFGHIJKLMNOPQRSTUVW$$$$$$XYZ[\\]^_`" | |||
"abcdefghijklmnopq"; | |||
/* | |||
** encodeblock | |||
** | |||
** encode 3 8-bit binary bytes as 4 '6-bit' characters | |||
*/ | |||
void encodeblock(unsigned char in[3], unsigned char out[4], int len) { | |||
out[0] = cb64[in[0] >> 2]; | |||
out[1] = cb64[((in[0] & 0x03) << 4) | ((in[1] & 0xf0) >> 4)]; | |||
out[2] = (unsigned char)(len > 1 ? cb64[((in[1] & 0x0f) << 2) | | |||
((in[2] & 0xc0) >> 6)] | |||
: '='); | |||
out[3] = (unsigned char)(len > 2 ? cb64[in[2] & 0x3f] : '='); | |||
} | |||
/* | |||
** decodeblock | |||
** | |||
** decode 4 '6-bit' characters into 3 8-bit binary bytes | |||
*/ | |||
void decodeblock(unsigned char in[4], unsigned char out[3]) { | |||
out[0] = (unsigned char)(in[0] << 2 | in[1] >> 4); | |||
out[1] = (unsigned char)(in[1] << 4 | in[2] >> 2); | |||
out[2] = (unsigned char)(((in[2] << 6) & 0xc0) | in[3]); | |||
} | |||
/** | |||
* Encode string to base64 string | |||
* @param input - source string | |||
* @param outdata - target base64 string | |||
* @param linesize - max size of line | |||
*/ | |||
void encode(const std::vector<std::uint8_t>& input, | |||
std::vector<std::uint8_t>& outdata, int linesize = 76) { | |||
outdata.clear(); | |||
unsigned char in[3], out[4]; | |||
int i, len, blocksout = 0; | |||
size_t j = 0; | |||
auto* indata = reinterpret_cast<const unsigned char*>(input.data()); | |||
unsigned int insize = input.size(); | |||
while (j <= insize) { | |||
len = 0; | |||
        for (i = 0; i < 3; i++) {
            // only read within bounds; pad the trailing block with zeros
            if (j < insize) {
                in[i] = (unsigned char)indata[j];
                len++;
            } else {
                in[i] = 0;
            }
            j++;
        }
if (len) { | |||
encodeblock(in, out, len); | |||
for (i = 0; i < 4; i++) { | |||
outdata.push_back(out[i]); | |||
} | |||
blocksout++; | |||
} | |||
if (blocksout >= (linesize / 4) || (j == insize)) { | |||
if (blocksout) { | |||
outdata.push_back('\r'); | |||
outdata.push_back('\n'); | |||
} | |||
blocksout = 0; | |||
} | |||
} | |||
} | |||
/**
 * Decode a base64 string back to the source string
 * @param input - base64 string
 * @param outdata - decoded source string
 */
void decode(const std::vector<std::uint8_t>& input, | |||
std::vector<std::uint8_t>& outdata) { | |||
outdata.clear(); | |||
unsigned char in[4], out[3], v; | |||
int i, len; | |||
size_t j = 0; | |||
auto* indata = reinterpret_cast<const unsigned char*>(input.data()); | |||
unsigned int insize = input.size(); | |||
while (j <= insize) { | |||
for (len = 0, i = 0; i < 4 && (j <= insize); i++) { | |||
v = 0; | |||
while ((j <= insize) && v == 0) { | |||
                v = (j < insize) ? (unsigned char)indata[j] : 0;  // avoid OOB read
                j++;
v = (unsigned char)((v < 43 || v > 122) ? 0 : cd64[v - 43]); | |||
if (v) { | |||
v = (unsigned char)((v == '$') ? 0 : v - 61); | |||
} | |||
} | |||
if (j <= insize) { | |||
len++; | |||
if (v) { | |||
in[i] = (unsigned char)(v - 1); | |||
} | |||
} else { | |||
in[i] = 0; | |||
} | |||
} | |||
if (len) { | |||
decodeblock(in, out); | |||
for (i = 0; i < len - 1; i++) { | |||
outdata.push_back(out[i]); | |||
} | |||
} | |||
} | |||
} | |||
/** | |||
* Encode binary data to base64 buffer | |||
* @param input - source data | |||
* @param outdata - target base64 buffer | |||
* @param linesize | |||
*/ | |||
void encode(const std::string& input, std::string& outdata, int linesize = 76) { | |||
std::vector<std::uint8_t> out; | |||
std::vector<std::uint8_t> in(input.begin(), input.end()); | |||
encode(in, out, linesize); | |||
outdata = std::string(out.begin(), out.end()); | |||
} | |||
/** | |||
* Decode base64 buffer to source binary data | |||
* @param input - base64 buffer | |||
* @param outdata - source binary data | |||
*/ | |||
void decode(const std::string& input, std::string& outdata) { | |||
std::vector<std::uint8_t> in(input.begin(), input.end()); | |||
std::vector<std::uint8_t> out; | |||
decode(in, out); | |||
outdata = std::string(out.begin(), out.end()); | |||
} | |||
} // namespace | |||
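// Note: get()/put() below store entries under base64(category + '@' + key),
// and values are base64-encoded the same way before being sent with SET,
// presumably so that arbitrary binary keys/values stay printable redis
// strings.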
using namespace lite; | |||
RedisCache::RedisCache(std::string redis_ip, size_t port, std::string password) | |||
: m_ip(redis_ip), m_port(port), m_password(password) { | |||
m_client.auth(password); | |||
m_client.connect( | |||
m_ip, m_port, | |||
[](const std::string& host, std::size_t port, | |||
cpp_redis::connect_state status) { | |||
if (status == cpp_redis::connect_state::dropped) { | |||
LITE_LOG("client disconnected from %s.", host.c_str()); | |||
LITE_LOG("Redis server connect to %s :%zu failed.", | |||
host.c_str(), port); | |||
} | |||
}, | |||
std::uint32_t(200)); | |||
} | |||
mgb::Maybe<mgb::PersistentCache::Blob> RedisCache::get( | |||
const std::string& category, const mgb::PersistentCache::Blob& key) { | |||
LITE_LOCK_GUARD(m_mtx); | |||
if (m_old == nullptr) { | |||
return mgb::None; | |||
} | |||
auto mem_result = m_old->get(category, key); | |||
if (mem_result.valid()) | |||
return mem_result; | |||
std::string key_str(static_cast<const char*>(key.ptr), key.size); | |||
std::string redis_key_str; | |||
encode(category + '@' + key_str, redis_key_str, 24); | |||
auto result = m_client.get(redis_key_str); | |||
m_client.sync_commit<double, std::milli>(std::chrono::milliseconds(100)); | |||
LITE_ASSERT(is_valid()); | |||
auto content = result.get(); | |||
if (content.is_null()) | |||
return mgb::None; | |||
std::string decode_content; | |||
decode(content.as_string(), decode_content); | |||
m_old->put(category, key, {decode_content.data(), decode_content.length()}); | |||
return m_old->get(category, key); | |||
} | |||
void RedisCache::put(const std::string& category, const Blob& key, | |||
const mgb::PersistentCache::Blob& value) { | |||
// ScopedTimer t1(std::string("put") + category); | |||
LITE_LOCK_GUARD(m_mtx); | |||
std::string key_str(static_cast<const char*>(key.ptr), key.size); | |||
std::string redis_key_str; | |||
encode(category + '@' + key_str, redis_key_str); | |||
std::string value_str(static_cast<const char*>(value.ptr), value.size); | |||
std::string redis_value_str; | |||
encode(value_str, redis_value_str); | |||
auto result = m_client.set(redis_key_str, redis_value_str); | |||
if (m_old == nullptr) { | |||
return; | |||
} | |||
m_old->put(category, key, value); | |||
m_client.sync_commit<double, std::milli>(std::chrono::milliseconds(100)); | |||
LITE_ASSERT(is_valid()); | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,47 @@ | |||
/** | |||
* \file lite/src/mge/algo_cache/redis_cache.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2020 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "lite_build_config.h" | |||
#if !defined(WIN32) && LITE_BUILD_WITH_MGE && LITE_WITH_CUDA | |||
#include <cpp_redis/cpp_redis> | |||
#include <string> | |||
#include <vector> | |||
#include "megbrain/utils/persistent_cache.h" | |||
namespace lite { | |||
//! TODO: fix the case where one thread sets the cache while other threads
//! are still using the old cache
class RedisCache final : public mgb::PersistentCache { | |||
public: | |||
RedisCache(std::string redis_ip, size_t port, std::string password); | |||
bool is_valid() { return m_client.is_connected(); } | |||
~RedisCache() {} | |||
void init(std::shared_ptr<mgb::PersistentCache> old) { m_old = old; } | |||
mgb::Maybe<Blob> get(const std::string& category, const Blob& key) override; | |||
void put(const std::string& category, const Blob& key, | |||
const Blob& value) override; | |||
private: | |||
std::shared_ptr<mgb::PersistentCache> m_old; | |||
LITE_MUTEX m_mtx; | |||
cpp_redis::client m_client; | |||
const std::string m_ip; | |||
const size_t m_port; | |||
const std::string m_password; | |||
}; | |||
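//! A minimal usage sketch (hypothetical ip/port/password values, and assuming
//! file_cache.h is also included): the redis cache is chained to a local
//! cache via init(), which get()/put() use as a read-through layer, and is
//! then installed as the global PersistentCache:
//!
//!     auto redis = std::make_shared<lite::RedisCache>(
//!             "127.0.0.1", 6379, "password");
//!     redis->init(std::make_shared<lite::InFilePersistentCache>());
//!     if (redis->is_valid())
//!         mgb::PersistentCache::set_impl(redis);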
} // namespace lite | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,191 @@ | |||
/** | |||
* \file src/mge/common.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "lite_build_config.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "common.h" | |||
#include "megdnn/dtype.h" | |||
using namespace lite; | |||
using namespace mgb; | |||
enum class CompressionMethod { | |||
NO_COMPRESSION = 0, | |||
FLOAT32_STRIDE_FLOAT32_BASE_UINT8_WEIGHTS = 1, | |||
FLOAT32_STRIDE_FLOAT32_BASE_UINT16_WEIGHTS = 2, | |||
}; | |||
void lite::decompressed_tensor_value_loader( | |||
void* ptr_, const mgb::TensorLayout& layout, | |||
mgb::serialization::InputFile& fin) { | |||
uint8_t compress_flag; | |||
fin.read(&compress_flag, sizeof(compress_flag)); | |||
size_t num_weights = layout.total_nr_elems(); | |||
switch (CompressionMethod(compress_flag)) { | |||
case CompressionMethod::NO_COMPRESSION: { | |||
mgb::serialization::GraphLoadConfig::default_tensor_value_loader( | |||
ptr_, layout, fin); | |||
break; | |||
} | |||
case CompressionMethod::FLOAT32_STRIDE_FLOAT32_BASE_UINT8_WEIGHTS: { | |||
if (ptr_) { | |||
float stride, base; | |||
std::vector<uint8_t> weights(num_weights); | |||
fin.read(&stride, sizeof(stride)); | |||
fin.read(&base, sizeof(base)); | |||
fin.read(weights.data(), num_weights * sizeof(uint8_t)); | |||
auto* ptr = static_cast<float*>(ptr_); | |||
for (size_t i = 0; i < num_weights; ++i) | |||
ptr[i] = stride * weights[i] + base; | |||
} else { | |||
fin.skip(sizeof(float) * 2 + num_weights * sizeof(uint8_t)); | |||
} | |||
break; | |||
} | |||
case CompressionMethod::FLOAT32_STRIDE_FLOAT32_BASE_UINT16_WEIGHTS: { | |||
if (ptr_) { | |||
float stride, base; | |||
std::vector<uint16_t> weights(num_weights); | |||
fin.read(&stride, sizeof(stride)); | |||
fin.read(&base, sizeof(base)); | |||
fin.read(weights.data(), num_weights * sizeof(uint16_t)); | |||
auto* ptr = static_cast<float*>(ptr_); | |||
for (size_t i = 0; i < num_weights; ++i) | |||
ptr[i] = stride * weights[i] + base; | |||
} else { | |||
fin.skip(sizeof(float) * 2 + num_weights * sizeof(uint16_t)); | |||
} | |||
break; | |||
} | |||
default: | |||
LITE_THROW("Unexpected compression method"); | |||
} | |||
} | |||
LTensorLayout lite::to_impl_layout(const Layout& layout) { | |||
mgb::TensorLayout mge_layout; | |||
mge_layout.ndim = layout.ndim; | |||
    LITE_ASSERT(layout.ndim < TensorShape::MAX_NDIM,
                "lite layout ndim is too large");
for (size_t i = 0; i < layout.ndim; i++) { | |||
mge_layout.shape[i] = layout.shapes[i]; | |||
} | |||
mge_layout.init_contiguous_stride(); | |||
switch (layout.data_type) { | |||
case LiteDataType::LITE_FLOAT: | |||
mge_layout.dtype = mgb::dtype::Float32(); | |||
break; | |||
case LiteDataType::LITE_HALF: | |||
mge_layout.dtype = mgb::dtype::Float16(); | |||
break; | |||
case LiteDataType::LITE_INT: | |||
mge_layout.dtype = mgb::dtype::Int32(); | |||
break; | |||
case LiteDataType::LITE_INT8: | |||
mge_layout.dtype = mgb::dtype::Int8(); | |||
break; | |||
case LiteDataType::LITE_UINT8: | |||
mge_layout.dtype = mgb::dtype::Uint8(); | |||
break; | |||
case LiteDataType::LITE_INT16: | |||
mge_layout.dtype = mgb::dtype::Int16(); | |||
break; | |||
default: | |||
LITE_THROW(mgb::ssprintf("unsupport dtype in lite enum id is %d.", | |||
static_cast<int>(layout.data_type))); | |||
} | |||
return mge_layout; | |||
} | |||
Layout lite::to_lite_layout(const LTensorLayout& mge_layout) { | |||
Layout layout; | |||
if (!mge_layout.dtype.valid()) { | |||
return layout; | |||
} | |||
layout.ndim = mge_layout.ndim; | |||
    LITE_ASSERT(layout.ndim < layout.MAXDIM, "tensor layout ndim is too large");
for (size_t i = 0; i < layout.ndim; i++) { | |||
layout.shapes[i] = mge_layout.shape[i]; | |||
} | |||
switch (mge_layout.dtype.enumv()) { | |||
case mgb::DTypeEnum::Float32: | |||
layout.data_type = LiteDataType::LITE_FLOAT; | |||
break; | |||
case mgb::DTypeEnum::Float16: | |||
layout.data_type = LiteDataType::LITE_HALF; | |||
break; | |||
case mgb::DTypeEnum::Int32: | |||
layout.data_type = LiteDataType::LITE_INT; | |||
break; | |||
case mgb::DTypeEnum::Int16: | |||
layout.data_type = LiteDataType::LITE_INT16; | |||
break; | |||
case mgb::DTypeEnum::Int8: | |||
layout.data_type = LiteDataType::LITE_INT8; | |||
break; | |||
case mgb::DTypeEnum::Uint8: | |||
layout.data_type = LiteDataType::LITE_UINT8; | |||
break; | |||
default: | |||
LITE_THROW(mgb::ssprintf("unsupport dtype in lite : %s.", | |||
mge_layout.to_string().c_str())); | |||
} | |||
return layout; | |||
} | |||
mgb::CompNode::Locator lite::to_compnode_locator(const LiteDeviceType& device) { | |||
mgb::CompNode::Locator loc; | |||
switch (device) { | |||
case LiteDeviceType::LITE_CPU: | |||
loc.type = mgb::CompNode::DeviceType::CPU; | |||
break; | |||
case LiteDeviceType::LITE_CUDA: | |||
loc.type = mgb::CompNode::DeviceType::CUDA; | |||
break; | |||
case LiteDeviceType::LITE_ATLAS: | |||
loc.type = mgb::CompNode::DeviceType::ATLAS; | |||
break; | |||
case LiteDeviceType::LITE_OPENCL: | |||
loc.type = mgb::CompNode::DeviceType::OPENCL; | |||
break; | |||
case LiteDeviceType::LITE_DEVICE_DEFAULT: | |||
loc.type = mgb::CompNode::DeviceType::UNSPEC; | |||
break; | |||
default: | |||
LITE_THROW( | |||
ssprintf("lite unsupported compnode type: enum value: %d.", | |||
(int)(device))); | |||
} | |||
return loc; | |||
} | |||
LiteDeviceType lite::get_device_from_locator( | |||
const mgb::CompNode::Locator& locator) { | |||
switch (locator.type) { | |||
case mgb::CompNode::DeviceType::CPU: | |||
case mgb::CompNode::DeviceType::MULTITHREAD: | |||
return LiteDeviceType::LITE_CPU; | |||
case mgb::CompNode::DeviceType::CUDA: | |||
return LiteDeviceType::LITE_CUDA; | |||
case mgb::CompNode::DeviceType::ATLAS: | |||
return LiteDeviceType::LITE_ATLAS; | |||
case mgb::CompNode::DeviceType::OPENCL: | |||
return LiteDeviceType::LITE_OPENCL; | |||
case mgb::CompNode::DeviceType::UNSPEC: | |||
return LiteDeviceType::LITE_DEVICE_DEFAULT; | |||
default: | |||
LITE_THROW( | |||
ssprintf("lite unsupported compnode type: enum value: %d.", | |||
(int)(locator.type))); | |||
} | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,66 @@ | |||
/** | |||
* \file src/mge/common.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "lite_build_config.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "../misc.h" | |||
#include "lite/network.h" | |||
#include "lite/tensor.h" | |||
#include "megbrain/comp_node.h" | |||
#include "megbrain/serialization/serializer.h" | |||
#include "megbrain/tensor.h" | |||
//! alias mgb types with an "L" prefix
namespace lite { | |||
using LTensorLayout = mgb::TensorLayout; | |||
using LComputingGraph = mgb::ComputingGraph; | |||
using LDeviceTensorStorage = mgb::DeviceTensorStorage; | |||
} // namespace lite | |||
namespace lite { | |||
/*! | |||
* \brief transform mgelite Layout to mgb TensorLayout | |||
*/ | |||
LTensorLayout to_impl_layout(const Layout& layout); | |||
/*! | |||
* \brief transform mgb TensorLayout to mgelite Layout | |||
*/ | |||
Layout to_lite_layout(const mgb::TensorLayout& mge_layout); | |||
/*! | |||
* \brief transform mgelite device to mgb CompNode Locator | |||
*/ | |||
mgb::CompNode::Locator to_compnode_locator(const LiteDeviceType& device); | |||
/*! | |||
* \brief transform mgb CompNode Locator to lite Device | |||
*/ | |||
LiteDeviceType get_device_from_locator(const mgb::CompNode::Locator& locator); | |||
/*! \brief A megbrain tensor loader with weight decompression. | |||
* | |||
 * The compressed weight must start with a one-byte compression flag (CF).
* | |||
* 1. CF = 0: no compression. | |||
* 2. CF = 1: float32 stride + float32 base + uint8 weight (return s*w+b) | |||
* 3. CF = 2: float32 stride + float32 base + uint16 weight (return s*w+b) | |||
* | |||
*/ | |||
void decompressed_tensor_value_loader(void* ptr_, | |||
const mgb::TensorLayout& layout, | |||
mgb::serialization::InputFile& fin); | |||
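/*
 * For example (illustrative only), a 3-element float32 tensor compressed with
 * CF = 1 is serialized as:
 *
 *   uint8   1          // compression flag
 *   float32 stride
 *   float32 base
 *   uint8   w[3]       // quantized weights
 *
 * and each element is reconstructed as stride * w[i] + base; see
 * decompressed_tensor_value_loader() in common.cpp.
 */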
} // namespace lite | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,212 @@ | |||
/** | |||
* \file src/mge/function_dft.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#if LITE_BUILD_WITH_MGE | |||
#include "function_base.h" | |||
#include "network_impl.h" | |||
#include "network_impl_base.h" | |||
#include "tensor_impl.h" | |||
namespace lite { | |||
#define THROW_FUNC_ERROR(func_name) \ | |||
    auto msg_info = func_name + " is not available in the Dft backend."; \
LITE_THROW(msg_info.c_str()) | |||
// the functions used by dft's tensor.cpp are as follows:
template <> | |||
inline std::shared_ptr<Tensor::TensorImplBase> | |||
call_func<TensorImplDft, std::shared_ptr<Tensor::TensorImplBase>>( | |||
std::string func_name) { | |||
if (func_name == "create_tensor") { | |||
return std::make_shared<TensorImplDft>(); | |||
} | |||
THROW_FUNC_ERROR(func_name); | |||
} | |||
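// For illustration, the front-end dispatches on the backend type plus a
// function-name string; a (hypothetical) call site mirroring lite's
// tensor.cpp would look like:
//
//     auto impl = call_func<TensorImplDft,
//                           std::shared_ptr<Tensor::TensorImplBase>>(
//             "create_tensor");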
template <> | |||
inline std::shared_ptr<Tensor::TensorImplBase> | |||
call_func<TensorImplDft, std::shared_ptr<Tensor::TensorImplBase>>( | |||
std::string func_name, LiteDeviceType device_type, | |||
bool is_pinned_host) { | |||
if (func_name == "create_tensor") { | |||
return std::make_shared<TensorImplDft>(device_type, is_pinned_host); | |||
} | |||
THROW_FUNC_ERROR(func_name); | |||
} | |||
template <> | |||
inline std::shared_ptr<Tensor::TensorImplBase> | |||
call_func<TensorImplDft, std::shared_ptr<Tensor::TensorImplBase>>( | |||
std::string func_name, int device_id, LiteDeviceType device_type, | |||
const Layout layout, bool is_pinned_host) { | |||
if (func_name == "create_tensor") { | |||
return std::make_shared<TensorImplDft>(device_id, device_type, layout, | |||
is_pinned_host); | |||
} | |||
THROW_FUNC_ERROR(func_name); | |||
} | |||
template <> | |||
inline std::shared_ptr<Tensor::TensorImplBase> | |||
call_func<TensorImplDft, std::shared_ptr<Tensor::TensorImplBase>>( | |||
std::string func_name, LiteDeviceType device_type, const Layout layout, | |||
bool is_pinned_host) { | |||
if (func_name == "create_tensor") { | |||
return std::make_shared<TensorImplDft>(device_type, layout, | |||
is_pinned_host); | |||
} | |||
THROW_FUNC_ERROR(func_name); | |||
} | |||
template <> | |||
inline std::shared_ptr<Tensor::TensorImplBase> | |||
call_func<TensorImplDft, std::shared_ptr<Tensor::TensorImplBase>>( | |||
std::string func_name, int device_id, int stream_id, | |||
LiteDeviceType device_type, bool is_pinned_host) { | |||
if (func_name == "create_tensor") { | |||
return std::make_shared<TensorImplDft>(device_id, stream_id, | |||
device_type, is_pinned_host); | |||
} | |||
THROW_FUNC_ERROR(func_name); | |||
} | |||
// the functions used by dft's network.cpp are as follows:
template <> | |||
inline std::unique_ptr<Network::NetworkImplBase> | |||
call_func<NetworkImplDft, std::unique_ptr<Network::NetworkImplBase>>( | |||
std::string func_name) { | |||
if (func_name == "create_network") { | |||
return std::make_unique<NetworkImplDft>(); | |||
} | |||
THROW_FUNC_ERROR(func_name); | |||
} | |||
template <> | |||
inline Network::NetworkImplBase* | |||
try_call_func<NetworkImplDft, Network::NetworkImplBase*>( | |||
std::string func_name) { | |||
if (func_name == "parse_model") { | |||
return new NetworkImplDft(); | |||
} | |||
THROW_FUNC_ERROR(func_name); | |||
} | |||
#define CALL_FUNC(func_name, ...) \ | |||
network_impl->cast_final_safe<NetworkImplDft>().func_name(__VA_ARGS__) | |||
template <> | |||
inline void call_func<NetworkImplDft, void>( | |||
std::string func_name, Network::NetworkImplBase* network_impl, | |||
size_t num) { | |||
if (func_name == "set_cpu_threads_number") { | |||
CALL_FUNC(set_cpu_threads_number, num); | |||
} else if (func_name == "set_network_algo_workspace_limit") { | |||
CALL_FUNC(set_network_algo_workspace_limit, num); | |||
} else { | |||
THROW_FUNC_ERROR(func_name); | |||
} | |||
} | |||
template <> | |||
inline void call_func<NetworkImplDft, void>( | |||
std::string func_name, Network::NetworkImplBase* network_impl) { | |||
if (func_name == "use_tensorrt") { | |||
CALL_FUNC(use_tensorrt); | |||
} else if (func_name == "set_cpu_inplace_mode") { | |||
CALL_FUNC(set_cpu_inplace_mode); | |||
} else { | |||
THROW_FUNC_ERROR(func_name); | |||
} | |||
} | |||
template <> | |||
inline size_t call_func<NetworkImplDft, size_t>( | |||
std::string func_name, Network::NetworkImplBase* network_impl) { | |||
if (func_name == "get_cpu_threads_number") { | |||
return CALL_FUNC(get_cpu_threads_number); | |||
} | |||
THROW_FUNC_ERROR(func_name); | |||
} | |||
template <> | |||
inline bool call_func<NetworkImplDft, bool>( | |||
std::string func_name, Network::NetworkImplBase* network_impl) { | |||
if (func_name == "is_cpu_inplace_mode") { | |||
return CALL_FUNC(is_cpu_inplace_mode); | |||
} | |||
THROW_FUNC_ERROR(func_name); | |||
} | |||
template <> | |||
inline void call_func<NetworkImplDft, void>( | |||
std::string func_name, Network::NetworkImplBase* network_impl, | |||
ThreadAffinityCallback thread_affinity_callback) { | |||
if (func_name == "set_runtime_thread_affinity") { | |||
return CALL_FUNC(set_runtime_thread_affinity, | |||
std::move(thread_affinity_callback)); | |||
} | |||
THROW_FUNC_ERROR(func_name); | |||
} | |||
template <> | |||
inline void call_func<NetworkImplDft, void>( | |||
std::string func_name, Network::NetworkImplBase* network_impl, | |||
LiteAlgoSelectStrategy strategy, uint32_t shared_batch_size, | |||
bool binary_equal_between_batch) { | |||
if (func_name == "set_network_algo_policy") { | |||
return CALL_FUNC(set_network_algo_policy, strategy, shared_batch_size, | |||
binary_equal_between_batch); | |||
} | |||
THROW_FUNC_ERROR(func_name); | |||
} | |||
template <> | |||
inline void call_func<NetworkImplDft, void>( | |||
std::string func_name, Network::NetworkImplBase* network_impl, | |||
std::shared_ptr<Allocator> user_allocator) { | |||
if (func_name == "set_memory_allocator") { | |||
return CALL_FUNC(set_memory_allocator, user_allocator); | |||
} | |||
THROW_FUNC_ERROR(func_name); | |||
} | |||
template <> | |||
inline void call_func<NetworkImplDft, void>( | |||
std::string func_name, Network::NetworkImplBase* network_impl, | |||
std::string file_name) { | |||
if (func_name == "enable_io_txt_dump") { | |||
return CALL_FUNC(enable_io_txt_dump, file_name); | |||
} else if (func_name == "enable_io_bin_dump") { | |||
return CALL_FUNC(enable_io_bin_dump, file_name); | |||
} | |||
THROW_FUNC_ERROR(func_name); | |||
} | |||
template <> | |||
inline void call_func<NetworkImplDft, void>( | |||
std::string func_name, Network::NetworkImplBase* network_impl, | |||
Network::NetworkImplBase* src_network_impl) { | |||
if (func_name == "share_runtime_memory_with") { | |||
CALL_FUNC(share_runtime_memory_with, src_network_impl); | |||
} else if (func_name == "shared_weight_with") { | |||
CALL_FUNC(shared_weight_with, src_network_impl); | |||
} else { | |||
THROW_FUNC_ERROR(func_name); | |||
} | |||
} | |||
#undef THROW_FUNC_ERROR | |||
} // namespace lite | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,69 @@ | |||
/** | |||
 * \file src/mge/memory_allocator.h
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "lite_build_config.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "common.h" | |||
#include "megbrain/dtype.h" | |||
#include "network_impl.h" | |||
#include "megbrain/graph/cg.h" | |||
namespace lite { | |||
class UserStaticMemAlloc final : public mgb::cg::DeviceMemoryAllocator { | |||
std::shared_ptr<Allocator> m_allocator = nullptr; | |||
public: | |||
UserStaticMemAlloc(std::shared_ptr<Allocator> allocator) | |||
: m_allocator(allocator) {} | |||
void alloc_static(LComputingGraph*, LDeviceTensorStorage& dest, | |||
size_t size) override { | |||
if (size < dest.size()) { | |||
return; | |||
} | |||
auto cn = dest.comp_node_allow_invalid(); | |||
LITE_ASSERT(cn.valid(), "The compnode is invalid when alloc memory."); | |||
LiteDeviceType device_type = | |||
get_device_from_locator(cn.locator_logical()); | |||
int device_id = cn.locator_logical().device; | |||
auto ptr_alloc = static_cast<mgb::dt_byte*>(m_allocator->allocate( | |||
device_type, device_id, size, cn.get_mem_addr_alignment())); | |||
auto storage = std::shared_ptr<mgb::dt_byte>( | |||
ptr_alloc, | |||
[allocator = m_allocator, device_type, device_id](void* ptr) { | |||
allocator->free(device_type, device_id, ptr); | |||
}); | |||
dest.reset(cn, size, storage); | |||
} | |||
void alloc_dynamic(mgb::VarNode*, mgb::DeviceTensorStorage& dest, | |||
size_t size) override { | |||
alloc_static(nullptr, dest, size); | |||
} | |||
void defrag_prealloc_contig(mgb::ComputingGraph*, mgb::CompNode comp_node, | |||
size_t size) override { | |||
LiteDeviceType device_type = | |||
get_device_from_locator(comp_node.locator_logical()); | |||
int device_id = comp_node.locator_logical().device; | |||
auto ptr_tmp = | |||
m_allocator->allocate(device_type, device_id, size, | |||
comp_node.get_mem_addr_alignment()); | |||
m_allocator->free(device_type, device_id, ptr_tmp); | |||
} | |||
}; | |||
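//! A minimal sketch of a user-side allocator that UserStaticMemAlloc can wrap
//! (assuming lite::Allocator exposes allocate()/free() with the signatures
//! used above; CPU only, illustrative rather than part of the lite API):
//!
//!     class AlignedCpuAllocator : public lite::Allocator {
//!     public:
//!         void* allocate(LiteDeviceType device, int /*device_id*/,
//!                        size_t size, size_t align) override {
//!             LITE_ASSERT(device == LiteDeviceType::LITE_CPU);
//!             void* ptr = nullptr;
//!             LITE_ASSERT(!posix_memalign(&ptr, align, size));
//!             return ptr;
//!         }
//!         void free(LiteDeviceType, int, void* ptr) override { ::free(ptr); }
//!     };
//!
//!     // network_impl->set_memory_allocator(
//!     //         std::make_shared<AlignedCpuAllocator>());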
} // namespace lite | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,781 @@ | |||
/** | |||
* \file src/mge/network_impl.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "lite_build_config.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "network_impl.h" | |||
#include "common.h" | |||
#include "lite/network.h" | |||
#include "memory_allocator.h" | |||
#include "parse_model/model_parser.h" | |||
#include "parse_info/parse_info_base.h" | |||
#include "megbrain/common.h" | |||
#include "megbrain/comp_node.h" | |||
#include "megbrain/comp_node_env.h" | |||
#include "megbrain/gopt/inference.h" | |||
#include "megbrain/graph.h" | |||
#include "megbrain/graph/cg.h" | |||
#include "megbrain/opr/io.h" | |||
#include "megbrain/tensor.h" | |||
#if MGB_OPENCL | |||
#include "megcore_opencl.h" | |||
#endif | |||
#include <fstream> | |||
#include <memory> | |||
#include <set> | |||
using namespace lite; | |||
using namespace mgb; | |||
LITE_DYN_TYPE_OBJ_FINAL_IMPL(NetworkImplDft); | |||
void NetworkImplDft::set_config(const Config& config) { | |||
m_user_config = std::make_unique<Config>(); | |||
*m_user_config = config; | |||
m_load_config.comp_graph = mgb::ComputingGraph::make(); | |||
m_compnode_locator = to_compnode_locator(m_user_config->device_type); | |||
m_compnode_locator.device = config.device_id; | |||
} | |||
void NetworkImplDft::shared_weight_with(const NetworkImplBase* src_network) { | |||
application_config(); | |||
const auto& src_impl = src_network->cast_final_safe<NetworkImplDft>(); | |||
    LITE_ASSERT(src_impl.m_loader,
                "cloning a network must happen after the source network is "
                "loaded.");
m_load_result = src_impl.m_loader->load(m_load_config, true); | |||
    //! flag whether the model is a cross-compnode model
cross_compnode_model_detect(); | |||
//! update the IO of the network | |||
update_io(); | |||
//! replace the IO when there is device input or output | |||
compile_graph(); | |||
} | |||
void NetworkImplDft::application_config() { | |||
auto device_type = m_user_config->device_type; | |||
m_compnode_locator.type = to_compnode_locator(device_type).type; | |||
m_compnode_locator.device = m_user_config->device_id; | |||
if (m_nr_threads > 1 && device_type == LiteDeviceType::LITE_CPU) { | |||
m_compnode_locator.type = mgb::CompNode::DeviceType::MULTITHREAD; | |||
m_compnode_locator.device = m_user_config->device_id; | |||
} | |||
//! model options | |||
#define ConfigOption(mge_name, lite_name) \ | |||
options.mge_name = m_user_config->options.lite_name; | |||
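    //! e.g. ConfigOption(graph_opt.jit, jit_level) expands to
    //!     options.graph_opt.jit = m_user_config->options.jit_level;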
auto&& options = m_load_config.comp_graph->options(); | |||
ConfigOption(graph_opt.weight_preprocess, weight_preprocess); | |||
ConfigOption(graph_opt.fuse_preprocess, fuse_preprocess); | |||
ConfigOption(fake_next_exec, fake_next_exec); | |||
ConfigOption(var_sanity_check_first_run, var_sanity_check_first_run); | |||
m_load_config.const_var_shape = m_user_config->options.const_shape; | |||
ConfigOption(force_dynamic_alloc, force_dynamic_alloc); | |||
ConfigOption(force_output_dynamic_alloc, force_output_dynamic_alloc); | |||
ConfigOption(no_profiling_on_shape_change, no_profiling_on_shape_change); | |||
LITE_ASSERT(m_user_config->options.jit_level == 0 || | |||
(m_user_config->options.jit_level > 0 && | |||
device_type == LiteDeviceType::LITE_CUDA), | |||
"jit only support in cuda device."); | |||
ConfigOption(graph_opt.jit, jit_level); | |||
ConfigOption(comp_node_seq_record_level, comp_node_seq_record_level); | |||
ConfigOption(graph_opt_level, graph_opt_level); | |||
ConfigOption(async_exec_level, async_exec_level); | |||
#undef ConfigOption | |||
#define ConfigOptionLayoutTransform(name) \ | |||
if (m_user_config->options.name) { \ | |||
options.graph_opt.name(); \ | |||
} | |||
ConfigOptionLayoutTransform(enable_nchw44); | |||
ConfigOptionLayoutTransform(enable_nchw44_dot); | |||
ConfigOptionLayoutTransform(enable_nchw88); | |||
ConfigOptionLayoutTransform(enable_nhwcd4); | |||
ConfigOptionLayoutTransform(enable_nchw4); | |||
ConfigOptionLayoutTransform(enable_nchw32); | |||
ConfigOptionLayoutTransform(enable_nchw64); | |||
#undef ConfigOptionLayoutTransform | |||
if (m_user_config->has_compression) { | |||
m_load_config.tensor_value_loader = decompressed_tensor_value_loader; | |||
} | |||
    //! if device is LITE_DEVICE_DEFAULT, the compnode information stored in
    //! the model is used
    if (device_type != LiteDeviceType::LITE_DEVICE_DEFAULT) {
        //! currently do not set the Locator type, because an atlas mgb model
        //! is a cross-compnode graph
if (device_type == LiteDeviceType::LITE_ATLAS) { | |||
m_load_config.comp_node_mapper = | |||
[this](mgb::CompNode::Locator& loc) { | |||
if (loc.type == mgb::CompNode::DeviceType::ATLAS) { | |||
loc.device = m_compnode_locator.device; | |||
loc.stream = m_compnode_locator.stream; | |||
} else if (loc.type == | |||
mgb::CompNode::DeviceType::MULTITHREAD) { | |||
loc.stream = m_nr_threads; | |||
} | |||
}; | |||
} else { | |||
m_load_config.comp_node_mapper = | |||
[this](mgb::CompNode::Locator& loc) { | |||
loc = m_compnode_locator; | |||
}; | |||
} | |||
} | |||
} | |||
void NetworkImplDft::set_memory_allocator( | |||
std::shared_ptr<Allocator> user_allocator) { | |||
auto allocator = std::make_shared<UserStaticMemAlloc>(user_allocator); | |||
LITE_ASSERT(m_load_config.comp_graph); | |||
m_load_config.comp_graph->set_device_memory_allocator(allocator); | |||
} | |||
//! share the runtime memory with another network; the weights are not shared
void NetworkImplDft::share_runtime_memory_with( | |||
Network::NetworkImplBase* network_impl) { | |||
LITE_ASSERT(network_impl); | |||
LITE_ASSERT(m_load_config.comp_graph); | |||
m_load_config.comp_graph->share_device_memory_with( | |||
*(network_impl->cast_final_safe<NetworkImplDft>() | |||
.m_load_config.comp_graph)); | |||
} | |||
void NetworkImplDft::set_cpu_inplace_mode() { | |||
    LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU,
                "cpu inplace mode is only available on CPU.");
m_is_cpu_inplace_mode = true; | |||
if (m_compnode_locator.type == mgb::CompNode::DeviceType::CPU) { | |||
m_compnode_locator.device = mgb::CompNode::Locator::DEVICE_CPU_DEFAULT; | |||
} else { | |||
        LITE_ASSERT(
                m_compnode_locator.type == CompNode::DeviceType::MULTITHREAD,
                "cpu inplace mode is only available on CPU.");
m_compnode_locator.device = | |||
mgb::CompNode::Locator::DEVICE_MULTITHREAD_DEFAULT; | |||
} | |||
} | |||
void NetworkImplDft::set_cpu_threads_number(size_t nr_threads) { | |||
    LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU,
                "multi-thread mode is only available on CPU.");
if (nr_threads > 1) { | |||
m_nr_threads = nr_threads; | |||
m_compnode_locator.type = mgb::CompNode::DeviceType::MULTITHREAD; | |||
m_compnode_locator.nr_threads = nr_threads; | |||
} | |||
} | |||
void NetworkImplDft::set_runtime_thread_affinity( | |||
const ThreadAffinityCallback& thread_affinity_callback) { | |||
    LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU,
                "multi-thread mode is only available on CPU.");
mgb::CompNode::Locator loc; | |||
m_load_config.comp_node_mapper(loc); | |||
auto cn = mgb::CompNode::load(loc); | |||
if (m_nr_threads > 1) { | |||
mgb::CompNodeEnv::from_comp_node(cn).cpu_env().set_affinity( | |||
thread_affinity_callback); | |||
} else { | |||
mgb::CompNodeEnv::from_comp_node(cn).cpu_env().dispatch( | |||
[thread_affinity_callback](void) { | |||
thread_affinity_callback(0); | |||
}); | |||
} | |||
} | |||
void NetworkImplDft::set_device_id(int device_id) { | |||
m_compnode_locator.device = device_id; | |||
m_user_config->device_id = device_id; | |||
} | |||
void NetworkImplDft::set_stream_id(int stream_id) { | |||
m_compnode_locator.stream = stream_id; | |||
} | |||
void NetworkImplDft::use_tensorrt() { | |||
auto&& options = m_load_config.comp_graph->options(); | |||
options.graph_opt.tensorrt = true; | |||
} | |||
//! set the callback in async mode
void NetworkImplDft::set_async_callback(const AsyncCallback& callback) { | |||
    LITE_ASSERT(!m_is_cpu_inplace_mode,
                "cpu inplace mode does not support async mode");
    LITE_ASSERT(m_user_config->device_type == LiteDeviceType::LITE_CPU ||
                        m_user_config->device_type == LiteDeviceType::LITE_CUDA,
                "currently only cpu and cuda (>10.0) support async mode");
m_async = true; | |||
m_async_callback = std::move(callback); | |||
} | |||
void NetworkImplDft::make_output_spec() { | |||
m_output_spec.clear(); | |||
for (auto&& out : m_network_io->outputs) { | |||
if (m_load_result.output_var_map.count(out.name)) { | |||
auto&& load_out = m_load_result.output_var_map[out.name]; | |||
auto cb = [&out, this](const mgb::DeviceTensorND& dv) mutable { | |||
mgb::CompNode comp_node = dv.comp_node(); | |||
if (out.io_type == LiteIOType::LITE_IO_SHAPE) { | |||
auto mgb_layout = dv.layout(); | |||
out.lite_tensor->set_layout(to_lite_layout(mgb_layout)); | |||
} else { | |||
TensorHelper::implement(out.lite_tensor) | |||
->cast_final_safe<TensorImplDft>() | |||
.copy_from_mge_tensor(dv); | |||
out.lite_tensor->update_from_implement(); | |||
} | |||
if (m_async) { | |||
out.have_sync = true; | |||
bool need_exec_cb = true; | |||
for (auto&& j : m_network_io->outputs) { | |||
if (!j.have_sync) { | |||
need_exec_cb = false; | |||
} | |||
} | |||
if (need_exec_cb) { | |||
for (auto&& j : m_network_io->outputs) { | |||
j.have_sync = false; | |||
} | |||
comp_node.add_callback([this]() { finish(); }); | |||
} | |||
} | |||
}; | |||
m_output_spec.emplace_back(load_out, std::move(cb)); | |||
} else { | |||
LITE_THROW(ssprintf("no output named : %s in the mode", | |||
out.name.c_str())); | |||
} | |||
} | |||
} | |||
void NetworkImplDft::replace_dev_input_pass() { | |||
mgb::CompNode::Locator locator; | |||
m_load_config.comp_node_mapper(locator); | |||
    //! CPU does not need device input
if (locator.type == mgb::CompNode::DeviceType::CPU) { | |||
return; | |||
} | |||
    //! replace the H2D with VolatileSharedDeviceTensor, and keep the dev
    //! tensor in m_network_io.input, so the user can directly change the dev
    //! tensor storage through m_network_io.input.lite_tensor->reset() before
    //! forward
using DeviceTensorMap = | |||
std::unordered_map<std::string, | |||
std::shared_ptr<mgb::DeviceTensorND>>; | |||
DeviceTensorMap name2dev_tensor; | |||
mgb::ThinHashMap<mgb::HostTensorND*, mgb::SymbolVar> host_val2var; | |||
//! construct host_val2var that maps from host tensor to corresponding var | |||
auto on_opr = [&](mgb::cg::OperatorNodeBase* opr) { | |||
if (opr->same_type<mgb::opr::Host2DeviceCopy>()) { | |||
mgb::HostTensorND* tensor = | |||
opr->cast_final<mgb::opr::Host2DeviceCopy>() | |||
.host_data() | |||
.get(); | |||
host_val2var[tensor] = opr->output(0); | |||
} | |||
}; | |||
mgb::cg::DepOprIter dep_iter{on_opr}; | |||
for (auto i : m_load_result.output_var_list) { | |||
dep_iter.add(i.node()->owner_opr()); | |||
} | |||
mgb::ThinHashMap<mgb::SymbolVar, mgb::SymbolVar> inp_var_map, out_var_map; | |||
mgb::SmallVector<std::string> to_clear; | |||
for (auto&& config_in : m_network_io->inputs) { | |||
if (!config_in.is_host) { | |||
auto host_val = m_load_result.tensor_map[config_in.name]; | |||
auto dev_val = TensorHelper::implement(config_in.lite_tensor) | |||
->cast_final_safe<TensorImplDft>() | |||
.m_dev_tensor; | |||
auto dev_var = mgb::opr::VolatileSharedDeviceTensor::make( | |||
*m_load_result.graph, dev_val, {config_in.name}); | |||
inp_var_map[host_val2var.at(host_val.get())] = dev_var; | |||
name2dev_tensor[config_in.name] = dev_val; | |||
} | |||
} | |||
auto new_ovar = | |||
mgb::cg::replace_vars(m_load_result.output_var_list, inp_var_map); | |||
for (size_t i = 0; i < new_ovar.size(); ++i) { | |||
out_var_map[m_load_result.output_var_list[i]] = new_ovar[i]; | |||
} | |||
for (auto&& i : m_load_result.output_var_map) { | |||
i.second = out_var_map.at(i.second); | |||
} | |||
for (auto&& i : m_load_result.output_var_map_id) { | |||
i.second = out_var_map.at(i.second); | |||
} | |||
for (size_t i = 0; i < m_load_result.output_var_list.size(); i++) { | |||
new_ovar[i].rename(m_load_result.output_var_list[i].node()->name()); | |||
} | |||
m_load_result.output_var_list = std::move(new_ovar); | |||
} | |||
void NetworkImplDft::cross_compnode_model_detect() { | |||
mgb::ThinHashSet<LiteDeviceType> nr_used_device_type; | |||
auto on_opr = [&](mgb::cg::OperatorNodeBase* opr) { | |||
for (auto j : opr->output()) { | |||
if (j->comp_node() != mgb::CompNode::default_cpu()) { | |||
nr_used_device_type.insert( | |||
get_device_from_locator(j->comp_node().locator())); | |||
} | |||
} | |||
}; | |||
mgb::cg::DepOprIter dep_iter{on_opr}; | |||
for (auto i : m_load_result.output_var_list) { | |||
dep_iter.add(i.node()->owner_opr()); | |||
} | |||
m_nr_device_type = nr_used_device_type.size(); | |||
} | |||
void NetworkImplDft::load_model( | |||
std::shared_ptr<void> model_mem, size_t size, | |||
std::unordered_map<std::string, LiteAny> separate_config_map) { | |||
if (!m_loader) { | |||
m_input_file = mgb::serialization::InputFile::make_mem_proxy( | |||
model_mem, size, false); | |||
auto format = | |||
mgb::serialization::GraphLoader::identify_graph_dump_format( | |||
*m_input_file); | |||
if (!format.valid()) { | |||
LITE_THROW("invalid model format"); | |||
} | |||
m_loader = mgb::serialization::GraphLoader::make( | |||
std::move(m_input_file), format.val()); | |||
} | |||
    //! apply the user configuration to the mge model
    application_config();
    //! configure some flags read from the json config file
if (separate_config_map.find("device_id") != separate_config_map.end()) { | |||
set_device_id(separate_config_map["device_id"].unsafe_cast<int>()); | |||
} | |||
if (separate_config_map.find("number_threads") != | |||
separate_config_map.end() && | |||
separate_config_map["number_threads"].unsafe_cast<size_t>() > 1) { | |||
set_cpu_threads_number( | |||
separate_config_map["number_threads"].unsafe_cast<size_t>()); | |||
} | |||
if (separate_config_map.find("enable_inplace_model") != | |||
separate_config_map.end() && | |||
separate_config_map["enable_inplace_model"].unsafe_cast<bool>()) { | |||
set_cpu_inplace_mode(); | |||
} | |||
if (separate_config_map.find("use_tensorrt") != separate_config_map.end() && | |||
separate_config_map["use_tensorrt"].unsafe_cast<bool>()) { | |||
use_tensorrt(); | |||
} | |||
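    //! for illustration, a packed-model json config carrying these options
    //! might contain entries such as (key names as handled above, values
    //! hypothetical):
    //!     "device_id": 0, "number_threads": 4,
    //!     "enable_inplace_model": true, "use_tensorrt": false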
m_load_result = m_loader->load(m_load_config, true); | |||
cross_compnode_model_detect(); | |||
//! update the IO of the network | |||
update_io(); | |||
//! replace the IO when there is device input or output | |||
compile_graph(); | |||
} | |||
void NetworkImplDft::compile_graph() { | |||
modify_exection_policy(); | |||
replace_dev_input_pass(); | |||
make_output_spec(); | |||
m_execute_func = m_load_result.graph_compile(m_output_spec); | |||
} | |||
void NetworkImplDft::start() const { | |||
if (m_start_callback) { | |||
std::unordered_map<std::string, std::pair<IO, std::shared_ptr<Tensor>>> | |||
input_io_map; | |||
for (auto&& io_inner : m_network_io->inputs) { | |||
input_io_map[io_inner.name] = { | |||
IO{io_inner.name, io_inner.is_host, io_inner.io_type, | |||
io_inner.config_layout}, | |||
io_inner.lite_tensor}; | |||
} | |||
m_start_callback(input_io_map); | |||
} | |||
} | |||
void NetworkImplDft::forward() { | |||
start(); | |||
LITE_ASSERT(m_execute_func, "forward must be called after network loaded."); | |||
m_execute_func->execute(); | |||
} | |||
void NetworkImplDft::wait() { | |||
if (!m_async) { | |||
m_execute_func->wait(); | |||
} | |||
finish(); | |||
} | |||
void NetworkImplDft::finish() const { | |||
if (m_async) { | |||
        LITE_ASSERT(m_async_callback,
                    "The callback function must be set in async mode.");
m_async_callback(); | |||
} | |||
if (m_finish_callback) { | |||
std::unordered_map<std::string, std::pair<IO, std::shared_ptr<Tensor>>> | |||
output_io_map; | |||
for (auto&& io_inner : m_network_io->outputs) { | |||
output_io_map[io_inner.name] = { | |||
IO{io_inner.name, io_inner.is_host, io_inner.io_type, | |||
io_inner.config_layout}, | |||
io_inner.lite_tensor}; | |||
} | |||
m_finish_callback(output_io_map); | |||
} | |||
output_plugin_result(); | |||
} | |||
void NetworkImplDft::set_io(const NetworkIO& network_io) { | |||
m_network_io = std::make_unique<NetworkIOInner>(); | |||
for (auto&& in : network_io.inputs) { | |||
m_network_io->inputs.emplace_back(in); | |||
} | |||
for (auto&& out : network_io.outputs) { | |||
m_network_io->outputs.emplace_back(out); | |||
} | |||
} | |||
void NetworkImplDft::update_io() { | |||
update_input(); | |||
update_output(); | |||
} | |||
void NetworkImplDft::update_input() { | |||
auto device_type = m_user_config->device_type; | |||
auto device_id = m_compnode_locator.device; | |||
auto stream_id = m_compnode_locator.stream; | |||
    //! if the device is cpu, all inputs and outputs are host tensors
if (device_type == LiteDeviceType::LITE_CPU) { | |||
for (auto&& in : m_network_io->inputs) { | |||
in.is_host = true; | |||
} | |||
} | |||
    //! if it is a cross-compnode model, modify the device input if it is not
    //! valid
if (m_nr_device_type > 1) { | |||
for (auto&& in_tensor_iter : m_load_result.tensor_map) { | |||
for (auto&& config_in : m_network_io->inputs) { | |||
//! if tensor is set to device input | |||
if (in_tensor_iter.first == config_in.name && | |||
!config_in.is_host) { | |||
//! if the origin compnode of the tensor is not the device, | |||
//! set the input to host | |||
if (get_device_from_locator( | |||
in_tensor_iter.second->comp_node().locator()) == | |||
LiteDeviceType::LITE_CPU) { | |||
config_in.is_host = true; | |||
                        LITE_WARN(
                                "The input tensor %s of the cross-device model "
                                "should not come from a device.",
                                config_in.name.c_str());
} | |||
} | |||
} | |||
} | |||
} | |||
for (auto&& in_tensor_iter : m_load_result.tensor_map) { | |||
bool found = false; | |||
for (auto&& config_in : m_network_io->inputs) { | |||
if (in_tensor_iter.first == config_in.name) { | |||
found = true; | |||
if (config_in.is_host) { | |||
config_in.lite_tensor = std::make_shared<Tensor>( | |||
device_id, stream_id, device_type, true); | |||
TensorHelper::implement(config_in.lite_tensor) | |||
->cast_final_safe<TensorImplDft>() | |||
.m_host_tensor = in_tensor_iter.second; | |||
config_in.lite_tensor->update_from_implement(); | |||
} else { | |||
config_in.lite_tensor = std::make_shared<Tensor>( | |||
device_id, stream_id, device_type); | |||
config_in.lite_tensor->set_layout( | |||
to_lite_layout(in_tensor_iter.second->layout())); | |||
} | |||
if (config_in.config_layout.ndim && | |||
!(config_in.config_layout == | |||
config_in.lite_tensor->get_layout())) { | |||
config_in.lite_tensor->set_layout(config_in.config_layout); | |||
} | |||
} | |||
} | |||
if (!found) { | |||
IOInner io_in; | |||
io_in.name = in_tensor_iter.first; | |||
io_in.lite_tensor = std::make_shared<Tensor>(device_id, stream_id, | |||
device_type, true); | |||
TensorHelper::implement(io_in.lite_tensor) | |||
->cast_final_safe<TensorImplDft>() | |||
.m_host_tensor = in_tensor_iter.second; | |||
io_in.lite_tensor->update_from_implement(); | |||
m_network_io->inputs.push_back(io_in); | |||
} | |||
} | |||
    //! delete the IOs that are not in the network
for (auto it = m_network_io->inputs.begin(); | |||
it != m_network_io->inputs.end();) { | |||
if (it->lite_tensor == nullptr) { | |||
LITE_LOG("%s is not the network input, ignore it.", | |||
it->name.c_str()); | |||
it = m_network_io->inputs.erase(it); | |||
} else { | |||
it++; | |||
} | |||
} | |||
} | |||
void NetworkImplDft::update_output() { | |||
auto device_type = m_user_config->device_type; | |||
auto device_id = m_compnode_locator.device; | |||
auto stream_id = m_compnode_locator.stream; | |||
if (device_type == LiteDeviceType::LITE_CPU) { | |||
for (auto&& out : m_network_io->outputs) { | |||
out.is_host = true; | |||
} | |||
} | |||
    //! delete the outputs that are not in the network
for (auto out_it = m_network_io->outputs.begin(); | |||
out_it != m_network_io->outputs.end();) { | |||
if (std::find_if(m_load_result.output_var_list.begin(), | |||
m_load_result.output_var_list.end(), | |||
[out_it](const mgb::SymbolVar var) { | |||
return var.node()->name() == out_it->name; | |||
}) == m_load_result.output_var_list.end()) { | |||
LITE_LOG("%s is not the network output, ignore it.", | |||
out_it->name.c_str()); | |||
out_it = m_network_io->outputs.erase(out_it); | |||
} else { | |||
out_it++; | |||
} | |||
} | |||
    //! the user configured output tensors, so only compute the configured outputs
if (m_compute_configured_output_only) { | |||
LITE_ASSERT(m_network_io->outputs.size() > 0, | |||
"compute configured output only with no configure output."); | |||
for (auto out_it = m_network_io->outputs.begin(); | |||
out_it != m_network_io->outputs.end(); out_it++) { | |||
            //! use pinned memory to copy from the device
if (out_it->is_host) { | |||
out_it->lite_tensor = std::make_shared<Tensor>( | |||
device_id, stream_id, device_type, true); | |||
} else { | |||
out_it->lite_tensor = std::make_shared<Tensor>( | |||
device_id, stream_id, device_type); | |||
} | |||
} | |||
        //! the user did not configure outputs, use all network outputs by default
} else { | |||
for (auto&& out : m_load_result.output_var_list) { | |||
auto it = std::find_if(m_network_io->outputs.begin(), | |||
m_network_io->outputs.end(), | |||
[&out](const IOInner io) { | |||
return io.name == out.node()->name(); | |||
}); | |||
if (it != m_network_io->outputs.end()) { | |||
if (it->is_host) { | |||
it->lite_tensor = std::make_shared<Tensor>( | |||
device_id, stream_id, device_type, true); | |||
} else { | |||
it->lite_tensor = std::make_shared<Tensor>( | |||
device_id, stream_id, device_type); | |||
} | |||
} else { | |||
IOInner output; | |||
output.name = out.node()->name(); | |||
output.lite_tensor = std::make_shared<Tensor>( | |||
device_id, stream_id, device_type, true); | |||
m_network_io->outputs.push_back({output}); | |||
} | |||
} | |||
} | |||
} | |||
std::shared_ptr<Tensor> NetworkImplDft::get_io_tensor(std::string io_name, | |||
LiteTensorPhase phase) { | |||
if (phase == LiteTensorPhase::LITE_INPUT || | |||
phase == LiteTensorPhase::LITE_IO) { | |||
for (auto&& config_in : m_network_io->inputs) { | |||
if (io_name == config_in.name) { | |||
return config_in.lite_tensor; | |||
} | |||
} | |||
} | |||
if (phase == LiteTensorPhase::LITE_OUTPUT || | |||
phase == LiteTensorPhase::LITE_IO) { | |||
for (auto&& config_out : m_network_io->outputs) { | |||
if (io_name == config_out.name) { | |||
config_out.lite_tensor->update_from_implement(); | |||
return config_out.lite_tensor; | |||
} | |||
} | |||
} | |||
LITE_THROW(mgb::ssprintf( | |||
"tensor name must be %s input tensor name or the registered " | |||
"output tensor name if NetworkIO is set, if NetworkIO is not set, " | |||
"the output tensor is all the network output tensor, or the output " | |||
"tensor is only the registered tensor.", | |||
io_name.c_str())); | |||
return nullptr; | |||
} | |||
std::shared_ptr<Tensor> NetworkImplDft::get_input_tensor(size_t index) { | |||
return get_io_tensor(get_input_name(index)); | |||
} | |||
std::shared_ptr<Tensor> NetworkImplDft::get_output_tensor(size_t index) { | |||
return get_io_tensor(get_output_name(index)); | |||
} | |||
//! set opr algorithm selection strategy in the network | |||
void NetworkImplDft::set_network_algo_policy(LiteAlgoSelectStrategy strategy, | |||
uint32_t shared_batch_size, | |||
bool binary_equal_between_batch) { | |||
using S = megdnn::param::ExecutionPolicy::Strategy; | |||
auto dst_strategy = static_cast<S>(0); | |||
if (static_cast<uint32_t>(strategy) & | |||
LiteAlgoSelectStrategy::LITE_ALGO_HEURISTIC) { | |||
dst_strategy = dst_strategy | S::HEURISTIC; | |||
} | |||
if (static_cast<uint32_t>(strategy) & | |||
LiteAlgoSelectStrategy::LITE_ALGO_PROFILE) { | |||
dst_strategy = dst_strategy | S::PROFILE; | |||
} | |||
if (static_cast<uint32_t>(strategy) & | |||
LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE) { | |||
dst_strategy = dst_strategy | S::REPRODUCIBLE; | |||
} | |||
if (static_cast<uint32_t>(strategy) & | |||
LiteAlgoSelectStrategy::LITE_ALGO_OPTIMIZED) { | |||
dst_strategy = dst_strategy | S::OPTIMIZED; | |||
} | |||
m_execution_policy = dst_strategy; | |||
auto&& fast_run_config = | |||
m_load_config.comp_graph->options().fast_run_config; | |||
fast_run_config.binary_equal_between_batch = binary_equal_between_batch; | |||
fast_run_config.shared_batch_size = shared_batch_size; | |||
if (m_execute_func) { | |||
LITE_WARN( | |||
"set_network_algo_policy maybe cause error after loaded " | |||
"network!!!!"); | |||
modify_exection_policy(); | |||
} | |||
} | |||
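// Usage sketch (illustration only, not part of the original source): a caller
// typically combines several strategy bits before the network is loaded. The
// cast below assumes LiteAlgoSelectStrategy is a plain bit-flag enum, which is
// what the bit tests above rely on; `network_impl` stands for a NetworkImplDft
// instance.
//
//     auto strategy = static_cast<LiteAlgoSelectStrategy>(
//             LiteAlgoSelectStrategy::LITE_ALGO_PROFILE |
//             LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE);
//     network_impl->set_network_algo_policy(
//             strategy, /*shared_batch_size=*/1,
//             /*binary_equal_between_batch=*/true);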
void NetworkImplDft::modify_exection_policy() { | |||
mgb::SymbolVarArray vars; | |||
for (auto i : m_output_spec) { | |||
vars.push_back(i.first); | |||
} | |||
if (static_cast<uint32_t>(m_execution_policy) != 0) | |||
mgb::gopt::modify_opr_algo_strategy_inplace(vars, m_execution_policy); | |||
} | |||
//! set opr algorithm selection strategy in the network | |||
void NetworkImplDft::set_network_algo_workspace_limit(size_t workspace_limit) { | |||
mgb::SymbolVarArray vars; | |||
for (auto i : m_output_spec) { | |||
vars.push_back(i.first); | |||
} | |||
mgb::gopt::set_opr_algo_workspace_limit_inplace(vars, workspace_limit); | |||
} | |||
//! get all the output tensor names in the order of graph
std::vector<const char*> NetworkImplDft::get_all_output_name() const { | |||
std::vector<const char*> output_names; | |||
for (auto& output : m_network_io->outputs) { | |||
output_names.push_back(output.name.c_str()); | |||
} | |||
return output_names; | |||
} | |||
//! get all the input tensor names in the order of graph
std::vector<const char*> NetworkImplDft::get_all_input_name() const { | |||
std::vector<const char*> input_names; | |||
for (auto& input : m_load_result.tensor_map) { | |||
input_names.push_back(input.first.c_str()); | |||
} | |||
return input_names; | |||
} | |||
//! get the output tensor name in the order of graph | |||
const char* NetworkImplDft::get_output_name(size_t index) const { | |||
LITE_ASSERT( | |||
index < m_load_result.output_var_list.size(), | |||
"The output tensor index is large than the total outputs number."); | |||
return m_load_result.output_var_list[index].node()->name().c_str(); | |||
} | |||
//! get the input tensor name in the order of graph | |||
const char* NetworkImplDft::get_input_name(size_t index) const { | |||
LITE_ASSERT( | |||
index < m_load_result.tensor_map.size(), | |||
"The input tensor index is large than the total inputs number."); | |||
size_t i = 0; | |||
for (auto& input : m_load_result.tensor_map) { | |||
if (i == index) { | |||
return input.first.c_str(); | |||
} | |||
i++; | |||
} | |||
LITE_THROW(ssprintf("no input tensor of index %zu.", index)); | |||
} | |||
//! Plugin part | |||
void NetworkImplDft::enable_profile_performance(std::string profile_json_file) { | |||
#if MGB_ENABLE_JSON | |||
#if MGB_OPENCL | |||
mgb::CompNode::enable_opencl_profile(true); | |||
#endif | |||
m_profiler = std::make_unique<mgb::GraphProfiler>( | |||
m_load_config.comp_graph.get()); | |||
m_profiler_output_file = profile_json_file; | |||
#else | |||
LITE_MARK_USED_VAR(profile_json_file); | |||
LITE_THROW("JSON is disable at compile time."); | |||
#endif | |||
} | |||
void NetworkImplDft::enable_io_txt_dump(std::string io_txt_out_file) { | |||
auto iodump = std::make_unique<mgb::TextOprIODump>( | |||
m_load_config.comp_graph.get(), io_txt_out_file.c_str()); | |||
iodump->print_addr(false); | |||
m_iodump = std::move(iodump); | |||
} | |||
void NetworkImplDft::enable_io_bin_dump(std::string io_bin_out_dir) { | |||
m_iodump = std::make_unique<mgb::BinaryOprIODump>( | |||
m_load_config.comp_graph.get(), io_bin_out_dir.c_str()); | |||
} | |||
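// Usage sketch (illustration only): the debugging helpers above are normally
// enabled before the model is loaded; the file names below are placeholders,
// and profiling additionally requires a build with MGB_ENABLE_JSON.
//
//     network_impl->enable_profile_performance("profile.json");
//     network_impl->enable_io_txt_dump("io_dump.txt");
//     // or dump the values of every operator in binary format:
//     // network_impl->enable_io_bin_dump("io_dump_dir");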
void inline NetworkImplDft::output_plugin_result() const { | |||
#if MGB_ENABLE_JSON | |||
if (m_profiler && m_execute_func) { | |||
m_profiler->to_json_full(m_execute_func.get()) | |||
->writeto_fpath(m_profiler_output_file); | |||
} | |||
#endif | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,242 @@ | |||
/** | |||
* \file src/mge/network_impl.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "lite_build_config.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "lite/network.h" | |||
#include "network_impl_base.h" | |||
#include "tensor_impl.h" | |||
#include "megbrain/graph/bases.h" | |||
#include "megbrain/plugin/opr_io_dump.h" | |||
#include "megbrain/plugin/profiler.h" | |||
#include "megbrain/serialization/extern_c_opr.h" | |||
#include "megbrain/serialization/file.h" | |||
#include "megbrain/serialization/load_dump_config.h" | |||
#include "megbrain/serialization/serializer.h" | |||
#include "megbrain/utils/thin/hash_table.h" | |||
#include <memory> | |||
#include <unordered_map> | |||
namespace lite { | |||
/*! | |||
* \brief implement the Network, contain the mgb related member | |||
*/ | |||
class NetworkImplDft final : public Network::NetworkImplBase { | |||
LITE_DYN_TYPE_OBJ_FINAL_DECL; | |||
public: | |||
using S = megdnn::param::ExecutionPolicy::Strategy; | |||
//! set the config of the network, include: | |||
//! the inference device | |||
//! the other inference options, such as record_level, weight_preprocess... | |||
void set_config(const Config& config) override; | |||
    //! set the special io information; if not set, the default io tensors
    //! will be used. This is mainly for the case where the input/output is
    //! not a host tensor; by default the input/output tensors are host
    //! tensors
void set_io(const NetworkIO& network_io) override; | |||
    //! only compute the output tensors configured by the user
void compute_only_configured_output() override { | |||
m_compute_configured_output_only = true; | |||
} | |||
    //! get the network input and output tensors, the layout of which is
    //! synced from the mge tensor
std::shared_ptr<Tensor> get_io_tensor( | |||
std::string io_name, | |||
LiteTensorPhase phase = LiteTensorPhase::LITE_IO) override; | |||
//! get the input tensor by index in the load_result tensormap | |||
std::shared_ptr<Tensor> get_input_tensor(size_t index) override; | |||
//! get the output tensor by index in the load_result output_var_list | |||
std::shared_ptr<Tensor> get_output_tensor(size_t index) override; | |||
//! get all the input tensor name in the order in load return | |||
std::vector<const char*> get_all_input_name() const override; | |||
//! get all the output tensor name in the order in load return | |||
std::vector<const char*> get_all_output_name() const override; | |||
//! get the input tensor name in the order in load return | |||
const char* get_input_name(size_t index) const override; | |||
//! get the output tensor name in the order in load return | |||
const char* get_output_name(size_t index) const override; | |||
    //! set the callback in async mode
void set_async_callback(const AsyncCallback& callback) override; | |||
//! set the start callback which will execute before network forward | |||
void set_start_callback(const StartCallback& callback) override { | |||
m_start_callback = std::move(callback); | |||
} | |||
//! set the finish callback which will execute after network forward | |||
void set_finish_callback(const FinishCallback& callback) override { | |||
m_finish_callback = std::move(callback); | |||
} | |||
//! load the model and get the m_load_result | |||
void load_model(std::shared_ptr<void> model_mem, size_t size, | |||
std::unordered_map<std::string, LiteAny> | |||
separate_config_map = {}) override; | |||
//! forward the network with filled input data and fill the output data | |||
//! to the output tensor | |||
void forward() override; | |||
    //! in sync mode, wait until the inference finishes
void wait() override; | |||
virtual LiteDeviceType get_device_type() const override { | |||
return m_user_config->device_type; | |||
} | |||
    //! Set cpu inplace mode when the device is CPU; on some low computation
    //! or single core devices this mode gives good performance
void set_cpu_inplace_mode(); | |||
bool is_cpu_inplace_mode() const { return m_is_cpu_inplace_mode; } | |||
    //! When the device is CPU, this interface sets the to-be-loaded model to
    //! run in multi-thread mode with the given thread number.
void set_cpu_threads_number(size_t nr_threads); | |||
size_t get_cpu_threads_number() const { return m_nr_threads; } | |||
//! set device id, default device id = 0 | |||
void set_device_id(int device_id) override; | |||
int get_device_id() const override { return m_compnode_locator.device; }; | |||
LiteBackend get_backend_type() const override { | |||
return LiteBackend::LITE_DEFAULT; | |||
} | |||
//! set stream id, default stream id = 0 | |||
void set_stream_id(int stream_id) override; | |||
int get_stream_id() const override { return m_compnode_locator.stream; }; | |||
//! enable tensorrt | |||
void use_tensorrt(); | |||
    //! enable profiling of the network, a JSON format file will be generated
void enable_profile_performance( | |||
std::string profile_json_file_path) override; | |||
/********************** mge special function ************************/ | |||
//! load a new network which will share weights with src network | |||
void shared_weight_with(const NetworkImplBase* src_network); | |||
    //! share the runtime memory with another network, the weights are not shared
void share_runtime_memory_with(NetworkImplBase* network); | |||
    //! set the thread affinity callback
void set_runtime_thread_affinity( | |||
const ThreadAffinityCallback& thread_affinity_callback); | |||
    //! set the network memory allocator, the allocator is defined by the user
void set_memory_allocator(std::shared_ptr<Allocator> user_allocator); | |||
//! set opr algorithm selection strategy in the network | |||
void set_network_algo_policy(LiteAlgoSelectStrategy strategy, | |||
uint32_t shared_batch_size, | |||
bool binary_equal_between_batch); | |||
    //! set workspace_limit for oprs with multiple algorithms; setting a
    //! workspace limitation can save memory but may influence the performance
void set_network_algo_workspace_limit(size_t workspace_limit); | |||
//! Dump input/output values of all internal variables to output file, | |||
//! in text format | |||
void enable_io_txt_dump(std::string io_txt_out_file); | |||
//! Dump input/output values of all internal variables to output | |||
//! directory, in binary format | |||
void enable_io_bin_dump(std::string io_bin_out_dir); | |||
private: | |||
    //! construct the output spec according to m_network_io, and set the
    //! callback to the output spec
void make_output_spec(); | |||
//! modify the execution policy | |||
void modify_exection_policy(); | |||
    //! if the input is a dev tensor, the pass will replace the H2D Opr with
    //! the VolatileSharedDeviceTensor Opr
void replace_dev_input_pass(); | |||
//! check whether the model is cross compnode | |||
void cross_compnode_model_detect(); | |||
    //! when the model has been loaded, update the IO; if NetworkIO is not
    //! set, update it with the IO of the loaded model
void update_io(); | |||
void update_input(); | |||
void update_output(); | |||
    //! when the model info has been loaded, update the config according to
    //! the model info, and finally apply it to the computing graph
void application_config(); | |||
    //! after the network forward finishes, output the plugin results to file
void output_plugin_result() const; | |||
    //! the function will be called after the network forward finishes
void finish() const; | |||
    //! the function will be called before the network forward starts
void start() const; | |||
//! compile the graph to get the execute function | |||
void compile_graph(); | |||
private: | |||
bool m_async = false; | |||
bool m_is_cpu_inplace_mode = false; | |||
int m_nr_device_type = 0; | |||
size_t m_nr_threads = 1; | |||
bool m_compute_configured_output_only = false; | |||
mgb::CompNode::Locator m_compnode_locator; | |||
AsyncCallback m_async_callback = nullptr; | |||
std::unique_ptr<NetworkIOInner> m_network_io; | |||
std::unique_ptr<Config> m_user_config; | |||
std::unique_ptr<mgb::cg::AsyncExecutable> m_execute_func; | |||
//! The model load related data | |||
S m_execution_policy = static_cast<S>(0); | |||
std::unique_ptr<mgb::serialization::InputFile> m_input_file; | |||
mgb::serialization::GraphLoadConfig m_load_config; | |||
mgb::serialization::GraphLoader::LoadResult m_load_result; | |||
mgb::ComputingGraph::OutputSpec m_output_spec; | |||
std::shared_ptr<mgb::serialization::GraphLoader> m_loader; | |||
//! start and finish callback | |||
StartCallback m_start_callback = nullptr; | |||
FinishCallback m_finish_callback = nullptr; | |||
//! profile and io dump related data | |||
#if MGB_ENABLE_JSON | |||
std::unique_ptr<mgb::GraphProfiler> m_profiler; | |||
std::string m_profiler_output_file; | |||
#endif | |||
std::unique_ptr<mgb::OprIODumpBase> m_iodump; | |||
}; | |||
} // namespace lite | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,435 @@ | |||
/** | |||
 * \file src/mge/tensor_impl.cpp
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "lite_build_config.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "tensor_impl.h" | |||
#include "common.h" | |||
#include "lite/tensor.h" | |||
#include "megbrain/comp_node.h" | |||
#include "megbrain/tensor.h" | |||
#include <memory> | |||
using namespace lite; | |||
/**********************TensorImpl****************************/ | |||
LITE_DYN_TYPE_OBJ_FINAL_IMPL(TensorImplDft); | |||
TensorImplDft::TensorImplDft() { | |||
m_host_tensor = | |||
std::make_shared<mgb::HostTensorND>(mgb::CompNode::default_cpu()); | |||
} | |||
TensorImplDft::TensorImplDft(LiteDeviceType device, bool is_pinned_host) { | |||
auto cn = mgb::CompNode::load(to_compnode_locator(device)); | |||
if (device == LiteDeviceType::LITE_DEVICE_DEFAULT) { | |||
device = LiteDeviceType::LITE_CPU; | |||
} | |||
if (device == LiteDeviceType::LITE_CPU) { | |||
m_host_tensor = std::make_shared<mgb::HostTensorND>( | |||
mgb::CompNode::default_cpu()); | |||
} else if (is_pinned_host) { | |||
m_host_tensor = std::make_shared<mgb::HostTensorND>(cn); | |||
} else { | |||
m_dev_tensor = std::make_shared<mgb::DeviceTensorND>(cn); | |||
} | |||
} | |||
TensorImplDft::TensorImplDft(LiteDeviceType device, const Layout& layout, | |||
bool is_pinned_host) { | |||
auto cn = mgb::CompNode::load(to_compnode_locator(device)); | |||
auto mge_layout = to_impl_layout(layout); | |||
if (device == LiteDeviceType::LITE_DEVICE_DEFAULT) { | |||
device = LiteDeviceType::LITE_CPU; | |||
} | |||
if (device == LiteDeviceType::LITE_CPU) { | |||
m_host_tensor = std::make_shared<mgb::HostTensorND>( | |||
mgb::CompNode::default_cpu(), mge_layout); | |||
} else if (is_pinned_host) { | |||
m_host_tensor = std::make_shared<mgb::HostTensorND>(cn, mge_layout); | |||
} else { | |||
m_dev_tensor = std::make_shared<mgb::DeviceTensorND>(cn, mge_layout); | |||
} | |||
} | |||
TensorImplDft::TensorImplDft(int device_id, LiteDeviceType device_type, | |||
const Layout& layout, bool is_pinned_host) { | |||
auto locator = to_compnode_locator(device_type); | |||
locator.device = device_id; | |||
auto cn = mgb::CompNode::load(locator); | |||
if (device_type == LiteDeviceType::LITE_DEVICE_DEFAULT) { | |||
device_type = LiteDeviceType::LITE_CPU; | |||
} | |||
if (layout.ndim) { | |||
auto mge_layout = to_impl_layout(layout); | |||
if (device_type == LiteDeviceType::LITE_CPU) { | |||
m_host_tensor = std::make_shared<mgb::HostTensorND>( | |||
mgb::CompNode::default_cpu(), mge_layout); | |||
} else if (is_pinned_host) { | |||
m_host_tensor = std::make_shared<mgb::HostTensorND>(cn, mge_layout); | |||
} else { | |||
m_dev_tensor = | |||
std::make_shared<mgb::DeviceTensorND>(cn, mge_layout); | |||
} | |||
} else { | |||
if (device_type == LiteDeviceType::LITE_CPU) { | |||
m_host_tensor = std::make_shared<mgb::HostTensorND>( | |||
mgb::CompNode::default_cpu()); | |||
} else if (is_pinned_host) { | |||
m_host_tensor = std::make_shared<mgb::HostTensorND>(cn); | |||
} else { | |||
m_dev_tensor = std::make_shared<mgb::DeviceTensorND>(cn); | |||
} | |||
} | |||
} | |||
TensorImplDft::TensorImplDft(int device_id, int stream_id, | |||
LiteDeviceType device_type, bool is_pinned_host) { | |||
auto locator = to_compnode_locator(device_type); | |||
locator.device = device_id; | |||
locator.stream = stream_id; | |||
auto cn = mgb::CompNode::load(locator); | |||
if (get_device_from_locator(locator) == LiteDeviceType::LITE_CPU) { | |||
m_host_tensor = std::make_shared<mgb::HostTensorND>( | |||
mgb::CompNode::default_cpu()); | |||
} else if (is_pinned_host) { | |||
m_host_tensor = std::make_shared<mgb::HostTensorND>(cn); | |||
} else { | |||
m_dev_tensor = std::make_shared<mgb::DeviceTensorND>(cn); | |||
} | |||
} | |||
LiteDeviceType TensorImplDft::get_device_type() const { | |||
if (is_host()) { | |||
return LiteDeviceType::LITE_CPU; | |||
} else { | |||
return get_device_from_locator(m_dev_tensor->comp_node().locator()); | |||
} | |||
} | |||
int TensorImplDft::get_device_id() const { | |||
if (is_host()) { | |||
return m_host_tensor->comp_node().locator().device; | |||
} else { | |||
return m_dev_tensor->comp_node().locator().device; | |||
} | |||
} | |||
bool TensorImplDft::is_pinned_host() const { | |||
return is_host() && | |||
get_device_from_locator(m_host_tensor->comp_node().locator()) != | |||
LiteDeviceType::LITE_CPU; | |||
} | |||
void TensorImplDft::set_mge_tensor_compnode(const mgb::CompNode& comp_node) { | |||
if (is_host()) { | |||
m_host_tensor->comp_node(comp_node, true); | |||
} else { | |||
m_dev_tensor->comp_node(comp_node, true); | |||
} | |||
} | |||
Layout TensorImplDft::get_layout() const { | |||
if (is_host()) { | |||
return to_lite_layout(m_host_tensor->layout()); | |||
} else { | |||
return to_lite_layout(m_dev_tensor->layout()); | |||
} | |||
} | |||
void* TensorImplDft::get_memory_ptr() const { | |||
if (is_host()) { | |||
return static_cast<void*>(m_host_tensor->raw_ptr()); | |||
} else { | |||
return static_cast<void*>(m_dev_tensor->raw_ptr()); | |||
} | |||
} | |||
void* TensorImplDft::get_memory_ptr(const std::vector<size_t>& idx) const { | |||
if (is_host()) { | |||
auto elemsize_log = m_host_tensor->layout().dtype.size_log(); | |||
switch (elemsize_log) { | |||
case 0: | |||
return static_cast<void*>( | |||
m_host_tensor->ptr<uint8_t>(idx.begin(), idx.end())); | |||
break; | |||
case 1: | |||
return static_cast<void*>( | |||
m_host_tensor->ptr<short>(idx.begin(), idx.end())); | |||
break; | |||
case 2: | |||
return static_cast<void*>( | |||
m_host_tensor->ptr<float>(idx.begin(), idx.end())); | |||
break; | |||
default: | |||
LITE_THROW("not supported data_type."); | |||
} | |||
} else { | |||
auto elemsize_log = m_dev_tensor->layout().dtype.size_log(); | |||
switch (elemsize_log) { | |||
case 0: | |||
return static_cast<void*>( | |||
m_dev_tensor->ptr<uint8_t>(idx.begin(), idx.end())); | |||
break; | |||
case 1: | |||
return static_cast<void*>( | |||
m_dev_tensor->ptr<short>(idx.begin(), idx.end())); | |||
break; | |||
case 2: | |||
return static_cast<void*>( | |||
m_dev_tensor->ptr<float>(idx.begin(), idx.end())); | |||
break; | |||
default: | |||
LITE_THROW("not supported data_type."); | |||
} | |||
} | |||
} | |||
std::shared_ptr<Tensor> TensorImplDft::slice( | |||
const std::vector<size_t>& start, const std::vector<size_t>& end, | |||
const std::vector<size_t>& step) { | |||
Layout layout; | |||
mgb::TensorLayout layout_mge; | |||
if (is_host()) { | |||
layout_mge = m_host_tensor->layout(); | |||
layout = to_lite_layout(m_host_tensor->layout()); | |||
} else { | |||
layout_mge = m_dev_tensor->layout(); | |||
layout = to_lite_layout(m_dev_tensor->layout()); | |||
} | |||
size_t length = start.size(); | |||
LITE_ASSERT(length == end.size() && length <= layout.ndim, | |||
"The start and end must be the same size and less than layout " | |||
"ndim."); | |||
std::vector<mgb::Slice> slices; | |||
if (step.size()) { | |||
LITE_ASSERT(length == step.size(), | |||
"The start and step must be the same size."); | |||
for (size_t i = 0; i < length; i++) { | |||
slices.push_back(mgb::Slice{start[i], end[i], step[i]}); | |||
} | |||
} else { | |||
for (size_t i = 0; i < length; i++) { | |||
slices.push_back(mgb::Slice{start[i], end[i]}); | |||
} | |||
} | |||
auto subspec = mgb::SubTensorSpec::make_from_offset_elem(layout_mge, 0); | |||
size_t axis = 0; | |||
for (auto&& i : slices) { | |||
subspec.merge_with(i.apply(subspec.layout(), axis)); | |||
axis++; | |||
} | |||
auto ret = std::make_shared<Tensor>(); | |||
auto& impl = TensorHelper::implement(ret)->cast_final_safe<TensorImplDft>(); | |||
if (is_host()) { | |||
*impl.m_host_tensor = m_host_tensor->sub(subspec); | |||
} else { | |||
impl.m_dev_tensor = std::make_shared<mgb::DeviceTensorND>( | |||
m_dev_tensor->sub(subspec)); | |||
impl.m_host_tensor = nullptr; | |||
} | |||
LITE_ASSERT(is_host() == impl.is_host()); | |||
return ret; | |||
} | |||
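// Usage sketch (illustration only): slicing rows 0..2 and columns 1..3 of a
// 2-D tensor through a TensorImplDft pointer named `tensor_impl` (assumed
// here); as the sub()-based implementation above implies, the returned tensor
// refers to the storage of the source tensor.
//
//     std::vector<size_t> start{0, 1}, end{2, 3};
//     auto sub_tensor = tensor_impl->slice(start, end);  // empty step means 1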
void TensorImplDft::fill_zero() { | |||
if (is_host()) { | |||
auto mge_layout = m_host_tensor->layout(); | |||
if (m_host_tensor->layout().is_physical_contiguous()) { | |||
auto ptr = get_memory_ptr(); | |||
std::memset(ptr, 0, | |||
mge_layout.dtype.size(mge_layout.total_nr_elems())); | |||
} else { | |||
TensorImplDft tmp(LiteDeviceType::LITE_CPU, | |||
to_lite_layout(mge_layout), true); | |||
tmp.fill_zero(); | |||
this->copy_from(&tmp); | |||
} | |||
} else { | |||
mgb::dev_tensor_memset(*m_dev_tensor, 0); | |||
m_dev_tensor->sync(); | |||
} | |||
} | |||
void TensorImplDft::share_memory_with(const TensorImplBase* src_tensor_impl) { | |||
auto src_dft_tensor = static_cast<const TensorImplDft*>(src_tensor_impl); | |||
LITE_ASSERT(is_host() == src_dft_tensor->is_host(), | |||
"share memory must happen in same device"); | |||
    //! make sure the src memory is ready
src_tensor_impl->get_memory_ptr(); | |||
if (is_host()) { | |||
*m_host_tensor = *src_dft_tensor->m_host_tensor; | |||
} else { | |||
*m_dev_tensor = *src_dft_tensor->m_dev_tensor; | |||
} | |||
} | |||
void TensorImplDft::set_layout(const Layout& layout) { | |||
bool host = is_host(); | |||
auto mgb_layout = to_impl_layout(layout); | |||
if (host) { | |||
m_host_tensor->dtype(mgb_layout.dtype); | |||
m_host_tensor->resize(mgb_layout); | |||
} else { | |||
m_dev_tensor->dtype(mgb_layout.dtype); | |||
m_dev_tensor->resize(mgb_layout); | |||
} | |||
} | |||
void TensorImplDft::reshape(const Layout& layout) { | |||
auto mgb_layout = to_impl_layout(layout); | |||
bool host = is_host(); | |||
if (host) { | |||
m_host_tensor->resize(mgb_layout); | |||
} else { | |||
m_dev_tensor->resize(mgb_layout); | |||
} | |||
} | |||
void TensorImplDft::reset(void* prepared_data) { | |||
auto raw_ptr = static_cast<mgb::dt_byte*>(prepared_data); | |||
auto raw_storage = std::shared_ptr<mgb::dt_byte>(raw_ptr, [](void*) {}); | |||
bool host = is_host(); | |||
if (host) { | |||
auto cn = m_host_tensor->comp_node(); | |||
auto mge_layout = m_host_tensor->layout(); | |||
size_t size = mge_layout.span().dist_byte(); | |||
mgb::HostTensorStorage storage; | |||
storage.reset(cn, size, raw_storage); | |||
m_host_tensor->reset(storage, mge_layout); | |||
} else { | |||
auto cn = m_dev_tensor->comp_node(); | |||
auto mge_layout = m_dev_tensor->layout(); | |||
size_t size = mge_layout.span().dist_byte(); | |||
mgb::DeviceTensorStorage storage; | |||
storage.reset(cn, size, raw_storage); | |||
m_dev_tensor->reset(storage, mge_layout); | |||
} | |||
} | |||
void TensorImplDft::reset(void* prepared_data, const Layout& layout) { | |||
set_layout(layout); | |||
reset(prepared_data); | |||
} | |||
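// Usage sketch (illustration only): wrapping a user-owned buffer without a
// copy. The Layout field names below (shapes/ndim/data_type) are assumptions
// about the public struct; the deleter-less storage above means lite never
// frees the buffer, so it must outlive the tensor.
//
//     std::vector<float> buffer(1 * 3 * 224 * 224);
//     Layout layout;
//     layout.ndim = 4;
//     layout.shapes[0] = 1;
//     layout.shapes[1] = 3;
//     layout.shapes[2] = 224;
//     layout.shapes[3] = 224;
//     layout.data_type = LiteDataType::LITE_FLOAT;  // assumed enum name
//     tensor_impl->reset(buffer.data(), layout);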
bool TensorImplDft::is_continue_memory() const { | |||
if (is_host()) { | |||
return m_host_tensor->layout().is_physical_contiguous(); | |||
} else { | |||
return m_dev_tensor->layout().is_physical_contiguous(); | |||
} | |||
} | |||
void TensorImplDft::copy_from(const TensorImplBase* src_impl) { | |||
if (is_continue_memory()) { | |||
copy_from_continue(src_impl); | |||
} else { | |||
copy_from_fixlayout(src_impl); | |||
} | |||
} | |||
void TensorImplDft::copy_from_continue(const TensorImplBase* src_impl) { | |||
auto src = static_cast<const TensorImplDft*>(src_impl); | |||
if (is_host()) { | |||
//! host to host | |||
if (src->is_host()) { | |||
m_host_tensor->copy_from(*src->m_host_tensor); | |||
//! device to host | |||
} else { | |||
auto src_cn = src->m_dev_tensor->comp_node(); | |||
auto dst_cn = m_host_tensor->comp_node(); | |||
if (src_cn != dst_cn && m_host_tensor->layout().ndim > 0) { | |||
LITE_WARN( | |||
"The dst tensor memroy is alloced before coping, " | |||
"then pinned memroy would not use to optmize the " | |||
"copy performance."); | |||
//! When D2H in megbrain and the compnode of src and dst is not | |||
//! equal, there must be one compnode that is cpu-default, so | |||
//! here, we use temp tensor for transition | |||
auto tmp_impl = std::make_shared<TensorImplDft>(); | |||
tmp_impl->set_mge_tensor_compnode(src_cn); | |||
tmp_impl->m_host_tensor->copy_from(*src->m_dev_tensor).sync(); | |||
m_host_tensor->copy_from(*tmp_impl->m_host_tensor); | |||
} else { | |||
                //! if the dst compnode is not valid (memory is not
                //! allocated), the tensor is a pinned host tensor
m_host_tensor->comp_node(src_cn, true); | |||
m_host_tensor->copy_from(*src->m_dev_tensor).sync(); | |||
} | |||
} | |||
} else { | |||
//! host to device | |||
if (src->is_host()) { | |||
m_dev_tensor->copy_from(*src->m_host_tensor).sync(); | |||
//! device to device | |||
} else { | |||
m_dev_tensor->copy_from(*src->m_dev_tensor).sync(); | |||
} | |||
} | |||
} | |||
void TensorImplDft::copy_from_fixlayout(const TensorImplBase* src_impl) { | |||
auto src = static_cast<const TensorImplDft*>(src_impl); | |||
if (is_host()) { | |||
//! host to host | |||
if (src->is_host()) { | |||
m_host_tensor->copy_from_fixlayout(*src->m_host_tensor); | |||
//! device to host | |||
} else { | |||
auto src_cn = src->m_dev_tensor->comp_node(); | |||
auto dst_cn = m_host_tensor->comp_node(); | |||
if (src_cn != dst_cn && m_host_tensor->layout().ndim > 0) { | |||
LITE_WARN( | |||
"The dst tensor memroy is alloced before coping, " | |||
"then pinned memroy would not use to optmize the " | |||
"copy performance."); | |||
//! When D2H in megbrain and the compnode of src and dst is not | |||
//! equal, there must be one compnode that is cpu-default, so | |||
//! here, we use temp tensor for transition | |||
auto tmp_impl = std::make_shared<TensorImplDft>(); | |||
tmp_impl->set_mge_tensor_compnode(src_cn); | |||
tmp_impl->m_host_tensor->copy_from(*src->m_dev_tensor).sync(); | |||
m_host_tensor->copy_from_fixlayout(*tmp_impl->m_host_tensor); | |||
} else { | |||
                //! if the dst compnode is not valid (memory is not
                //! allocated), the tensor is a pinned host tensor
m_host_tensor->comp_node(src_cn, true); | |||
m_host_tensor->copy_from_fixlayout(*src->m_dev_tensor).sync(); | |||
} | |||
} | |||
} else { | |||
//! host to device | |||
if (src->is_host()) { | |||
m_dev_tensor->copy_from_fixlayout(*src->m_host_tensor).sync(); | |||
//! device to device | |||
} else { | |||
m_dev_tensor->copy_from_fixlayout(*src->m_dev_tensor).sync(); | |||
} | |||
} | |||
} | |||
void TensorImplDft::copy_from_mge_tensor(const mgb::DeviceTensorND& dv) { | |||
if (is_host()) { | |||
auto src_cn = dv.comp_node(); | |||
m_host_tensor->comp_node(src_cn, true); | |||
m_host_tensor->copy_from(dv); | |||
} else { | |||
m_dev_tensor->copy_from(dv); | |||
} | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,128 @@ | |||
/** | |||
* \file src/mge/tensor_impl.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "lite_build_config.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "lite/tensor.h" | |||
#include "tensor_impl_base.h" | |||
#include "megbrain/tensor.h" | |||
#include <unordered_map> | |||
namespace lite { | |||
/*! | |||
* \brief implement the Tensor in mge | |||
*/ | |||
class TensorImplDft final : public Tensor::TensorImplBase { | |||
LITE_DYN_TYPE_OBJ_FINAL_DECL; | |||
public: | |||
TensorImplDft(); | |||
TensorImplDft(LiteDeviceType device, bool is_pinned_host = false); | |||
TensorImplDft(LiteDeviceType device, const Layout& layout, | |||
bool is_pinned_host = false); | |||
TensorImplDft(int device_id, LiteDeviceType device, | |||
const Layout& layout = {}, bool is_pinned_host = false); | |||
TensorImplDft(int device_id, int stream_id, LiteDeviceType device, | |||
bool is_pinned_host = false); | |||
virtual ~TensorImplDft() = default; | |||
LiteDeviceType get_device_type() const override; | |||
int get_device_id() const override; | |||
LiteBackend get_backend_type() const override { | |||
return LiteBackend::LITE_DEFAULT; | |||
} | |||
Layout get_layout() const override; | |||
bool is_pinned_host() const override; | |||
    //! which will trigger memory allocation in the tensor implementation
void* get_memory_ptr() const override; | |||
    //! which will trigger memory allocation in the tensor implementation if
    //! memory is not allocated, and compute the ptr at the given idx
void* get_memory_ptr(const std::vector<size_t>& idx) const override; | |||
//! set layout will change the layout and reallocate memory of the tensor | |||
void set_layout(const Layout& layout) override; | |||
    //! use the user allocated data to reset the memory of the tensor, the
    //! memory will not be managed by lite, so the user should free it
    //! later.
void reset(void* prepared_data) override; | |||
    //! use the user allocated data and the corresponding layout to reset the
    //! data and layout of the tensor, the memory will not be managed by
    //! lite, so the user should free it later.
void reset(void* prepared_data, const Layout& layout) override; | |||
//! get a new tensor slice from the origin tensor | |||
std::shared_ptr<Tensor> slice( | |||
const std::vector<size_t>& start, const std::vector<size_t>& end, | |||
const std::vector<size_t>& step = {}) override; | |||
//! set the tensor memory with zero | |||
void fill_zero() override; | |||
//! reshape the tensor with new shape, keep the data_type the same | |||
void reshape(const Layout& layout) override; | |||
    //! copy tensor from another tensor
    //! Note: the best way to copy a tensor is to just set the dst device and
    //! leave the layout empty; when copying, the dst layout will be set to
    //! the same as the src
void copy_from(const TensorImplBase* src_impl) override; | |||
//! share memory with other tensor | |||
void share_memory_with(const TensorImplBase* src_impl) override; | |||
    //! whether the memory of the tensor is contiguous
bool is_continue_memory() const override; | |||
//! get host tensor | |||
std::shared_ptr<mgb::HostTensorND> host_tensor() const { | |||
return m_host_tensor; | |||
} | |||
//! get device tensor | |||
std::shared_ptr<mgb::DeviceTensorND> dev_tensor() const { | |||
return m_dev_tensor; | |||
} | |||
//! copy from mgb tensor | |||
void copy_from_mge_tensor(const mgb::DeviceTensorND& dv); | |||
public: | |||
friend class NetworkImplDft; | |||
private: | |||
bool is_host() const { return m_host_tensor != nullptr; }; | |||
void copy_from_continue(const TensorImplBase* src_impl); | |||
void copy_from_fixlayout(const TensorImplBase* src_impl); | |||
void set_mge_tensor_compnode(const mgb::CompNode& comp_node); | |||
private: | |||
std::shared_ptr<mgb::HostTensorND> m_host_tensor; | |||
std::shared_ptr<mgb::DeviceTensorND> m_dev_tensor; | |||
}; | |||
} // namespace lite | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,154 @@ | |||
/** | |||
 * \file src/misc.cpp
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "./misc.h" | |||
#include "lite/global.h" | |||
#include <time.h> | |||
#include <chrono> | |||
#include <cstdarg> | |||
#if LITE_BUILD_WITH_MGE | |||
#include "megbrain/common.h" | |||
#endif | |||
#ifdef __ANDROID__ | |||
#include <android/log.h> | |||
#endif | |||
using namespace lite; | |||
namespace lite { | |||
namespace log_detail { | |||
LiteLogLevel current_log_level = LiteLogLevel::ERROR; | |||
template <class T, size_t N> | |||
constexpr size_t countof(T (&)[N]) { | |||
return N; | |||
} | |||
} // namespace log_detail | |||
} // namespace lite | |||
namespace { | |||
std::string svsprintf(const char* fmt, va_list ap_orig) { | |||
int size = 100; /* Guess we need no more than 100 bytes */ | |||
char* p; | |||
if ((p = (char*)malloc(size)) == nullptr) | |||
return "svsprintf: malloc failed"; | |||
for (;;) { | |||
va_list ap; | |||
va_copy(ap, ap_orig); | |||
int n = vsnprintf(p, size, fmt, ap); | |||
va_end(ap); | |||
if (n < 0) | |||
return "svsprintf: vsnprintf failed"; | |||
if (n < size) { | |||
std::string rst(p); | |||
free(p); | |||
return rst; | |||
} | |||
size = n + 1; | |||
char* np = (char*)realloc(p, size); | |||
if (!np) { | |||
free(p); | |||
return "svsprintf: realloc failed"; | |||
} else | |||
p = np; | |||
} | |||
} | |||
} // namespace | |||
void lite::set_log_level(LiteLogLevel l) { | |||
log_detail::current_log_level = l; | |||
#if LITE_BUILD_WITH_MGE | |||
mgb::LogLevel lite_log_level = mgb::LogLevel::DEBUG; | |||
switch (l) { | |||
case LiteLogLevel::DEBUG: | |||
lite_log_level = mgb::LogLevel::DEBUG; | |||
break; | |||
case LiteLogLevel::INFO: | |||
lite_log_level = mgb::LogLevel::INFO; | |||
break; | |||
case LiteLogLevel::WARN: | |||
lite_log_level = mgb::LogLevel::WARN; | |||
break; | |||
case LiteLogLevel::ERROR: | |||
lite_log_level = mgb::LogLevel::ERROR; | |||
break; | |||
default: | |||
LITE_THROW("unkonw loglevel"); | |||
} | |||
mgb::set_log_level(lite_log_level); | |||
#endif | |||
} | |||
LiteLogLevel lite::get_log_level() { | |||
return log_detail::current_log_level; | |||
} | |||
std::string lite::ssprintf(const char* format, ...) { | |||
va_list ap; | |||
va_start(ap, format); | |||
auto ret = svsprintf(format, ap); | |||
va_end(ap); | |||
return ret; | |||
} | |||
void lite::print_log(LiteLogLevel level, const char* format, ...) { | |||
if (static_cast<uint32_t>(level) < static_cast<uint32_t>(get_log_level())) { | |||
return; | |||
} | |||
using namespace std::chrono; | |||
auto now = system_clock::now(); | |||
auto now_time_t = system_clock::to_time_t(now); | |||
tm now_tm; | |||
#if _WIN32 | |||
localtime_s(&now_tm, &now_time_t); | |||
#else | |||
localtime_r(&now_time_t, &now_tm); | |||
#endif | |||
auto now_trunc_to_sec = system_clock::from_time_t(mktime(&now_tm)); | |||
auto microsec = duration_cast<microseconds>(now - now_trunc_to_sec); | |||
char time_buffer[100]; | |||
snprintf(time_buffer, log_detail::countof(time_buffer), | |||
"%02d:%02d:%02d.%06ld ", now_tm.tm_hour, now_tm.tm_min, | |||
now_tm.tm_sec, long(microsec.count())); | |||
const char* prefix[] = {"LITE[DBG] ", "LITE[INF] ", "LITE[WRN] ", | |||
"LITE[ERR] "}; | |||
std::string out; | |||
out += prefix[int(level)]; | |||
out += time_buffer; | |||
va_list ap; | |||
va_start(ap, format); | |||
auto ret = svsprintf(format, ap); | |||
va_end(ap); | |||
out += ret; | |||
#ifdef __ANDROID__ | |||
__android_log_print(ANDROID_LOG_INFO, "lite", "%s", out.c_str()); | |||
#else | |||
fprintf(stderr, "%s\n", out.c_str()); | |||
#endif | |||
} | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,254 @@ | |||
/** | |||
* \file include/misc.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "lite_build_config.h" | |||
#include <chrono> | |||
#include <exception> | |||
#include <stdexcept> | |||
#include <string> | |||
#include "lite/common_enum_c.h" | |||
#include "lite/global.h" | |||
namespace lite { | |||
#if LITE_ENABLE_EXCEPTION | |||
/*! \brief The error class in lite. | |||
* | |||
* It can be used to represent both an error caused by the invalid | |||
* input of the caller or an invalid runtime condition. | |||
* | |||
* The necessary presumption should be guaranteed by assertions instead of | |||
* exceptions. | |||
*/ | |||
class Error : public std::exception { | |||
public: | |||
Error(const std::string& msg) : m_msg("Error: " + msg) {} | |||
const char* what() const noexcept override { return m_msg.c_str(); } | |||
private: | |||
std::string m_msg; | |||
}; | |||
#endif | |||
std::string ssprintf(const char* fmt = 0, ...) | |||
__attribute__((format(printf, 1, 2))); | |||
/*! | |||
* \brief Print a message. | |||
* | |||
* The message is printed only if level is above or equals to the current log | |||
* level. | |||
*/ | |||
void print_log(LiteLogLevel level, const char* format = 0, ...) | |||
__attribute__((format(printf, 2, 3))); | |||
} // namespace lite | |||
#if LITE_ENABLE_LOGGING | |||
#define LITE_LOG_(level, msg...) \ | |||
do { \ | |||
lite::print_log(LiteLogLevel::level, ##msg); \ | |||
} while (0) | |||
#else | |||
#define LITE_LOG_(level, msg...) (void)0 | |||
#endif | |||
#define LITE_LOG(fmt...) LITE_LOG_(DEBUG, fmt); | |||
#define LITE_DEBUG(fmt...) LITE_LOG_(DEBUG, fmt); | |||
#define LITE_WARN(fmt...) LITE_LOG_(WARN, fmt); | |||
#define LITE_ERROR(fmt...) LITE_LOG_(ERROR, fmt); | |||
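// Usage sketch (illustration only): the macros above forward to
// lite::print_log(), which drops any message whose level is below the level
// configured through lite::set_log_level(); the enumerator order assumed here
// is DEBUG < INFO < WARN < ERROR.
//
//     lite::set_log_level(LiteLogLevel::WARN);
//     LITE_DEBUG("dropped, level too low: %d", 42);
//     LITE_WARN("printed: %s", "fallback path hit");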
#if LITE_ENABLE_EXCEPTION | |||
#define LITE_THROW(msg) throw lite::Error(msg) | |||
#else | |||
#define LITE_THROW(msg) \ | |||
do { \ | |||
LITE_ERROR(msg); \ | |||
__builtin_trap(); \ | |||
} while (0) | |||
#endif | |||
#if LITE_ENABLE_EXCEPTION | |||
#define LITE_ERROR_HANDLER_BEGIN try { | |||
#define LITE_ERROR_HANDLER_END \ | |||
} \ | |||
catch (const ::lite::Error& e) { \ | |||
std::string msg = std::string("Lite exception: ") + e.what(); \ | |||
LITE_ERROR("%s.", msg.c_str()); \ | |||
throw; \ | |||
} | |||
#else | |||
#define LITE_ERROR_HANDLER_BEGIN | |||
#define LITE_ERROR_HANDLER_END | |||
#endif | |||
/*! \brief Return an error if the given pointer is null pointer. | |||
* | |||
* The macro is used to ensure the validity of the passing context pointer. | |||
*/ | |||
#define LITE_CHECK_NON_NULL_POINTER(ptr) \ | |||
LITE_ASSERT(ptr != nullptr, "Input ptr is null.") | |||
//! branch prediction hint: likely to take | |||
#define lite_likely(v) __builtin_expect(static_cast<bool>(v), 1) | |||
//! branch prediction hint: unlikely to take | |||
#define lite_unlikely(v) __builtin_expect(static_cast<bool>(v), 0) | |||
#if LITE_ENABLE_LOGGING | |||
#if LITE_ASSERT_LOC | |||
#define LITE_ASSERT(expr, msg...) \ | |||
do { \ | |||
if (lite_unlikely(!(expr))) { \ | |||
auto info = lite::ssprintf(msg); \ | |||
LITE_THROW( \ | |||
lite::ssprintf("Assert \' %s \' failed at file : %s \n" \ | |||
"line %d : %s,\nextra " \ | |||
"message: %s", \ | |||
#expr, __FILE__, __LINE__, \ | |||
__PRETTY_FUNCTION__, info.c_str())); \ | |||
} \ | |||
} while (0) | |||
#else | |||
#define LITE_ASSERT(expr, msg...) \ | |||
do { \ | |||
if (lite_unlikely(!(expr))) { \ | |||
auto info = lite::ssprintf(msg); \ | |||
LITE_THROW(lite::ssprintf( \ | |||
"Assert \' %s \' failed at file : %s \n" \ | |||
"line %d : %s,\nextra " \ | |||
"message: %s", \ | |||
#expr, "about location info, please build with debug", \ | |||
__LINE__, __PRETTY_FUNCTION__, info.c_str())); \ | |||
} \ | |||
} while (0) | |||
#endif | |||
#else | |||
#define LITE_ASSERT(expr, msg...) \ | |||
do { \ | |||
if (lite_unlikely(!(expr))) { \ | |||
auto msg_string = lite::ssprintf(msg); \ | |||
LITE_THROW(msg_string.c_str()); \ | |||
} \ | |||
} while (0) | |||
#endif | |||
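// Usage sketch (illustration only): LITE_ASSERT takes a condition plus a
// printf-style message and throws a lite::Error (or traps when exceptions are
// disabled) on failure; the variables below are placeholders.
//
//     LITE_ASSERT(index < names.size(), "index %zu out of range %zu", index,
//                 names.size());
//     LITE_CHECK_NON_NULL_POINTER(network_impl);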
#define LITE_MARK_USED_VAR(var) ((void)var) | |||
namespace lite { | |||
class ScopedTimer { | |||
public: | |||
typedef std::chrono::system_clock Clock; | |||
typedef std::chrono::nanoseconds Nsec; | |||
ScopedTimer(std::string name) : m_name(name) { m_start = Clock::now(); } | |||
~ScopedTimer() { | |||
m_stop = Clock::now(); | |||
std::chrono::duration<double> elapsed = m_stop - m_start; | |||
Nsec u = std::chrono::duration_cast<Nsec>(elapsed); | |||
auto msg = ssprintf("%s used time %fms.", m_name.c_str(), | |||
static_cast<double>(u.count()) / 1000000.f); | |||
LITE_LOG("%s", msg.c_str()); | |||
} | |||
private: | |||
std::chrono::time_point<std::chrono::system_clock> m_start, m_stop; | |||
const std::string m_name; | |||
}; | |||
class Timer { | |||
public: | |||
typedef std::chrono::system_clock Clock; | |||
typedef std::chrono::nanoseconds Nsec; | |||
Timer(std::string name) : m_name(name) { m_start = Clock::now(); } | |||
double get_used_time() { | |||
m_stop = Clock::now(); | |||
std::chrono::duration<double> elapsed = m_stop - m_start; | |||
Nsec u = std::chrono::duration_cast<Nsec>(elapsed); | |||
return static_cast<double>(u.count()) / 1000000.0; | |||
} | |||
void print_used_time(int iter) { | |||
m_stop = Clock::now(); | |||
std::chrono::duration<double> elapsed = m_stop - m_start; | |||
Nsec u = std::chrono::duration_cast<Nsec>(elapsed); | |||
printf("%s used time %f ms\n", (m_name + std::to_string(iter)).c_str(), | |||
static_cast<double>(u.count()) / 1000000.0); | |||
} | |||
void reset_start() { m_start = Clock::now(); } | |||
private: | |||
std::chrono::time_point<std::chrono::system_clock> m_start, m_stop; | |||
const std::string m_name; | |||
}; | |||
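// Usage sketch (illustration only): ScopedTimer logs its lifetime on
// destruction, while Timer is queried explicitly; `network` and run_once()
// below are placeholders for a loaded lite::Network and the measured work.
//
//     {
//         ScopedTimer t("forward");  // logs "forward used time ..." at scope exit
//         network->forward();
//         network->wait();
//     }
//     Timer timer("inference");
//     run_once();
//     double used_ms = timer.get_used_time();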
inline void mark_used_variable() {} | |||
template <typename T, typename... Arg> | |||
inline void mark_used_variable(T firstArg, Arg... args) { | |||
LITE_MARK_USED_VAR(firstArg); | |||
mark_used_variable(args...); | |||
} | |||
} // namespace lite | |||
#if defined(_WIN32) | |||
#include <io.h> | |||
#include <windows.h> | |||
#undef CONST | |||
#define F_OK 0 | |||
#define RTLD_LAZY 0 | |||
// On the windows platform we use a lib_filename without a full path so | |||
// the win-api "LoadLibrary" would uses a standard search strategy to | |||
// find the lib module. As we cannot access to the lib_filename without a | |||
// full path, we should not use "access(a, b)" to verify it. | |||
#define access(a, b) false | |||
static inline void* dlopen(const char* file, int) { | |||
return static_cast<void*>(LoadLibrary(file)); | |||
} | |||
static inline char* dlerror() { | |||
const char* errmsg = "dlerror not aviable in windows"; | |||
return const_cast<char*>(errmsg); | |||
} | |||
static inline void* dlsym(void* handle, const char* name) { | |||
FARPROC symbol = GetProcAddress((HMODULE)handle, name); | |||
return reinterpret_cast<void*>(symbol); | |||
} | |||
#elif __linux__ || __unix__ || __APPLE__ | |||
#include <dlfcn.h> | |||
#include <unistd.h> | |||
#endif | |||
#if __DEPLOY_ON_XP_SP2__ | |||
//! refer to | |||
//! https://docs.microsoft.com/en-us/cpp/build/configuring-programs-for-windows-xp?view=msvc-160 | |||
//! xp sp2 does not fully support the vc runtime, because KERNEL32.dll does
//! not implement some base apis needed by c++ std functions, for example
//! std::mutex/std::thread/std::condition_variable. As a workaround, we
//! disable some MegEngine features on the xp sp2 env, for example
//! multi-threading, etc.
#define LITE_MUTEX size_t | |||
#define LITE_RECURSIVE_MUTEX size_t | |||
#define LITE_LOCK_GUARD(mtx) LITE_MARK_USED_VAR(mtx) | |||
#define LITE_LOCK_GUARD_UNIQUE(mtx) LITE_MARK_USED_VAR(mtx) | |||
#define LITE_LOCK_GUARD_SHARED(mtx) LITE_MARK_USED_VAR(mtx)
#else | |||
#define LITE_MUTEX std::mutex | |||
#define LITE_RECURSIVE_MUTEX std::recursive_mutex | |||
#define LITE_LOCK_GUARD(mtx) \ | |||
std::lock_guard<decltype(mtx)> LITE_LOCK_GUARD_CTOR(mtx) | |||
#define LITE_LOCK_GUARD_UNIQUE(mtx) \ | |||
std::unique_lock<decltype(mtx)> LITE_LOCK_GUARD_CTOR(mtx) | |||
#define LITE_LOCK_GUARD_SHARED(mtx) \ | |||
std::shared_lock<decltype(mtx)> LITE_LOCK_GUARD_CTOR(mtx) | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,501 @@ | |||
/** | |||
* \file src/network.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "lite/network.h" | |||
#include "function_base.h" | |||
#include "network_impl_base.h" | |||
#include "parse_info/parse_info_base.h" | |||
#include "parse_model/model_parser.h" | |||
#include "type_info.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "mge/function_dft.h" | |||
#include "mge/network_impl.h" | |||
#endif | |||
#include <fstream> | |||
#include <memory> | |||
using namespace lite; | |||
/** | |||
 * \brief Construct the network implement
 * the order must be:
 * 1. create the implement
* 2. config and load | |||
* 3. set_io | |||
*/ | |||
Network::Network(const Config& config, const NetworkIO& network_io) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
m_config = config; | |||
m_network_io = network_io; | |||
if (config.backend == LiteBackend::LITE_DEFAULT) { | |||
m_impl = call_func<NetworkImplDft, | |||
std::unique_ptr<lite::Network::NetworkImplBase>>( | |||
"create_network"); | |||
} else if (config.backend == LiteBackend::LITE_RK_NPU) { | |||
m_impl = call_func<NetworkImplRK, | |||
std::unique_ptr<lite::Network::NetworkImplBase>>( | |||
"create_network"); | |||
} | |||
m_impl->set_config(config); | |||
m_impl->set_io(network_io); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
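// Usage sketch (illustration only): a typical caller constructs a Network,
// loads a model and then runs it. The model path is a placeholder, and the
// single-argument construction assumes the public header provides a default
// NetworkIO argument.
//
//     lite::Config config;
//     auto network = std::make_shared<lite::Network>(config);
//     network->load_model("model.mge");
//     auto input = network->get_io_tensor(network->get_input_name(0));
//     // ... fill the input tensor ...
//     network->forward();
//     network->wait();
//     auto output = network->get_io_tensor(network->get_output_name(0));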
Network::Network(const NetworkIO& network_io, const Config& config) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
m_config = config; | |||
m_network_io = network_io; | |||
if (config.backend == LiteBackend::LITE_DEFAULT) { | |||
m_impl = call_func<NetworkImplDft, | |||
std::unique_ptr<lite::Network::NetworkImplBase>>( | |||
"create_network"); | |||
} else if (config.backend == LiteBackend::LITE_RK_NPU) { | |||
m_impl = call_func<NetworkImplRK, | |||
std::unique_ptr<lite::Network::NetworkImplBase>>( | |||
"create_network"); | |||
} | |||
m_impl->set_config(config); | |||
m_impl->set_io(network_io); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Network::load_model(void* model_mem, size_t size) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
//! this model_mem is managed by user | |||
std::shared_ptr<void> model{model_mem, [](void*) {}}; | |||
prase_model(model, size); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Network::load_model(std::string model_path) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
FILE* fin = fopen(model_path.c_str(), "rb"); | |||
LITE_ASSERT(fin, "failed to open %s: %s", model_path.c_str(), | |||
strerror(errno)); | |||
fseek(fin, 0, SEEK_END); | |||
size_t size = ftell(fin); | |||
fseek(fin, 0, SEEK_SET); | |||
void* ptr = malloc(size); | |||
std::shared_ptr<void> buf{ptr, ::free}; | |||
auto nr = fread(buf.get(), 1, size, fin); | |||
LITE_ASSERT(nr == size); | |||
fclose(fin); | |||
prase_model(buf, size); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Network::prase_model(std::shared_ptr<void> model_data, size_t size) { | |||
std::unordered_map<std::string, LiteAny> separate_config_map; | |||
ModelParser model_parser(model_data, size); | |||
//! parse the model info | |||
if (model_parser.parse_model_info(m_config, m_network_io, | |||
separate_config_map, m_extra_info)) { | |||
if (m_config.backend == LiteBackend::LITE_DEFAULT && | |||
m_impl->get_backend_type() != LiteBackend::LITE_DEFAULT) { | |||
m_impl.reset(try_call_func<NetworkImplDft, | |||
lite::Network::NetworkImplBase*>( | |||
"parse_model")); | |||
} else if (m_config.backend == LiteBackend::LITE_RK_NPU && | |||
m_impl->get_backend_type() != LiteBackend::LITE_RK_NPU) { | |||
m_impl.reset(try_call_func<NetworkImplRK, | |||
lite::Network::NetworkImplBase*>( | |||
"parse_model")); | |||
} | |||
m_impl->set_config(m_config); | |||
m_impl->set_io(m_network_io); | |||
} | |||
    //! decrypt the model
size_t model_length; | |||
auto&& model_shared_ptr = model_parser.parse_model(model_length, m_config); | |||
m_impl->load_model(model_shared_ptr, model_length, separate_config_map); | |||
m_loaded = true; | |||
update_from_implement(); | |||
} | |||
Network::~Network() = default; | |||
void Network::update_from_implement() { | |||
m_config.device_type = m_impl->get_device_type(); | |||
} | |||
void Network::compute_only_configured_output() { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_ASSERT(!m_loaded, | |||
"compute_only_configured_output should be used before model " | |||
"loaded."); | |||
LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
return m_impl->compute_only_configured_output(); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
std::shared_ptr<Tensor> Network::get_io_tensor(std::string name, | |||
LiteTensorPhase phase) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_ASSERT(m_loaded, "get_io_tensor should be used after model loaded."); | |||
LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
return m_impl->get_io_tensor(name, phase); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
std::shared_ptr<Tensor> Network::get_input_tensor(size_t index) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_ASSERT(m_loaded, | |||
"get_input_tensor should be used after model loaded."); | |||
LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
return m_impl->get_input_tensor(index); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
std::shared_ptr<Tensor> Network::get_output_tensor(size_t index) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_ASSERT(m_loaded, | |||
"get_output_tensor should be used after model loaded."); | |||
LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
return m_impl->get_output_tensor(index); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
Network& Network::set_async_callback(const AsyncCallback& callback) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
m_impl->set_async_callback(std::move(callback)); | |||
return *this; | |||
LITE_ERROR_HANDLER_END | |||
} | |||
Network& Network::set_start_callback(const StartCallback& callback) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
m_impl->set_start_callback(std::move(callback)); | |||
return *this; | |||
LITE_ERROR_HANDLER_END | |||
} | |||
Network& Network::set_finish_callback(const FinishCallback& callback) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
m_impl->set_finish_callback(std::move(callback)); | |||
return *this; | |||
LITE_ERROR_HANDLER_END | |||
} | |||
Network& Network::set_device_id(int device_id) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_ASSERT(!m_loaded, "set_device_id should be used before model loaded."); | |||
LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
m_impl->set_device_id(device_id); | |||
return *this; | |||
LITE_ERROR_HANDLER_END | |||
} | |||
Network& Network::set_stream_id(int stream_id) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_ASSERT(!m_loaded, "set_stream_id should be used before model loaded."); | |||
LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
m_impl->set_stream_id(stream_id); | |||
return *this; | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Network::forward() { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_ASSERT(m_loaded, "forward should be used after model loaded."); | |||
LITE_CHECK_NON_NULL_POINTER(m_impl.get()); | |||
m_impl->forward(); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Network::wait() { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_ASSERT(m_loaded, "wait should be used after model loaded."); | |||
LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
m_impl->wait(); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
std::string Network::get_input_name(size_t index) const { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_ASSERT(m_loaded, "get_input_name should be used after model loaded."); | |||
LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
return m_impl->get_input_name(index); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
std::string Network::get_output_name(size_t index) const { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_ASSERT(m_loaded, "get_output_name should be used after model loaded."); | |||
LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
return m_impl->get_output_name(index); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
std::vector<std::string> Network::get_all_input_name() const { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_ASSERT(m_loaded, | |||
"get_all_input_name should be used after model loaded."); | |||
LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
auto all_input_name = m_impl->get_all_input_name(); | |||
std::vector<std::string> all_names; | |||
for (auto& name : all_input_name) { | |||
all_names.push_back(name); | |||
} | |||
return all_names; | |||
LITE_ERROR_HANDLER_END | |||
} | |||
std::vector<std::string> Network::get_all_output_name() const { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_ASSERT(m_loaded, | |||
"get_all_output_name should be used after model loaded."); | |||
LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
auto all_output_name = m_impl->get_all_output_name(); | |||
std::vector<std::string> all_names; | |||
for (auto& name : all_output_name) { | |||
all_names.push_back(name); | |||
} | |||
return all_names; | |||
LITE_ERROR_HANDLER_END | |||
} | |||
int Network::get_device_id() const { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
return m_impl->get_device_id(); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
int Network::get_stream_id() const { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_CHECK_NON_NULL_POINTER(m_impl); | |||
return m_impl->get_stream_id(); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Network::enable_profile_performance(std::string profile_file_path) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
m_impl->enable_profile_performance(profile_file_path); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
const std::string& Network::get_model_extra_info() { | |||
LITE_ERROR_HANDLER_BEGIN | |||
return m_extra_info; | |||
LITE_ERROR_HANDLER_END | |||
} | |||
LiteDeviceType Network::get_device_type() const { | |||
LITE_ERROR_HANDLER_BEGIN | |||
return m_impl->get_device_type(); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
/*********************** MGE special network function ***************/ | |||
void Runtime::set_cpu_threads_number(std::shared_ptr<Network> network, | |||
size_t nr_threads) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
auto network_impl = NetworkHelper::implement(network); | |||
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
LITE_ASSERT( | |||
!NetworkHelper::loaded(network), | |||
"set_cpu_threads_number should be used before model loaded."); | |||
call_func<NetworkImplDft, void>("set_cpu_threads_number", network_impl, | |||
nr_threads); | |||
return; | |||
} | |||
LITE_THROW("set_cpu_threads_number is not aviliable in the backend."); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Runtime::use_tensorrt(std::shared_ptr<Network> network) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
auto network_impl = NetworkHelper::implement(network); | |||
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
LITE_ASSERT(!NetworkHelper::loaded(network), | |||
"use_tensorrt should be used before model loaded."); | |||
call_func<NetworkImplDft, void>("use_tensorrt", network_impl); | |||
return; | |||
} | |||
LITE_THROW("use_tensorrt is not aviliable in the backend."); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
size_t Runtime::get_cpu_threads_number(const std::shared_ptr<Network> network) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
auto network_impl = NetworkHelper::implement(network); | |||
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
return call_func<NetworkImplDft, size_t>("get_cpu_threads_number", | |||
network_impl); | |||
} | |||
LITE_THROW("get_cpu_threads_number is not aviliable in the backend."); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Runtime::set_runtime_thread_affinity( | |||
std::shared_ptr<Network> network, | |||
const ThreadAffinityCallback& thread_affinity_callback) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
auto network_impl = NetworkHelper::implement(network); | |||
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
LITE_ASSERT(NetworkHelper::loaded(network), | |||
"set_runtime_thread_affinity should be used after model " | |||
"loaded."); | |||
call_func<NetworkImplDft, void>("set_runtime_thread_affinity", | |||
network_impl, thread_affinity_callback); | |||
return; | |||
} | |||
LITE_THROW("set_runtime_thread_affinity is not aviliable in the backend."); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Runtime::set_cpu_inplace_mode(std::shared_ptr<Network> network) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
auto network_impl = NetworkHelper::implement(network); | |||
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
LITE_ASSERT(!NetworkHelper::loaded(network), | |||
"set_cpu_inplace_mode should be used before model loaded."); | |||
call_func<NetworkImplDft, void>("set_cpu_inplace_mode", network_impl); | |||
return; | |||
} | |||
LITE_THROW("set_cpu_inplace_mode is not aviliable in the backend."); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
bool Runtime::is_cpu_inplace_mode(const std::shared_ptr<Network> network) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
auto network_impl = NetworkHelper::implement(network); | |||
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
return call_func<NetworkImplDft, bool>("is_cpu_inplace_mode", | |||
network_impl); | |||
} | |||
LITE_THROW("is_cpu_inplace_mode is not aviliable in the backend."); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
//! set opr algorithm selection strategy in the network | |||
void Runtime::set_network_algo_policy(std::shared_ptr<Network> network, | |||
LiteAlgoSelectStrategy strategy, | |||
uint32_t shared_batch_size, | |||
bool binary_equal_between_batch) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
auto network_impl = NetworkHelper::implement(network); | |||
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
call_func<NetworkImplDft, void>("set_network_algo_policy", network_impl, | |||
strategy, shared_batch_size, | |||
binary_equal_between_batch); | |||
return; | |||
} | |||
LITE_THROW("set_network_algo_policy is not aviliable in the backend."); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
//! set the opr algorithm workspace limit in the network
void Runtime::set_network_algo_workspace_limit(std::shared_ptr<Network> network, | |||
size_t workspace_limit) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
auto network_impl = NetworkHelper::implement(network); | |||
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
LITE_ASSERT(NetworkHelper::loaded(network),
"set_network_algo_workspace_limit should be used after model "
"loaded.");
call_func<NetworkImplDft, void>("set_network_algo_workspace_limit", | |||
network_impl, workspace_limit); | |||
return; | |||
} | |||
LITE_THROW(
"set_network_algo_workspace_limit is not available in the "
"backend.");
LITE_ERROR_HANDLER_END | |||
} | |||
//! set the network memory allocator; the allocator is defined by the user
void Runtime::set_memory_allocator(std::shared_ptr<Network> network, | |||
std::shared_ptr<Allocator> user_allocator) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
auto network_impl = NetworkHelper::implement(network); | |||
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
LITE_ASSERT(!NetworkHelper::loaded(network), | |||
"set_memory_allocator should be used before model loaded."); | |||
call_func<NetworkImplDft, void>("set_memory_allocator", network_impl, | |||
user_allocator); | |||
return; | |||
} | |||
LITE_THROW("set_memory_allocator is not aviliable in the backend."); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Runtime::share_runtime_memory_with(std::shared_ptr<Network> dst_network, | |||
std::shared_ptr<Network> src_network) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
auto network_impl_dst = NetworkHelper::implement(dst_network); | |||
if (network_impl_dst->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
LITE_ASSERT(!NetworkHelper::loaded(dst_network), | |||
"share_runtime_memory_with should be used before model " | |||
"loaded."); | |||
call_func<NetworkImplDft, void>("share_runtime_memory_with", | |||
network_impl_dst, | |||
NetworkHelper::implement(src_network)); | |||
return; | |||
} | |||
LITE_THROW("share_runtime_memory_with is not aviliable in the backend."); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Runtime::enable_io_txt_dump(std::shared_ptr<Network> network, | |||
std::string io_txt_out_file) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
auto network_impl = NetworkHelper::implement(network); | |||
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
call_func<NetworkImplDft, void>("enable_io_txt_dump", network_impl, | |||
io_txt_out_file); | |||
return; | |||
} | |||
LITE_THROW("enable_io_txt_dump is not aviliable in the backend."); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Runtime::enable_io_bin_dump(std::shared_ptr<Network> network, | |||
std::string io_bin_out_dir) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
auto network_impl = NetworkHelper::implement(network); | |||
if (network_impl->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
call_func<NetworkImplDft, void>("enable_io_bin_dump", network_impl, | |||
io_bin_out_dir); | |||
return; | |||
} | |||
LITE_THROW("enable_io_bin_dump is not aviliable in the backend."); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Runtime::shared_weight_with_network( | |||
std::shared_ptr<Network> dst_network, | |||
const std::shared_ptr<Network> src_network) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
auto network_impl_dst = NetworkHelper::implement(dst_network); | |||
if (network_impl_dst->get_backend_type() == LiteBackend::LITE_DEFAULT) { | |||
LITE_ASSERT(NetworkHelper::loaded(src_network), | |||
"shared_weight_with_network should be used after the src " | |||
"network " | |||
"loaded."); | |||
auto src_implement = NetworkHelper::implement(src_network);
call_func<NetworkImplDft, void>("shared_weight_with", network_impl_dst,
src_implement);
NetworkHelper::loaded(dst_network, true); | |||
return; | |||
} | |||
LITE_THROW("shared_weight_with_network is not aviliable in the backend."); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
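//! A minimal usage sketch of the Network/Runtime API implemented above. Only
//! calls defined in this file are exercised after loading; the construction
//! and model-loading lines are assumptions about the API defined earlier in
//! this file, and the model path and tensor name are hypothetical.
//! \code
//! auto network = std::make_shared<Network>();       // assumed constructor
//! Runtime::set_cpu_threads_number(network, 4);      // must precede loading
//! network->load_model("./shufflenet.mge");          // hypothetical path
//! auto input = network->get_io_tensor("data");      // hypothetical io name
//! // ... fill the input tensor ...
//! network->forward();
//! network->wait();
//! auto output = network->get_output_tensor(0);
//! \endcode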
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,161 @@ | |||
/** | |||
* \file src/network_impl_base.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "lite/network.h" | |||
#include "misc.h" | |||
#include "tensor_impl_base.h" | |||
#include "type_info.h" | |||
#include <unordered_map> | |||
namespace lite { | |||
/*! | |||
* \brief the Inner IO data struct, add some inner data from IO | |||
*/ | |||
class IOInner : public IO { | |||
public: | |||
//! flag whether the corresponding lite_tensor has been filled: have_sync is
//! true once the value of lite_tensor is filled, otherwise false. This is
//! used in async mode.
bool have_sync = false; | |||
//! Real input and output data location | |||
std::shared_ptr<Tensor> lite_tensor = nullptr; | |||
IOInner() = default; | |||
IOInner(const IO& io) { | |||
name = io.name; | |||
is_host = io.is_host; | |||
io_type = io.io_type; | |||
config_layout = io.config_layout; | |||
} | |||
}; | |||
/*! | |||
* \brief the real network IO info used when the network runs
*/ | |||
struct NetworkIOInner { | |||
std::vector<IOInner> inputs; | |||
std::vector<IOInner> outputs; | |||
}; | |||
/*! | |||
* \brief implement the Network, contain the mgb related member | |||
*/ | |||
class Network::NetworkImplBase : public DynTypeObj { | |||
public: | |||
virtual ~NetworkImplBase() = default; | |||
//! set the config of the network, including:
//! the inference device
//! other inference options, such as record_level, weight_preprocess...
virtual void set_config(const Config& config) = 0; | |||
//! set the special io information; if not set, the default io tensors will
//! be used. This is for the case where the input/output is not a host
//! tensor; by default the input/output tensors are host tensors
virtual void set_io(const NetworkIO& network_io) = 0; | |||
//! only compute the output tensors configured by the user
virtual void compute_only_configured_output() = 0; | |||
//! get the network input and output tensor; its layout is
//! synced from the mge tensor
virtual std::shared_ptr<Tensor> get_io_tensor( | |||
std::string io_name, | |||
LiteTensorPhase phase = LiteTensorPhase::LITE_IO) = 0; | |||
//! get the input tensor by index in the load_result tensormap | |||
virtual std::shared_ptr<Tensor> get_input_tensor(size_t index) = 0; | |||
//! get the output tensor by index in the load_result output_var_list | |||
virtual std::shared_ptr<Tensor> get_output_tensor(size_t index) = 0; | |||
//! get all the input tensor name in the order in load return | |||
virtual std::vector<const char*> get_all_input_name() const = 0; | |||
//! get all the output tensor name in the order in load return | |||
virtual std::vector<const char*> get_all_output_name() const = 0; | |||
//! get the input tensor name in the order in load return | |||
virtual const char* get_input_name(size_t index) const = 0; | |||
//! get the output tensor name in the order in load return | |||
virtual const char* get_output_name(size_t index) const = 0; | |||
//! set the callback used in async mode
virtual void set_async_callback(const AsyncCallback& callback) = 0; | |||
//! set the start callback which will execute before network forward | |||
virtual void set_start_callback(const StartCallback& callback) = 0; | |||
//! set the finish callback which will execute after network forward | |||
virtual void set_finish_callback(const FinishCallback& callback) = 0; | |||
//! load the model and get the m_load_result | |||
virtual void load_model(std::shared_ptr<void> model_mem, size_t size, | |||
std::unordered_map<std::string, LiteAny> | |||
separate_config_map = {}) = 0; | |||
//! forward the network with filled input data and fill the output data | |||
//! to the output tensor | |||
virtual void forward() = 0; | |||
//! in sync mode, wait until the inference finishes
virtual void wait() = 0; | |||
//! set device id, default device id = 0 | |||
virtual void set_device_id(int device_id) = 0; | |||
virtual int get_device_id() const = 0; | |||
virtual LiteBackend get_backend_type() const = 0; | |||
//! set stream id, default stream id = 0 | |||
virtual void set_stream_id(int stream_id) = 0; | |||
virtual int get_stream_id() const = 0; | |||
virtual LiteDeviceType get_device_type() const = 0; | |||
//! enable profiling the network; a profile file will be generated
virtual void enable_profile_performance(std::string profile_file_path) = 0; | |||
}; | |||
/******************************** friend class *****************************/ | |||
/*! | |||
* \brief friend class of Network, for conveniently accessing the Network members
*/ | |||
class NetworkHelper { | |||
public: | |||
static bool loaded(const std::shared_ptr<Network> network) { | |||
LITE_ASSERT(network); | |||
return network->m_loaded; | |||
} | |||
static void loaded(const std::shared_ptr<Network> network, bool loaded) { | |||
LITE_ASSERT(network); | |||
network->m_loaded = loaded; | |||
} | |||
static Network::NetworkImplBase* implement(const Network* network) { | |||
LITE_ASSERT(network); | |||
return network->m_impl.get(); | |||
} | |||
static Network::NetworkImplBase* implement( | |||
const std::shared_ptr<Network> network) { | |||
LITE_ASSERT(network); | |||
return network->m_impl.get(); | |||
} | |||
static void implement(const std::shared_ptr<Network> network, | |||
std::unique_ptr<Network::NetworkImplBase> impl) { | |||
LITE_ASSERT(network); | |||
network->m_impl = std::move(impl); | |||
} | |||
}; | |||
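//! Sketch of how these friend helpers are used by the Runtime layer in
//! network.cpp: the backend implementation is fetched via NetworkHelper and
//! then dispatched by name through call_func. NetworkImplDft and call_func
//! are declared elsewhere in this library; the call below mirrors
//! Runtime::use_tensorrt.
//! \code
//! auto impl = NetworkHelper::implement(network);
//! if (impl->get_backend_type() == LiteBackend::LITE_DEFAULT) {
//!     call_func<NetworkImplDft, void>("use_tensorrt", impl);
//! }
//! \endcode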
} // namespace lite | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,246 @@ | |||
/** | |||
* \file src/parse_info/default_parse.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "../misc.h" | |||
#include "lite/global.h" | |||
#include "lite/network.h" | |||
#include "nlohmann/json.hpp" | |||
namespace lite { | |||
//! The LITE_default parse info function | |||
bool default_parse_info( | |||
const void* info_ptr, size_t length, const std::string& model_name, | |||
Config& config, NetworkIO& network_io, | |||
std::unordered_map<std::string, LiteAny>& separate_config_map, | |||
std::string& extra_info) { | |||
using json = nlohmann::json; | |||
std::string json_string(static_cast<const char*>(info_ptr), length); | |||
auto info = json::parse(json_string); | |||
if (!info["valid"]) { | |||
return false; | |||
} | |||
auto info_model_name = info["name"]; | |||
if (info_model_name != model_name) { | |||
LITE_THROW(
ssprintf("the model name in the json info does not match: the packed "
"model is %s, but the json info gives %s.",
model_name.c_str(),
static_cast<std::string>(info_model_name).c_str()));
} | |||
//! check version | |||
std::string model_version = info["version"]; | |||
int major = std::stoi(model_version.substr(0, model_version.find("."))); | |||
int start = model_version.find(".") + 1; | |||
int minor = std::stoi(
model_version.substr(start, model_version.find(".", start) - start));
start = model_version.find(".", start) + 1; | |||
int patch = std::stoi(model_version.substr(start)); | |||
int lite_major, lite_minor, lite_patch; | |||
lite::get_version(lite_major, lite_minor, lite_patch); | |||
size_t model_version_sum = (major * 10000 + minor) * 100 + patch; | |||
size_t lite_version_sum = | |||
(lite_major * 10000 + lite_minor) * 100 + lite_patch; | |||
if (model_version_sum > lite_version_sum) { | |||
LITE_WARN("Lite load the future version model !!!!!!!!!!!!!"); | |||
} | |||
if (info.contains("has_compression")) { | |||
config.has_compression = info["has_compression"]; | |||
} | |||
if (info.contains("backend")) { | |||
if (info["backend"] == "MGE") { | |||
config.backend = LiteBackend::LITE_DEFAULT; | |||
} | |||
if (info["backend"] == "RK") { | |||
config.backend = LiteBackend::LITE_RK_NPU; | |||
} | |||
} | |||
auto get_device_type = [](std::string type) -> LiteDeviceType { | |||
if (type == "CPU") | |||
return LiteDeviceType::LITE_CPU; | |||
if (type == "CUDA") | |||
return LiteDeviceType::LITE_CUDA; | |||
if (type == "OPENCL") | |||
return LiteDeviceType::LITE_OPENCL; | |||
if (type == "ATLAS") | |||
return LiteDeviceType::LITE_ATLAS; | |||
if (type == "NPU") | |||
return LiteDeviceType::LITE_NPU; | |||
else { | |||
LITE_THROW(ssprintf("LITE not support device type of %s.", | |||
type.c_str())); | |||
} | |||
}; | |||
if (info.contains("device")) { | |||
auto device_json = info["device"]; | |||
config.device_type = get_device_type(device_json["type"]); | |||
if (device_json.contains("device_id")) { | |||
separate_config_map["device_id"] = | |||
static_cast<int>(device_json["device_id"]); | |||
} | |||
if (device_json.contains("number_threads")) { | |||
separate_config_map["number_threads"] = | |||
static_cast<size_t>(device_json["number_threads"]); | |||
} | |||
if (device_json.contains("enable_inplace_model")) { | |||
separate_config_map["enable_inplace_model"] = | |||
static_cast<bool>(device_json["enable_inplace_model"]); | |||
} | |||
if (device_json.contains("use_tensorrt")) { | |||
separate_config_map["use_tensorrt"] = | |||
static_cast<bool>(device_json["use_tensorrt"]); | |||
} | |||
} | |||
//! options | |||
if (info.contains("options")) { | |||
auto options = info["options"]; | |||
if (options.contains("weight_preprocess")) | |||
config.options.weight_preprocess = options["weight_preprocess"]; | |||
if (options.contains("fuse_preprocess")) | |||
config.options.fuse_preprocess = options["fuse_preprocess"]; | |||
if (options.contains("fake_next_exec")) | |||
config.options.fake_next_exec = options["fake_next_exec"]; | |||
if (options.contains("var_sanity_check_first_run")) | |||
config.options.var_sanity_check_first_run = | |||
options["var_sanity_check_first_run"]; | |||
if (options.contains("const_shape")) | |||
config.options.const_shape = options["const_shape"]; | |||
if (options.contains("force_dynamic_alloc")) | |||
config.options.force_dynamic_alloc = options["force_dynamic_alloc"]; | |||
if (options.contains("force_output_dynamic_alloc")) | |||
config.options.force_output_dynamic_alloc = | |||
options["force_output_dynamic_alloc"]; | |||
if (options.contains("no_profiling_on_shape_change")) | |||
config.options.no_profiling_on_shape_change = | |||
options["no_profiling_on_shape_change"]; | |||
if (options.contains("jit_level")) | |||
config.options.jit_level = options["jit_level"]; | |||
if (options.contains("comp_node_seq_record_level")) | |||
config.options.comp_node_seq_record_level = | |||
options["comp_node_seq_record_level"]; | |||
if (options.contains("graph_opt_level")) | |||
config.options.graph_opt_level = options["graph_opt_level"]; | |||
if (options.contains("async_exec_level")) | |||
config.options.async_exec_level = options["async_exec_level"]; | |||
} | |||
//! IO | |||
auto get_io_type = [](std::string type) -> LiteIOType { | |||
if (type == "value") | |||
return LiteIOType::LITE_IO_VALUE; | |||
if (type == "shape") | |||
return LiteIOType::LITE_IO_SHAPE; | |||
else { | |||
LITE_THROW( | |||
ssprintf("LITE not support IO type of %s.", type.c_str())); | |||
} | |||
}; | |||
auto get_data_type = [](std::string type) -> LiteDataType { | |||
if (type == "float32") | |||
return LiteDataType::LITE_FLOAT; | |||
if (type == "float16") | |||
return LiteDataType::LITE_HALF; | |||
if (type == "int32") | |||
return LiteDataType::LITE_INT; | |||
if (type == "int16") | |||
return LiteDataType::LITE_INT16; | |||
if (type == "int8") | |||
return LiteDataType::LITE_INT8; | |||
if (type == "uint8") | |||
return LiteDataType::LITE_UINT8; | |||
else { | |||
LITE_THROW(ssprintf("LITE not support data type of %s.", | |||
type.c_str())); | |||
} | |||
}; | |||
#define SET_SHAPE(shape_json_, config_) \ | |||
do { \ | |||
int ndim = 0; \ | |||
for (int i = 0; i < 4; i++) { \ | |||
if (shape_json_.contains(shape_name[i])) { \ | |||
ndim++; \ | |||
config_.config_layout.shapes[i] = shape_json_[shape_name[i]]; \ | |||
} else { \ | |||
break; \ | |||
} \ | |||
} \ | |||
config_.config_layout.ndim = ndim; \ | |||
} while (0) | |||
#define Config_IO(io_json_, io_config_) \ | |||
if (io_json_.contains("is_host")) \ | |||
io_config_.is_host = io_json_["is_host"]; \ | |||
if (io_json_.contains("io_type")) \ | |||
io_config_.io_type = get_io_type(io_json_["io_type"]); \ | |||
if (io_json_.contains("dtype")) \ | |||
io_config_.config_layout.data_type = get_data_type(io_json_["dtype"]); \ | |||
if (io_json_.contains("shape")) { \ | |||
auto shape_json = io_json_["shape"]; \ | |||
SET_SHAPE(shape_json, io_config_); \ | |||
} | |||
const std::string shape_name[] = {"dim0", "dim1", "dim2", "dim3"}; | |||
if(info.contains("IO")){ | |||
auto IOs = info["IO"]; | |||
if(IOs.contains("inputs")){ | |||
auto inputs = IOs["inputs"]; | |||
for (size_t i = 0; i < inputs.size(); i++) { | |||
auto input_json = inputs[i]; | |||
bool found = false; | |||
for (auto&& io_config : network_io.inputs) { | |||
if (io_config.name == input_json["name"]) { | |||
found = true; | |||
Config_IO(input_json, io_config); | |||
} | |||
} | |||
if (!found) { | |||
IO input; | |||
input.name = input_json["name"]; | |||
Config_IO(input_json, input); | |||
network_io.inputs.push_back(input); | |||
} | |||
} | |||
} | |||
if (IOs.contains("outputs")) { | |||
auto outputs = IOs["outputs"]; | |||
for (size_t i = 0; i < outputs.size(); i++) { | |||
auto output_json = outputs[i]; | |||
bool found = false; | |||
for (auto&& io_config : network_io.outputs) { | |||
if (io_config.name == output_json["name"]) { | |||
found = true; | |||
Config_IO(output_json, io_config); | |||
} | |||
} | |||
if (!found) { | |||
IO output; | |||
output.name = output_json["name"]; | |||
Config_IO(output_json, output); | |||
network_io.outputs.push_back(output); | |||
} | |||
} | |||
} | |||
} | |||
//! extra_info | |||
if (info.contains("extra_info")) { | |||
extra_info = info["extra_info"].dump(); | |||
} | |||
return true; | |||
#undef SET_SHAPE
#undef Config_IO | |||
} | |||
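//! A hedged example of the JSON info document accepted by default_parse_info
//! above. The field names are exactly the keys checked in the code; the
//! concrete values and the model name are illustrative only.
//! \code{.json}
//! {
//!     "valid": true,
//!     "name": "shufflenet",
//!     "version": "8.9.1",
//!     "has_compression": false,
//!     "backend": "MGE",
//!     "device": {"type": "CPU", "device_id": 0, "number_threads": 4},
//!     "options": {"weight_preprocess": true, "graph_opt_level": 2},
//!     "IO": {
//!         "inputs": [{"name": "data", "io_type": "value", "dtype": "float32",
//!                     "shape": {"dim0": 1, "dim1": 3, "dim2": 224, "dim3": 224}}],
//!         "outputs": [{"name": "out", "is_host": true}]
//!     },
//!     "extra_info": {"author": "example"}
//! }
//! \endcode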
} // namespace lite | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,40 @@ | |||
/** | |||
* \file src/parse_info/parse_info_base.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "lite/global.h" | |||
#include "mutex" | |||
namespace lite { | |||
struct ParseInfoStaticData { | |||
std::unordered_map<std::string, ParseInfoFunc> parse_info_methods; | |||
LITE_MUTEX map_mutex; | |||
}; | |||
ParseInfoStaticData& parse_info_static_data(); | |||
template <int count> | |||
struct ParseInfoRegister; | |||
} // namespace lite | |||
#define REGIST_PARSE_INFO_FUNCTION(name_, func_) \ | |||
REGIST_PARSE_INFO_FUNCTION_WITH_NUM(__COUNTER__, name_, func_) | |||
#define REGIST_PARSE_INFO_FUNCTION_WITH_NUM(number_, name_, func_) \ | |||
template <> \ | |||
struct ParseInfoRegister<number_> { \ | |||
ParseInfoRegister() { register_parse_info_func(name_, func_); } \ | |||
}; \ | |||
namespace { \ | |||
ParseInfoRegister<number_> parse_info_##number_; \ | |||
} | |||
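// Example of registering a custom info parser with the macro above. The
// parser name and function are hypothetical; the signature is assumed to
// match lite::ParseInfoFunc, which mirrors default_parse_info in
// parse_info/default_parse.h.
//
//   bool my_parse_info(const void* info_ptr, size_t length,
//                      const std::string& model_name, lite::Config& config,
//                      lite::NetworkIO& network_io,
//                      std::unordered_map<std::string, lite::LiteAny>&
//                              separate_config_map,
//                      std::string& extra_info);
//   REGIST_PARSE_INFO_FUNCTION("my_json_parser", my_parse_info)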
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,134 @@ | |||
/** | |||
* \file src/model_parser.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "model_parser.h" | |||
#include "decryption/decrypt_base.h" | |||
#include "parse_info/parse_info_base.h" | |||
using namespace lite; | |||
using namespace model_parse; | |||
std::string ModelParser::sm_model_tag = "packed_model"; | |||
void ModelParser::parse_header() { | |||
size_t tag_length = sm_model_tag.size(); | |||
//! parse model tag | |||
const char* ptr = static_cast<char*>(m_model.get()); | |||
std::string tag(static_cast<const char*>(ptr), tag_length); | |||
if (sm_model_tag == tag) { | |||
m_is_bare_model = false; | |||
} else { | |||
//! if no tag, the model is bare model, return | |||
m_is_bare_model = true; | |||
return; | |||
} | |||
uint8_t* buffer = static_cast<uint8_t*>(m_model.get()) + tag_length; | |||
auto packed_model = GetPackModel(buffer); | |||
auto models = packed_model->models(); | |||
LITE_ASSERT(models->size() == 1, "Now only support one model"); | |||
auto model = models->Get(0); | |||
m_model_name = model->header()->name()->c_str(); | |||
m_model_decryption_name = | |||
model->header()->model_decryption_method()->c_str(); | |||
m_info_decryption_name = model->header()->info_decryption_method()->c_str(); | |||
m_info_parse_func_name = model->header()->info_parse_method()->c_str(); | |||
m_info = model->info(); | |||
m_model_data = model->data(); | |||
} | |||
bool ModelParser::parse_model_info( | |||
Config& network_config, NetworkIO& network_io, | |||
std::unordered_map<std::string, LiteAny>& isolated_config_map, | |||
std::string& extra_info) const { | |||
//! no model info, no parse, direct return | |||
if (m_is_bare_model || !m_info) { | |||
return false; | |||
} | |||
size_t info_length = m_info->data()->size(); | |||
const uint8_t* info_data = m_info->data()->Data(); | |||
//! decryption the info | |||
auto info_ptr = decrypt_memory(info_data, info_length, | |||
m_info_decryption_name, info_length); | |||
//! parse the info | |||
LITE_LOCK_GUARD(parse_info_static_data().map_mutex); | |||
auto it_parse = parse_info_static_data().parse_info_methods.find( | |||
m_info_parse_func_name); | |||
if (it_parse == parse_info_static_data().parse_info_methods.end()) { | |||
LITE_THROW(ssprintf("can't find model info parse function %s.", | |||
m_info_parse_func_name.c_str())); | |||
} | |||
auto model_info_parse_func = | |||
parse_info_static_data().parse_info_methods[m_info_parse_func_name]; | |||
//! call the registered parse function to fill the config and network io
if (model_info_parse_func) { | |||
model_info_parse_func(info_ptr.get(), info_length, m_model_name, | |||
network_config, network_io, isolated_config_map, | |||
extra_info); | |||
} else { | |||
LITE_THROW(ssprintf("model info parse function of %s is empty", | |||
m_info_parse_func_name.c_str())); | |||
} | |||
return true; | |||
} | |||
std::shared_ptr<void> ModelParser::parse_model(size_t& model_length, | |||
const Config& config) const { | |||
if (m_is_bare_model) { | |||
if (config.bare_model_cryption_name.size() == 0) { | |||
model_length = m_total_length; | |||
return m_model; | |||
} else { | |||
return decrypt_memory( | |||
static_cast<uint8_t*>(m_model.get()), m_total_length, | |||
config.bare_model_cryption_name, model_length); | |||
} | |||
} | |||
LITE_ASSERT(m_model_data, "packed model parse error!"); | |||
model_length = m_model_data->data()->size(); | |||
const uint8_t* model_data = m_model_data->data()->Data(); | |||
LITE_ASSERT(model_length > 0, "The loaded model is of zero length."); | |||
return decrypt_memory(model_data, model_length, m_model_decryption_name, | |||
model_length); | |||
} | |||
std::shared_ptr<void> ModelParser::decrypt_memory( | |||
const uint8_t* data, size_t length, const std::string decryption_name, | |||
size_t& result_length) const { | |||
const uint8_t* memory_ptr = data; | |||
if (decryption_name == "NONE") { | |||
result_length = length; | |||
return std::shared_ptr<void>(const_cast<uint8_t*>(memory_ptr), | |||
[](void*) {}); | |||
} | |||
LITE_LOCK_GUARD(decryption_static_data().map_mutex); | |||
auto it = decryption_static_data().decryption_methods.find(decryption_name); | |||
if (it == decryption_static_data().decryption_methods.end()) { | |||
LITE_THROW(ssprintf("The decryption method %s is not registed yet.", | |||
decryption_name.c_str())); | |||
} | |||
auto&& func = it->second.first; | |||
auto&& key = it->second.second; | |||
if (func) { | |||
auto model_vector = func(memory_ptr, length, *key); | |||
result_length = model_vector.size(); | |||
auto tmp_model_vector = | |||
new std::vector<uint8_t>(std::move(model_vector)); | |||
return std::shared_ptr<void>( | |||
tmp_model_vector->data(), | |||
[tmp_model_vector](void*) { delete tmp_model_vector; }); | |||
} else { | |||
LITE_THROW(ssprintf("No decryption function in %s method.", | |||
decryption_name.c_str())); | |||
} | |||
} | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,75 @@ | |||
/** | |||
* \file src/model_parser.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "lite/global.h" | |||
#include "../network_impl_base.h" | |||
#include "pack_model_generated.h" | |||
#include <flatbuffers/flatbuffers.h> | |||
#include <unordered_map> | |||
namespace lite { | |||
/*! | |||
* \brief parse the model and decrypt it
*/ | |||
class ModelParser { | |||
public: | |||
ModelParser(std::shared_ptr<void> model_ptr, size_t model_length) | |||
: m_model(model_ptr), m_total_length(model_length) { | |||
//! parse the header | |||
parse_header(); | |||
} | |||
//! parse the Info part of the model, update the network_config and | |||
//! network_io | |||
bool parse_model_info( | |||
Config& network_config, NetworkIO& network_io, | |||
std::unordered_map<std::string, LiteAny>& isolated_config_map, | |||
std::string& extra_info) const; | |||
//! parse the model and decrypt the model | |||
std::shared_ptr<void> parse_model(size_t& model_length, | |||
const Config& config) const; | |||
private: | |||
//! parse the header of the model and store the model-related information
//! in the member data
void parse_header(); | |||
//! decrypt a memory buffer of the given length with the decryption method
//! named decryption_name
std::shared_ptr<void> decrypt_memory(const uint8_t* data, size_t length, | |||
const std::string decryption_name, | |||
size_t& result_length) const; | |||
private: | |||
std::string m_model_name; | |||
//! the info and model decryption method names; the
//! decryption functions can be found through these names
std::string m_info_decryption_name; | |||
std::string m_model_decryption_name; | |||
//! the function name to parse the model info | |||
std::string m_info_parse_func_name; | |||
//! if a model has no packed json info and is not encrypted, the
//! model is a bare model
bool m_is_bare_model = true; | |||
const model_parse::ModelInfo* m_info = nullptr; | |||
const model_parse::ModelData* m_model_data = nullptr; | |||
std::shared_ptr<void> m_model; | |||
size_t m_total_length; | |||
static std::string sm_model_tag; | |||
}; | |||
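//! A minimal usage sketch of ModelParser, assuming the caller already holds
//! the (possibly packed and encrypted) model bytes in memory; variable names
//! are illustrative and the surrounding loading code lives elsewhere in this
//! library.
//! \code
//! ModelParser parser(model_memory, model_size);
//! Config config;
//! NetworkIO network_io;
//! std::unordered_map<std::string, LiteAny> isolated_config_map;
//! std::string extra_info;
//! parser.parse_model_info(config, network_io, isolated_config_map,
//!                         extra_info);
//! size_t model_length = 0;
//! auto model_data = parser.parse_model(model_length, config);
//! \endcode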
} // namespace lite | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,28 @@ | |||
namespace model_parse; | |||
table ModelHeader { | |||
name:string; | |||
info_decryption_method:string; | |||
info_parse_method:string; | |||
model_decryption_method:string; | |||
} | |||
table ModelInfo { | |||
data:[ubyte]; | |||
} | |||
table ModelData { | |||
data:[ubyte]; | |||
} | |||
table Model { | |||
header:ModelHeader; | |||
info:ModelInfo; | |||
data:ModelData; | |||
} | |||
table PackModel { | |||
models:[Model]; | |||
} | |||
root_type PackModel; |
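// Layout note (see src/model_parser.cpp): a packed model file consists of the
// ASCII tag "packed_model" immediately followed by a PackModel flatbuffer, and
// the current parser only accepts a PackModel holding exactly one Model.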
@@ -0,0 +1,339 @@ | |||
/** | |||
* \file src/tensor.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "lite/tensor.h" | |||
#include "function_base.h" | |||
#include "tensor_impl_base.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "megbrain/comp_node.h" | |||
#include "megbrain/tensor.h" | |||
#include "mge/function_dft.h" | |||
#include "mge/tensor_impl.h" | |||
#endif | |||
#include <memory> | |||
using namespace lite; | |||
size_t Layout::get_elem_size() const { | |||
size_t elesize = 1; | |||
switch (data_type) { | |||
case LiteDataType::LITE_INT64: | |||
elesize = 8; | |||
break; | |||
case LiteDataType::LITE_FLOAT: | |||
case LiteDataType::LITE_INT: | |||
case LiteDataType::LITE_UINT: | |||
elesize = 4; | |||
break; | |||
case LiteDataType::LITE_HALF: | |||
case LiteDataType::LITE_INT16: | |||
case LiteDataType::LITE_UINT16: | |||
elesize = 2; | |||
break; | |||
case LiteDataType::LITE_INT8: | |||
case LiteDataType::LITE_UINT8: | |||
elesize = 1; | |||
break; | |||
default: | |||
LITE_THROW("not support data type."); | |||
} | |||
return elesize; | |||
} | |||
bool Layout::operator==(const Layout& other) const { | |||
bool equal = true; | |||
equal &= (ndim == other.ndim); | |||
equal &= (data_type == other.data_type); | |||
for (size_t i = 0; i < ndim; i++) { | |||
equal &= (shapes[i] == other.shapes[i]); | |||
} | |||
return equal; | |||
} | |||
Tensor::~Tensor() = default; | |||
Tensor::Tensor() { | |||
LITE_ERROR_HANDLER_BEGIN | |||
m_tensor_impl = call_func<TensorImplDft, | |||
std::shared_ptr<lite::Tensor::TensorImplBase>>( | |||
"create_tensor"); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
Tensor::Tensor(LiteDeviceType device_type, bool is_pinned_host) | |||
: m_is_pinned_host(is_pinned_host), m_device_type(device_type) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
m_tensor_impl = call_func<TensorImplDft, | |||
std::shared_ptr<lite::Tensor::TensorImplBase>>( | |||
"create_tensor", device_type, is_pinned_host); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
Tensor::Tensor(LiteDeviceType device_type, const Layout& layout, | |||
bool is_pinned_host) | |||
: m_is_pinned_host(is_pinned_host), | |||
m_layout(layout), | |||
m_device_type(device_type) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
m_tensor_impl = call_func<TensorImplDft, | |||
std::shared_ptr<lite::Tensor::TensorImplBase>>( | |||
"create_tensor", device_type, layout, is_pinned_host); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
Tensor::Tensor(int device_id, LiteDeviceType device_type, const Layout& layout, | |||
bool is_pinned_host) | |||
: m_is_pinned_host(is_pinned_host), | |||
m_device_id(device_id), | |||
m_layout(layout), | |||
m_device_type(device_type) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
m_tensor_impl = call_func<TensorImplDft, | |||
std::shared_ptr<lite::Tensor::TensorImplBase>>( | |||
"create_tensor", device_id, device_type, layout, is_pinned_host); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
Tensor::Tensor(int device_id, int stream_id, LiteDeviceType device_type, | |||
bool is_pinned_host) | |||
: m_is_pinned_host(is_pinned_host), | |||
m_device_id(device_id), | |||
m_device_type(device_type) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
m_tensor_impl = call_func<TensorImplDft, | |||
std::shared_ptr<lite::Tensor::TensorImplBase>>( | |||
"create_tensor", device_id, stream_id, device_type, is_pinned_host); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
Tensor::Tensor(LiteBackend backend, LiteDeviceType device_type, int device_id, | |||
const Layout& layout, bool is_pinned_host) { | |||
if (backend == LiteBackend::LITE_DEFAULT) { | |||
m_tensor_impl = | |||
call_func<TensorImplDft, | |||
std::shared_ptr<lite::Tensor::TensorImplBase>>( | |||
"create_tensor", device_id, device_type, layout, | |||
is_pinned_host); | |||
} else { | |||
LITE_MARK_USED_VAR(device_type); | |||
LITE_MARK_USED_VAR(is_pinned_host); | |||
LITE_MARK_USED_VAR(layout); | |||
LITE_MARK_USED_VAR(device_id); | |||
LITE_THROW("unknow backend, enum id is : %d."); | |||
} | |||
} | |||
void Tensor::reshape(const std::vector<int>& shape) { | |||
LITE_ASSERT(m_layout.ndim > 0, "The tensor to be reshape is empty."); | |||
uint32_t length = shape.size(); | |||
LITE_ASSERT(length < Layout::MAXDIM, | |||
"The ndim of reshape input is too large."); | |||
Layout new_layout = m_layout; | |||
new_layout.ndim = length; | |||
size_t total_length = | |||
get_tensor_total_size_in_byte() / m_layout.get_elem_size(); | |||
uint32_t unfixed_number = 0; | |||
uint32_t unfixed_index = 0; | |||
for (uint32_t i = 0; i < length; i++) { | |||
if (shape[i] == -1) { | |||
unfixed_number += 1; | |||
unfixed_index = i; | |||
} else { | |||
LITE_ASSERT(shape[i] > 0, "The reshape inputs invalid."); | |||
new_layout.shapes[i] = shape[i]; | |||
} | |||
} | |||
LITE_ASSERT(unfixed_number <= 1, "The reshape inputs invalid."); | |||
if (unfixed_number) { | |||
size_t left = total_length; | |||
for (uint32_t i = 0; i < length; i++) { | |||
if (i == unfixed_index) { | |||
continue; | |||
} else { | |||
LITE_ASSERT(left > 0 && (left % new_layout.shapes[i] == 0), | |||
"The reshape inputs invalid."); | |||
left = left / new_layout.shapes[i]; | |||
} | |||
} | |||
LITE_ASSERT(left > 0, "The reshape inputs invalid."); | |||
new_layout.shapes[unfixed_index] = left; | |||
} | |||
size_t new_total = 1; | |||
for (uint32_t i = 0; i < length; i++) { | |||
new_total *= new_layout.shapes[i]; | |||
} | |||
LITE_ASSERT(new_total == total_length, "The reshape inputs invalid."); | |||
m_layout = new_layout; | |||
m_tensor_impl->reshape(m_layout); | |||
} | |||
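//! A small sketch of the reshape semantics implemented above: at most one
//! dimension may be -1 and its extent is inferred from the remaining element
//! count. The concrete shapes are illustrative only.
//! \code
//! // tensor currently holds a 1x3x224x224 float32 layout
//! tensor.reshape({1, -1});  // -1 is inferred as 3 * 224 * 224 = 150528
//! \endcode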
size_t Tensor::get_tensor_total_size_in_byte() const { | |||
LITE_ERROR_HANDLER_BEGIN | |||
size_t elemsize = m_layout.get_elem_size(); | |||
size_t total = m_layout.ndim == 0 ? 0 : 1; | |||
for (size_t i = 0; i < m_layout.ndim; i++) { | |||
total *= m_layout.shapes[i]; | |||
} | |||
return total * elemsize; | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void* Tensor::get_memory_ptr() const { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_ASSERT(m_layout.ndim != 0, | |||
"Tensor layout is not valid when get memory ptr."); | |||
return m_tensor_impl->get_memory_ptr(); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void* Tensor::get_memory_ptr(const std::vector<size_t>& idx) const { | |||
LITE_ERROR_HANDLER_BEGIN | |||
return m_tensor_impl->get_memory_ptr(idx); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
std::shared_ptr<Tensor> Tensor::slice(const std::vector<size_t>& start, | |||
const std::vector<size_t>& end, | |||
const std::vector<size_t>& step) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
auto ret = m_tensor_impl->slice(start, end, step); | |||
ret->update_from_implement(); | |||
return ret; | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Tensor::fill_zero() { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_ASSERT(m_layout.ndim > 0,
"fill_zero can't be applied to a tensor with an empty layout.");
m_tensor_impl->fill_zero(); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Tensor::share_memory_with(const Tensor& src_tensor) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_ASSERT(src_tensor.m_layout.ndim > 0,
"the tensor to be shared has an empty layout.");
m_tensor_impl->share_memory_with(src_tensor.m_tensor_impl.get()); | |||
update_from_implement(); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Tensor::set_layout(const Layout& layout) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
m_layout = layout; | |||
m_tensor_impl->set_layout(layout); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Tensor::reset(void* prepared_data, size_t data_length_in_byte) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_ASSERT(m_layout.ndim,
"Tensor layout is empty, please reset with a layout.");
LITE_ASSERT(data_length_in_byte >= get_tensor_total_size_in_byte(),
"the memory passed to reset is smaller than the tensor size.");
m_tensor_impl->reset(prepared_data); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Tensor::reset(void* prepared_data, const Layout& layout) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
m_layout = layout; | |||
m_tensor_impl->reset(prepared_data, layout); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
bool Tensor::is_continue_memory() const { | |||
LITE_ERROR_HANDLER_BEGIN | |||
return m_tensor_impl->is_continue_memory(); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Tensor::copy_from(const Tensor& src) { | |||
LITE_ERROR_HANDLER_BEGIN | |||
LITE_ASSERT(src.get_layout().ndim != 0,
"the src tensor layout is empty when copying a tensor.");
m_tensor_impl->copy_from(src.m_tensor_impl.get()); | |||
update_from_implement(); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void Tensor::update_from_implement() { | |||
LITE_ERROR_HANDLER_BEGIN | |||
m_layout = m_tensor_impl->get_layout(); | |||
m_device_type = m_tensor_impl->get_device_type(); | |||
m_device_id = m_tensor_impl->get_device_id(); | |||
m_is_pinned_host = m_tensor_impl->is_pinned_host(); | |||
LITE_ERROR_HANDLER_END | |||
} | |||
void LiteAny::type_missmatch(size_t expect, size_t get) const { | |||
LITE_THROW(ssprintf(
"The type stored in LiteAny does not match the visited type: the "
"stored type length is %zu, the visited type length is %zu.",
expect, get));
} | |||
std::shared_ptr<Tensor> TensorUtils::concat(const std::vector<Tensor>& tensors, | |||
int dim, LiteDeviceType dst_device, | |||
int dst_device_id) { | |||
if (tensors.size() <= 0) { | |||
return std::make_shared<Tensor>(); | |||
} | |||
if (dst_device == LiteDeviceType::LITE_DEVICE_DEFAULT) { | |||
dst_device = tensors.front().get_device_type(); | |||
} | |||
if (dst_device_id == -1) { | |||
dst_device_id = tensors.front().get_device_id(); | |||
} | |||
bool is_pinned_host = tensors.front().is_pinned_host(); | |||
auto layout = tensors.front().get_layout(); | |||
LITE_ASSERT(static_cast<int>(layout.ndim) > dim, | |||
"the dim in concat is error."); | |||
size_t sum_in_dim = layout.shapes[dim]; | |||
for (size_t i = 1; i < tensors.size(); ++i) { | |||
auto other_layout = tensors[i].get_layout(); | |||
LITE_ASSERT(other_layout.ndim == layout.ndim,
"the ndim of the tensors is not the same!");
LITE_ASSERT(other_layout.data_type == layout.data_type,
"the dtype of the tensors is not the same!");
for (size_t j = 0; j < other_layout.ndim; ++j) { | |||
if (dim == static_cast<int>(j)) { | |||
sum_in_dim += other_layout.shapes[j]; | |||
continue; | |||
} | |||
LITE_ASSERT(other_layout.shapes[j] == layout.shapes[j],
"the shape of the tensors is not the same!");
} | |||
} | |||
layout.shapes[dim] = sum_in_dim; | |||
auto result = std::make_shared<Tensor>(dst_device_id, dst_device, layout, | |||
is_pinned_host); | |||
size_t index = 0; | |||
std::vector<size_t> start(dim + 1, 0); | |||
std::vector<size_t> end(dim + 1, 0); | |||
for (int i = 0; i < dim; i++) { | |||
end[i] = layout.shapes[i]; | |||
} | |||
for (size_t i = 0; i < tensors.size(); ++i) { | |||
auto&& tensor = tensors[i]; | |||
auto layout = tensor.get_layout(); | |||
if (layout.shapes[dim] == 0) | |||
continue; | |||
start[dim] = index; | |||
end[dim] = index + layout.shapes[dim]; | |||
auto&& sub_dst = result->slice(start, end); | |||
sub_dst->copy_from(tensor); | |||
index += layout.shapes[dim]; | |||
} | |||
return result; | |||
} | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,101 @@ | |||
/** | |||
* \file src/tensor_impl_base.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "lite/tensor.h" | |||
#include "misc.h" | |||
#include "type_info.h" | |||
#include <unordered_map> | |||
namespace lite { | |||
/*! | |||
* \brief implement the Tensor | |||
*/ | |||
class Tensor::TensorImplBase : public DynTypeObj { | |||
public: | |||
virtual ~TensorImplBase() = default; | |||
virtual LiteDeviceType get_device_type() const = 0; | |||
virtual int get_device_id() const = 0; | |||
virtual LiteBackend get_backend_type() const = 0; | |||
virtual Layout get_layout() const = 0; | |||
virtual bool is_pinned_host() const = 0; | |||
virtual void* get_memory_ptr() const = 0; | |||
virtual void* get_memory_ptr(const std::vector<size_t>& idx) const = 0; | |||
virtual void set_layout(const Layout& layout) = 0; | |||
//! use the user-allocated data to reset the memory of the tensor; the
//! memory will not be managed by lite, so the user must free it
//! later.
virtual void reset(void* prepared_data) = 0; | |||
//! use the user-allocated data and the corresponding layout to reset the
//! data and layout of the tensor; the memory will not be managed by lite,
//! so the user must free it later.
//! reshape the tensor with new shape, keep the data_type the same | |||
virtual void reshape(const Layout& layout) = 0; | |||
//! get a new tensor slice from the origin tensor | |||
virtual std::shared_ptr<Tensor> slice( | |||
const std::vector<size_t>& start, const std::vector<size_t>& end, | |||
const std::vector<size_t>& step = {}) = 0; | |||
//! set the tensor memory with zero | |||
virtual void fill_zero() = 0; | |||
//! copy tensor from another tensor
//! Note: the best way to copy a tensor is to just set the dst device and
//! leave the layout empty; when copying, the dst layout will be set to
//! the same as the src layout
virtual void copy_from(const TensorImplBase* src_impl) = 0; | |||
//! share memory with other tensor | |||
virtual void share_memory_with(const TensorImplBase* src_impl) = 0; | |||
//! whether the memory of the tensor is contiguous
virtual bool is_continue_memory() const = 0; | |||
}; | |||
/*! | |||
* \brief friend class of Tensor, for conveniently accessing the Tensor members
*/ | |||
class TensorHelper { | |||
public: | |||
static inline std::shared_ptr<Tensor::TensorImplBase> implement( | |||
const std::shared_ptr<Tensor> tensor) { | |||
LITE_ASSERT(tensor); | |||
return tensor->m_tensor_impl; | |||
} | |||
static inline std::shared_ptr<Tensor::TensorImplBase> implement( | |||
const Tensor* tensor) { | |||
LITE_ASSERT(tensor); | |||
return tensor->m_tensor_impl; | |||
} | |||
static inline void implement(const std::shared_ptr<Tensor> tensor, | |||
std::shared_ptr<Tensor::TensorImplBase> impl) { | |||
LITE_ASSERT(tensor); | |||
tensor->m_tensor_impl = impl; | |||
} | |||
}; | |||
} // namespace lite | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,97 @@ | |||
/** | |||
* \file src/type_info.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "misc.h" | |||
namespace lite { | |||
/*! | |||
* \brief an object to represent a type | |||
* | |||
* LITE has a lightweight RTTI system. Each type is represented by the | |||
* address of a Typeinfo object, which is stored in the .bss segment. | |||
* | |||
* LITE_TYPEINFO_OBJ_DECL should be placed into the definition of classes that | |||
* need compile-time type support. | |||
* | |||
* For classes that need RTTI, they should be derived from DynTypeObj | |||
*/ | |||
struct Typeinfo { | |||
//! name of the corresponding type; nullptr if LITE_ENABLE_LOGGING==0
const char* const name; | |||
/*! | |||
* \brief whether this is the type of given object | |||
* \tparam T a class with static typeinfo() method | |||
*/ | |||
template <typename T> | |||
bool is() const { | |||
return T::typeinfo() == this; | |||
} | |||
}; | |||
/*! | |||
* \brief base class to emulate RTTI without compiler support | |||
*/ | |||
class DynTypeObj { | |||
public: | |||
virtual Typeinfo* dyn_typeinfo() const = 0; | |||
//! cast this to a final object with type check | |||
template <class T> | |||
T& cast_final_safe() { | |||
LITE_ASSERT(T::typeinfo() == dyn_typeinfo(), | |||
"can not convert type %s to %s", dyn_typeinfo()->name, | |||
T::typeinfo()->name); | |||
return *static_cast<T*>(this); | |||
} | |||
template <class T> | |||
const T& cast_final_safe() const { | |||
return const_cast<DynTypeObj*>(this)->cast_final_safe<T>(); | |||
} | |||
//! check whether this is same to given type | |||
template <class T> | |||
bool same_type() const { | |||
return dyn_typeinfo() == T::typeinfo(); | |||
} | |||
protected: | |||
~DynTypeObj() = default; | |||
}; | |||
//! put in the declaration of a final class inherited from DynTypeObj | |||
#define LITE_DYN_TYPE_OBJ_FINAL_DECL \ | |||
public: \ | |||
::lite::Typeinfo* dyn_typeinfo() const override final; \ | |||
static inline ::lite::Typeinfo* typeinfo() { return &sm_typeinfo; } \ | |||
\ | |||
private: \ | |||
static ::lite::Typeinfo sm_typeinfo | |||
#if LITE_ENABLE_LOGGING | |||
//! get class name from class object | |||
#define _LITE_TYPEINFO_CLASS_NAME(_cls) #_cls | |||
#else | |||
#define _LITE_TYPEINFO_CLASS_NAME(_cls) nullptr | |||
#endif | |||
//! put in the impl file of a class that needs static typeinfo() | |||
#define LITE_TYPEINFO_OBJ_IMPL(_cls) \ | |||
::lite::Typeinfo _cls::sm_typeinfo { _LITE_TYPEINFO_CLASS_NAME(_cls) } | |||
//! put in the impl file of a final class inherited from DynTypeObj | |||
#define LITE_DYN_TYPE_OBJ_FINAL_IMPL(_cls) \ | |||
::lite::Typeinfo* _cls::dyn_typeinfo() const { return &sm_typeinfo; } \ | |||
LITE_TYPEINFO_OBJ_IMPL(_cls) | |||
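//! A hedged example of how the macros above are intended to be used; the
//! class name is borrowed from the MGE backend (mge/tensor_impl.h), but
//! whether it uses exactly this pattern is an assumption here. The DECL goes
//! into the class body, the IMPL into exactly one source file.
//! \code
//! // header
//! class TensorImplDft final : public Tensor::TensorImplBase {
//!     LITE_DYN_TYPE_OBJ_FINAL_DECL;
//!     // ...
//! };
//! // source file
//! LITE_DYN_TYPE_OBJ_FINAL_IMPL(TensorImplDft);
//!
//! // usage through the RTTI helpers of DynTypeObj
//! Tensor::TensorImplBase* impl = get_impl();  // hypothetical accessor
//! if (impl->same_type<TensorImplDft>()) {
//!     auto& dft = impl->cast_final_safe<TensorImplDft>();
//! }
//! \endcode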
} // namespace lite | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,10 @@ | |||
{ | |||
global: | |||
extern "C++" {lite::*;}; | |||
Lite*; | |||
LITE*; | |||
default_config; | |||
default_network_io; | |||
local: *; | |||
}; |
@@ -0,0 +1,23 @@ | |||
if (MGE_WITH_TEST) | |||
file (GLOB_RECURSE SOURCES ./*.cpp main.cpp) | |||
add_executable (lite_test ${SOURCES}) | |||
target_link_libraries(lite_test gtest) | |||
target_link_libraries(lite_test lite_static) | |||
if(LITE_BUILD_WITH_MGE) | |||
# lite_test depends on the megbrain interface
target_link_libraries(lite_test megbrain) | |||
endif() | |||
if(UNIX) | |||
if(APPLE OR ANDROID) | |||
target_link_libraries(lite_test dl) | |||
else() | |||
target_link_libraries(lite_test dl rt) | |||
endif() | |||
endif() | |||
install (TARGETS lite_test | |||
EXPORT ${LITE_EXPORT_TARGETS} | |||
RUNTIME DESTINATION lite/bin) | |||
endif() |
@@ -0,0 +1,33 @@ | |||
/** | |||
* \file test/main.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include <gtest/gtest.h> | |||
#include "../src/misc.h" | |||
#include "lite/global.h" | |||
namespace { | |||
class ResetSeedListener : public ::testing::EmptyTestEventListener { | |||
void OnTestStart(const ::testing::TestInfo&) override {} | |||
}; | |||
} // namespace | |||
int main(int argc, char** argv) { | |||
ResetSeedListener listener; | |||
auto&& listeners = ::testing::UnitTest::GetInstance()->listeners(); | |||
::testing::InitGoogleTest(&argc, argv); | |||
listeners.Append(&listener); | |||
lite::set_log_level(LiteLogLevel::WARN); | |||
auto ret = RUN_ALL_TESTS(); | |||
listeners.Release(&listener); | |||
return ret; | |||
} | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,638 @@ | |||
/* | |||
Copyright 2017 Leon Merten Lohse | |||
Permission is hereby granted, free of charge, to any person obtaining a copy | |||
of this software and associated documentation files (the "Software"), to deal | |||
in the Software without restriction, including without limitation the rights | |||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
copies of the Software, and to permit persons to whom the Software is | |||
furnished to do so, subject to the following conditions: | |||
The above copyright notice and this permission notice shall be included in | |||
all copies or substantial portions of the Software. | |||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
SOFTWARE. | |||
*/ | |||
/* | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#ifndef NPY_H | |||
#define NPY_H | |||
#include <algorithm> | |||
#include <complex> | |||
#include <cstdint> | |||
#include <cstring> | |||
#include <fstream> | |||
#include <iostream> | |||
#include <regex> | |||
#include <sstream> | |||
#include <stdexcept> | |||
#include <string> | |||
#include <unordered_map> | |||
#include <vector> | |||
namespace npy { | |||
/* Compile-time test for byte order. | |||
If your compiler does not define these per default, you may want to define | |||
one of these constants manually. | |||
Defaults to little endian order. */ | |||
#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || \ | |||
defined(__BIG_ENDIAN__) || defined(__ARMEB__) || \ | |||
defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(_MIBSEB) || \ | |||
defined(__MIBSEB) || defined(__MIBSEB__) | |||
const bool big_endian = true; | |||
#else | |||
const bool big_endian = false; | |||
#endif | |||
const char magic_string[] = "\x93NUMPY"; | |||
const size_t magic_string_length = 6; | |||
const char little_endian_char = '<'; | |||
const char big_endian_char = '>'; | |||
const char no_endian_char = '|'; | |||
constexpr char host_endian_char = | |||
(big_endian ? big_endian_char : little_endian_char); | |||
/* npy array length */ | |||
typedef unsigned long int ndarray_len_t; | |||
inline void write_magic(std::ostream& ostream, unsigned char v_major = 1, | |||
unsigned char v_minor = 0) { | |||
ostream.write(magic_string, magic_string_length); | |||
ostream.put(v_major); | |||
ostream.put(v_minor); | |||
} | |||
inline void read_magic(std::istream& istream, unsigned char& v_major, | |||
unsigned char& v_minor) { | |||
char buf[magic_string_length + 2]; | |||
istream.read(buf, magic_string_length + 2); | |||
if (!istream) { | |||
fprintf(stderr, "io error: failed reading file"); | |||
} | |||
if (0 != std::memcmp(buf, magic_string, magic_string_length)) { | |||
fprintf(stderr, "this file does not have a valid npy format."); | |||
} | |||
v_major = buf[magic_string_length]; | |||
v_minor = buf[magic_string_length + 1]; | |||
} | |||
// typestring magic | |||
struct Typestring { | |||
private: | |||
char c_endian; | |||
char c_type; | |||
int len; | |||
public: | |||
inline std::string str() { | |||
const size_t max_buflen = 16; | |||
char buf[max_buflen]; | |||
std::sprintf(buf, "%c%c%u", c_endian, c_type, len); | |||
return std::string(buf); | |||
} | |||
Typestring(const std::vector<float>&) | |||
: c_endian{host_endian_char}, c_type{'f'}, len{sizeof(float)} {} | |||
Typestring(const std::vector<double>&) | |||
: c_endian{host_endian_char}, c_type{'f'}, len{sizeof(double)} {} | |||
Typestring(const std::vector<long double>&) | |||
: c_endian{host_endian_char}, | |||
c_type{'f'}, | |||
len{sizeof(long double)} {} | |||
Typestring(const std::vector<char>&) | |||
: c_endian{no_endian_char}, c_type{'i'}, len{sizeof(char)} {} | |||
Typestring(const std::vector<short>&) | |||
: c_endian{host_endian_char}, c_type{'i'}, len{sizeof(short)} {} | |||
Typestring(const std::vector<int>&) | |||
: c_endian{host_endian_char}, c_type{'i'}, len{sizeof(int)} {} | |||
Typestring(const std::vector<long>&) | |||
: c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long)} {} | |||
Typestring(const std::vector<long long>&) | |||
: c_endian{host_endian_char}, c_type{'i'}, len{sizeof(long long)} {} | |||
Typestring(const std::vector<unsigned char>&) | |||
: c_endian{no_endian_char}, | |||
c_type{'u'}, | |||
len{sizeof(unsigned char)} {} | |||
Typestring(const std::vector<unsigned short>&) | |||
: c_endian{host_endian_char}, | |||
c_type{'u'}, | |||
len{sizeof(unsigned short)} {} | |||
Typestring(const std::vector<unsigned int>&) | |||
: c_endian{host_endian_char}, | |||
c_type{'u'}, | |||
len{sizeof(unsigned int)} {} | |||
Typestring(const std::vector<unsigned long>&) | |||
: c_endian{host_endian_char}, | |||
c_type{'u'}, | |||
len{sizeof(unsigned long)} {} | |||
Typestring(const std::vector<unsigned long long>&) | |||
: c_endian{host_endian_char}, | |||
c_type{'u'}, | |||
len{sizeof(unsigned long long)} {} | |||
Typestring(const std::vector<std::complex<float>>&) | |||
: c_endian{host_endian_char}, | |||
c_type{'c'}, | |||
len{sizeof(std::complex<float>)} {} | |||
Typestring(const std::vector<std::complex<double>>&) | |||
: c_endian{host_endian_char}, | |||
c_type{'c'}, | |||
len{sizeof(std::complex<double>)} {} | |||
Typestring(const std::vector<std::complex<long double>>&) | |||
: c_endian{host_endian_char}, | |||
c_type{'c'}, | |||
len{sizeof(std::complex<long double>)} {} | |||
}; | |||
inline void parse_typestring(std::string typestring) { | |||
std::regex re("'([<>|])([ifuc])(\\d+)'"); | |||
std::smatch sm; | |||
std::regex_match(typestring, sm, re); | |||
if (sm.size() != 4) { | |||
fprintf(stderr, "invalid typestring"); | |||
} | |||
} | |||
namespace pyparse { | |||
/** | |||
Removes leading and trailing whitespaces | |||
*/ | |||
inline std::string trim(const std::string& str) { | |||
const std::string whitespace = " \t"; | |||
auto begin = str.find_first_not_of(whitespace); | |||
if (begin == std::string::npos) | |||
return ""; | |||
auto end = str.find_last_not_of(whitespace); | |||
return str.substr(begin, end - begin + 1); | |||
} | |||
inline std::string get_value_from_map(const std::string& mapstr) { | |||
size_t sep_pos = mapstr.find_first_of(":"); | |||
if (sep_pos == std::string::npos) | |||
return ""; | |||
std::string tmp = mapstr.substr(sep_pos + 1); | |||
return trim(tmp); | |||
} | |||
/** | |||
Parses the string representation of a Python dict | |||
The keys need to be known and may not appear anywhere else in the data. | |||
*/ | |||
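/* Illustrative example (hypothetical input, not taken from the test data):
   with keys = {"descr", "shape"},
       parse_dict("{'descr': '<f4', 'shape': (2,)}", keys)
   returns {"descr" -> "'<f4'", "shape" -> "(2,)"}; the quotes around the
   'descr' value are kept here and stripped later by parse_str(). */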
inline std::unordered_map<std::string, std::string> parse_dict( | |||
std::string in, std::vector<std::string>& keys) { | |||
std::unordered_map<std::string, std::string> map; | |||
if (keys.size() == 0) | |||
return map; | |||
in = trim(in); | |||
// unwrap dictionary | |||
if ((in.front() == '{') && (in.back() == '}')) | |||
in = in.substr(1, in.length() - 2); | |||
else { | |||
fprintf(stderr, "Not a Python dictionary."); | |||
} | |||
std::vector<std::pair<size_t, std::string>> positions; | |||
for (auto const& value : keys) { | |||
size_t pos = in.find("'" + value + "'"); | |||
if (pos == std::string::npos) { | |||
fprintf(stderr, "Missing %s key.", value.c_str()); | |||
} | |||
std::pair<size_t, std::string> position_pair{pos, value}; | |||
positions.push_back(position_pair); | |||
} | |||
// sort by position in dict | |||
std::sort(positions.begin(), positions.end()); | |||
for (size_t i = 0; i < positions.size(); ++i) { | |||
std::string raw_value; | |||
size_t begin{positions[i].first}; | |||
size_t end{std::string::npos}; | |||
std::string key = positions[i].second; | |||
if (i + 1 < positions.size()) | |||
end = positions[i + 1].first; | |||
raw_value = in.substr(begin, end - begin); | |||
raw_value = trim(raw_value); | |||
if (raw_value.back() == ',') | |||
raw_value.pop_back(); | |||
map[key] = get_value_from_map(raw_value); | |||
} | |||
return map; | |||
} | |||
/** | |||
Parses the string representation of a Python boolean | |||
*/ | |||
inline bool parse_bool(const std::string& in) { | |||
if (in == "True") | |||
return true; | |||
if (in == "False") | |||
return false; | |||
fprintf(stderr, "Invalid python boolan."); | |||
return false; | |||
} | |||
/** | |||
Parses the string representation of a Python str | |||
*/ | |||
inline std::string parse_str(const std::string& in) { | |||
if ((in.front() == '\'') && (in.back() == '\'')) | |||
return in.substr(1, in.length() - 2); | |||
fprintf(stderr, "Invalid python string."); | |||
return ""; | |||
} | |||
/** | |||
Parses the string representation of a Python tuple into a vector of its items
*/ | |||
inline std::vector<std::string> parse_tuple(std::string in) { | |||
std::vector<std::string> v; | |||
const char separator = ',';
in = trim(in); | |||
if ((in.front() == '(') && (in.back() == ')')) | |||
in = in.substr(1, in.length() - 2); | |||
else { | |||
fprintf(stderr, "Invalid Python tuple."); | |||
} | |||
std::istringstream iss(in); | |||
for (std::string token; std::getline(iss, token, separator);) {
v.push_back(token); | |||
} | |||
return v; | |||
} | |||
template <typename T> | |||
inline std::string write_tuple(const std::vector<T>& v) { | |||
if (v.size() == 0) | |||
return ""; | |||
std::ostringstream ss; | |||
if (v.size() == 1) { | |||
ss << "(" << v.front() << ",)"; | |||
} else { | |||
const std::string delimiter = ", "; | |||
// v.size() > 1 | |||
ss << "("; | |||
std::copy(v.begin(), v.end() - 1, | |||
std::ostream_iterator<T>(ss, delimiter.c_str())); | |||
ss << v.back(); | |||
ss << ")"; | |||
} | |||
return ss.str(); | |||
} | |||
inline std::string write_boolean(bool b) { | |||
if (b) | |||
return "True"; | |||
else | |||
return "False"; | |||
} | |||
} // namespace pyparse | |||
inline void parse_header(std::string header, std::string& descr) { | |||
/* | |||
The first 6 bytes are the magic string, exactly "\x93NUMPY".
The next 1 byte is an unsigned byte: the major version number of the file
format, e.g. \x01. The next 1 byte is an unsigned byte: the minor version
number of the file format, e.g. \x00. Note: the version of the file format
is not tied to the version of the numpy package.
The next 2 bytes form a little-endian unsigned short int: the length of the
header data, HEADER_LEN.
The next HEADER_LEN bytes form the header data describing the array's
format. It is an ASCII string which contains a Python literal expression of
a dictionary. It is terminated by a newline ('\n') and padded with spaces
('\x20') so that the total length of the magic string + 4 + HEADER_LEN is
evenly divisible by 16 for alignment purposes. The dictionary contains
three keys, of which only "descr" is used by this overload:
"descr" : dtype.descr
    An object that can be passed as an argument to the numpy.dtype()
    constructor to create the array's dtype.
For repeatability and readability, this dictionary is formatted using
pprint.pformat() so the keys are in alphabetic order.
*/ | |||
// remove trailing newline | |||
if (header.back() != '\n') | |||
fprintf(stderr, "invalid header"); | |||
header.pop_back(); | |||
// parse the dictionary | |||
std::vector<std::string> keys{"descr"}; | |||
auto dict_map = npy::pyparse::parse_dict(header, keys); | |||
if (dict_map.size() == 0) | |||
fprintf(stderr, "invalid dictionary in header"); | |||
std::string descr_s = dict_map["descr"]; | |||
parse_typestring(descr_s); | |||
// strip the surrounding quotes from the typestring
descr = npy::pyparse::parse_str(descr_s); | |||
return; | |||
} | |||
inline void parse_header(std::string header, std::string& descr, | |||
bool& fortran_order, | |||
std::vector<ndarray_len_t>& shape) { | |||
/* | |||
The first 6 bytes are the magic string, exactly "\x93NUMPY".
The next 1 byte is an unsigned byte: the major version number of the file
format, e.g. \x01. The next 1 byte is an unsigned byte: the minor version
number of the file format, e.g. \x00. Note: the version of the file format
is not tied to the version of the numpy package.
The next 2 bytes form a little-endian unsigned short int: the length of the
header data, HEADER_LEN.
The next HEADER_LEN bytes form the header data describing the array's
format. It is an ASCII string which contains a Python literal expression of
a dictionary. It is terminated by a newline ('\n') and padded with spaces
('\x20') so that the total length of the magic string + 4 + HEADER_LEN is
evenly divisible by 16 for alignment purposes. The dictionary contains
three keys:
"descr" : dtype.descr
    An object that can be passed as an argument to the numpy.dtype()
    constructor to create the array's dtype.
"fortran_order" : bool
    Whether the array data is Fortran-contiguous or not. Since
    Fortran-contiguous arrays are a common form of non-C-contiguity, we
    allow them to be written directly to disk for efficiency.
"shape" : tuple of int
    The shape of the array.
For repeatability and readability, this dictionary is formatted using
pprint.pformat() so the keys are in alphabetic order.
*/ | |||
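/* Illustrative example of a version 1.0 header payload, matching the layout
   described above and the output of write_header_dict() below (dtype and
   shape are made up): the dictionary text
       "{'descr': '<f4', 'fortran_order': False, 'shape': (1, 1000), }"
   is followed by space padding and a terminating '\n' so that the total
   header size is a multiple of 16. */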
// remove trailing newline | |||
if (header.back() != '\n') | |||
fprintf(stderr, "invalid header"); | |||
header.pop_back(); | |||
// parse the dictionary | |||
std::vector<std::string> keys{"descr", "fortran_order", "shape"}; | |||
auto dict_map = npy::pyparse::parse_dict(header, keys); | |||
if (dict_map.size() == 0) | |||
fprintf(stderr, "invalid dictionary in header"); | |||
std::string descr_s = dict_map["descr"]; | |||
std::string fortran_s = dict_map["fortran_order"]; | |||
std::string shape_s = dict_map["shape"]; | |||
// TODO: extract info from typestring | |||
parse_typestring(descr_s); | |||
// strip the surrounding quotes from the typestring
descr = npy::pyparse::parse_str(descr_s); | |||
// convert literal Python bool to C++ bool | |||
fortran_order = npy::pyparse::parse_bool(fortran_s); | |||
// parse the shape tuple | |||
auto shape_v = npy::pyparse::parse_tuple(shape_s); | |||
if (shape_v.size() == 0) | |||
fprintf(stderr, "invalid shape tuple in header"); | |||
for (auto item : shape_v) { | |||
ndarray_len_t dim = static_cast<ndarray_len_t>(std::stoul(item)); | |||
shape.push_back(dim); | |||
} | |||
} | |||
inline std::string write_header_dict(const std::string& descr, | |||
bool fortran_order, | |||
const std::vector<ndarray_len_t>& shape) { | |||
std::string s_fortran_order = npy::pyparse::write_boolean(fortran_order); | |||
std::string shape_s = npy::pyparse::write_tuple(shape); | |||
return "{'descr': '" + descr + "', 'fortran_order': " + s_fortran_order + | |||
", 'shape': " + shape_s + ", }"; | |||
} | |||
inline void write_header(std::ostream& out, const std::string& descr, | |||
bool fortran_order, | |||
const std::vector<ndarray_len_t>& shape_v) { | |||
std::string header_dict = write_header_dict(descr, fortran_order, shape_v); | |||
size_t length = magic_string_length + 2 + 2 + header_dict.length() + 1; | |||
unsigned char version[2] = {1, 0}; | |||
if (length >= 255 * 255) { | |||
length = magic_string_length + 2 + 4 + header_dict.length() + 1; | |||
version[0] = 2; | |||
version[1] = 0; | |||
} | |||
size_t padding_len = 16 - length % 16; | |||
std::string padding(padding_len, ' '); | |||
// write magic | |||
write_magic(out, version[0], version[1]); | |||
// write header length | |||
if (version[0] == 1 && version[1] == 0) { | |||
char header_len_le16[2]; | |||
uint16_t header_len = static_cast<uint16_t>(header_dict.length() + | |||
padding.length() + 1); | |||
header_len_le16[0] = (header_len >> 0) & 0xff; | |||
header_len_le16[1] = (header_len >> 8) & 0xff; | |||
out.write(reinterpret_cast<char*>(header_len_le16), 2); | |||
} else { | |||
char header_len_le32[4]; | |||
uint32_t header_len = static_cast<uint32_t>(header_dict.length() + | |||
padding.length() + 1); | |||
header_len_le32[0] = (header_len >> 0) & 0xff; | |||
header_len_le32[1] = (header_len >> 8) & 0xff; | |||
header_len_le32[2] = (header_len >> 16) & 0xff; | |||
header_len_le32[3] = (header_len >> 24) & 0xff; | |||
out.write(reinterpret_cast<char*>(header_len_le32), 4); | |||
} | |||
out << header_dict << padding << '\n'; | |||
} | |||
inline std::string read_header(std::istream& istream) { | |||
// check magic bytes and version number
unsigned char v_major, v_minor; | |||
read_magic(istream, v_major, v_minor); | |||
uint32_t header_length = 0; | |||
if (v_major == 1 && v_minor == 0) { | |||
char header_len_le16[2]; | |||
istream.read(header_len_le16, 2); | |||
header_length = (header_len_le16[0] << 0) | (header_len_le16[1] << 8); | |||
if ((magic_string_length + 2 + 2 + header_length) % 16 != 0) { | |||
// TODO: display warning | |||
} | |||
} else if (v_major == 2 && v_minor == 0) { | |||
char header_len_le32[4]; | |||
istream.read(header_len_le32, 4); | |||
header_length = (header_len_le32[0] << 0) | (header_len_le32[1] << 8) | | |||
(header_len_le32[2] << 16) | (header_len_le32[3] << 24); | |||
if ((magic_string_length + 2 + 4 + header_length) % 16 != 0) { | |||
// TODO: display warning | |||
} | |||
} else { | |||
fprintf(stderr, "unsupported file format version"); | |||
} | |||
// allocate (not just reserve) the buffer before reading into it
std::vector<char> buf_v(header_length);
istream.read(buf_v.data(), header_length);
std::string header(buf_v.data(), header_length); | |||
return header; | |||
} | |||
inline ndarray_len_t comp_size(const std::vector<ndarray_len_t>& shape) { | |||
ndarray_len_t size = 1; | |||
for (ndarray_len_t i : shape) | |||
size *= i; | |||
return size; | |||
} | |||
template <typename Scalar> | |||
inline void SaveArrayAsNumpy(const std::string& filename, bool fortran_order, | |||
unsigned int n_dims, const unsigned long shape[], | |||
const std::vector<Scalar>& data) { | |||
Typestring typestring_o(data); | |||
std::string typestring = typestring_o.str(); | |||
std::ofstream stream(filename, std::ofstream::binary); | |||
if (!stream) { | |||
fprintf(stderr, "io error: failed to open a file."); | |||
} | |||
std::vector<ndarray_len_t> shape_v(shape, shape + n_dims); | |||
write_header(stream, typestring, fortran_order, shape_v); | |||
auto size = static_cast<size_t>(comp_size(shape_v)); | |||
stream.write(reinterpret_cast<const char*>(data.data()), | |||
sizeof(Scalar) * size); | |||
} | |||
template <typename Scalar> | |||
inline void LoadArrayFromNumpy(const std::string& filename, | |||
std::vector<unsigned long>& shape, | |||
std::vector<Scalar>& data) { | |||
bool fortran_order; | |||
LoadArrayFromNumpy<Scalar>(filename, shape, fortran_order, data); | |||
} | |||
template <typename Scalar> | |||
inline void LoadArrayFromNumpy(const std::string& filename, | |||
std::vector<unsigned long>& shape, | |||
bool& fortran_order, std::vector<Scalar>& data) { | |||
std::ifstream stream(filename, std::ifstream::binary); | |||
if (!stream) { | |||
fprintf(stderr, "io error: failed to open a file."); | |||
} | |||
std::string header = read_header(stream); | |||
// parse header | |||
std::string typestr; | |||
parse_header(header, typestr, fortran_order, shape); | |||
// check if the typestring matches the given one | |||
Typestring typestring_o{data}; | |||
std::string expect_typestr = typestring_o.str(); | |||
if (typestr != expect_typestr) { | |||
fprintf(stderr, "formatting error: typestrings not matching"); | |||
} | |||
// compute the data size based on the shape | |||
auto size = static_cast<size_t>(comp_size(shape)); | |||
data.resize(size); | |||
// read the data | |||
stream.read(reinterpret_cast<char*>(data.data()), sizeof(Scalar) * size); | |||
} | |||
inline void LoadArrayFromNumpy(const std::string& filename, | |||
std::string& type_str, | |||
std::vector<ndarray_len_t>& shape, | |||
std::vector<int8_t>& data) { | |||
std::ifstream stream(filename, std::ifstream::binary); | |||
if (!stream) { | |||
fprintf(stderr, "io error: failed to open a file."); | |||
} | |||
std::string header = read_header(stream); | |||
bool fortran_order; | |||
// parse header | |||
parse_header(header, type_str, fortran_order, shape); | |||
// check if the typestring matches the given one | |||
std::string size_str = type_str.substr(type_str.size() - 1); | |||
size_t elem_size = atoi(size_str.c_str()); | |||
// compute the data size based on the shape | |||
auto byte_size = elem_size * static_cast<size_t>(comp_size(shape)); | |||
data.resize(byte_size); | |||
// read the data | |||
stream.read(reinterpret_cast<char*>(data.data()), byte_size); | |||
} | |||
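/* A minimal usage sketch of the templated API above (file name and data are
   illustrative only, not part of the test resources):

   std::vector<float> data{1.f, 2.f, 3.f, 4.f};
   unsigned long shape[] = {2, 2};
   npy::SaveArrayAsNumpy("example.npy", false, 2, shape, data);

   std::vector<unsigned long> loaded_shape;
   std::vector<float> loaded;
   npy::LoadArrayFromNumpy("example.npy", loaded_shape, loaded);
   // loaded_shape is {2, 2} and loaded holds the same four floats. */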
} // namespace npy | |||
#endif // NPY_H |
@@ -0,0 +1,184 @@ | |||
/** | |||
* \file test/test_common.h | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#pragma once | |||
#include "lite_build_config.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "../src/misc.h" | |||
#include "../src/mge/network_impl.h" | |||
#include "../src/mge/common.h" | |||
#include "lite/network.h" | |||
#include "lite/tensor.h" | |||
#include "megbrain/tensor.h" | |||
#include "megbrain/graph/bases.h" | |||
#include "megbrain/plugin/opr_io_dump.h" | |||
#include "megbrain/plugin/profiler.h" | |||
#include "megbrain/serialization/extern_c_opr.h" | |||
#include "megbrain/serialization/file.h" | |||
#include "megbrain/serialization/load_dump_config.h" | |||
#include "megbrain/serialization/serializer.h" | |||
#include "megbrain/utils/thin/hash_table.h" | |||
#include "npy.h" | |||
#include <gtest/gtest.h> | |||
#include <string.h> | |||
#include <chrono> | |||
#include <memory> | |||
#include <random> | |||
namespace lite { | |||
template <typename T> | |||
static ::testing::AssertionResult compare_memory(const void* memory0, | |||
const void* memory1, | |||
size_t length, | |||
float maxerr = 1e-3) { | |||
const T* data_ptr0 = static_cast<const T*>(memory0); | |||
const T* data_ptr1 = static_cast<const T*>(memory1); | |||
for (size_t i = 0; i < length; i++) { | |||
auto diff = std::abs(data_ptr0[i] - data_ptr1[i]); | |||
if (diff > maxerr) { | |||
return ::testing::AssertionFailure() | |||
<< "Unequal value:\n" | |||
<< "value 0 = " << data_ptr0[i] << "\n" | |||
<< "value 1 = " << data_ptr1[i] << "\n" | |||
<< "At index: " << i << "\n"; | |||
} | |||
} | |||
return ::testing::AssertionSuccess(); | |||
} | |||
template <typename T> | |||
void compare_lite_tensor(std::shared_ptr<Tensor> tensor0, | |||
std::shared_ptr<Tensor> tensor1, float maxerr = 1e-3) { | |||
size_t elemsize = tensor0->get_layout().get_elem_size(); | |||
T* data_ptr0 = static_cast<T*>(tensor0->get_memory_ptr()); | |||
T* data_ptr1 = static_cast<T*>(tensor1->get_memory_ptr()); | |||
size_t length = tensor0->get_tensor_total_size_in_byte() / elemsize; | |||
EXPECT_TRUE(compare_memory<T>(data_ptr0, data_ptr1, length, maxerr)); | |||
} | |||
__attribute__((unused)) static std::shared_ptr<Tensor> get_input_data( | |||
std::string path) { | |||
std::string type_str; | |||
std::vector<npy::ndarray_len_t> stl_shape; | |||
std::vector<int8_t> raw; | |||
npy::LoadArrayFromNumpy(path, type_str, stl_shape, raw); | |||
auto lite_tensor = std::make_shared<Tensor>(LiteDeviceType::LITE_CPU); | |||
Layout layout; | |||
layout.ndim = stl_shape.size(); | |||
const std::map<std::string, LiteDataType> type_map = { | |||
{"f4", LiteDataType::LITE_FLOAT}, | |||
{"i4", LiteDataType::LITE_INT}, | |||
{"i1", LiteDataType::LITE_INT8}, | |||
{"u1", LiteDataType::LITE_UINT8}}; | |||
layout.shapes[0] = 1; | |||
for (size_t i = 0; i < stl_shape.size(); i++) { | |||
layout.shapes[i] = static_cast<size_t>(stl_shape[i]); | |||
} | |||
for (auto& item : type_map) { | |||
if (type_str.find(item.first) != std::string::npos) { | |||
layout.data_type = item.second; | |||
break; | |||
} | |||
} | |||
lite_tensor->set_layout(layout); | |||
size_t length = lite_tensor->get_tensor_total_size_in_byte(); | |||
void* dest = lite_tensor->get_memory_ptr(); | |||
memcpy(dest, raw.data(), length); | |||
return lite_tensor; | |||
} | |||
__attribute__((unused)) static std::shared_ptr<Tensor> mgelite_lar( | |||
std::string model_path, const Config& config, std::string, | |||
std::shared_ptr<Tensor> input) { | |||
std::unique_ptr<Network> network = std::make_unique<Network>(config); | |||
network->load_model(model_path); | |||
std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0); | |||
auto src_ptr = input->get_memory_ptr(); | |||
auto src_layout = input->get_layout(); | |||
input_tensor->reset(src_ptr, src_layout); | |||
network->forward(); | |||
network->wait(); | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
Layout out_layout = output_tensor->get_layout(); | |||
auto ret = std::make_shared<Tensor>(LiteDeviceType::LITE_CPU, out_layout); | |||
void* out_data = output_tensor->get_memory_ptr(); | |||
void* dst_data = ret->get_memory_ptr(); | |||
memcpy(dst_data, out_data, ret->get_tensor_total_size_in_byte()); | |||
return ret; | |||
} | |||
__attribute__((unused)) static std::shared_ptr<Tensor> mgb_lar( | |||
std::string model_path, const Config& config, std::string input_name, | |||
std::shared_ptr<Tensor> input) { | |||
LITE_ASSERT(config.bare_model_cryption_name.size() == 0); | |||
using namespace mgb; | |||
serialization::GraphLoader::LoadConfig mgb_config; | |||
mgb_config.comp_node_mapper = [config](CompNode::Locator& loc) { | |||
loc = to_compnode_locator(config.device_type); | |||
}; | |||
mgb_config.comp_graph = ComputingGraph::make(); | |||
auto&& graph_opt = mgb_config.comp_graph->options(); | |||
if (config.options.weight_preprocess) { | |||
graph_opt.graph_opt.enable_weight_preprocess(); | |||
} | |||
graph_opt.comp_node_seq_record_level = | |||
config.options.comp_node_seq_record_level; | |||
auto inp_file = mgb::serialization::InputFile::make_fs(model_path.c_str()); | |||
auto format = | |||
serialization::GraphLoader::identify_graph_dump_format(*inp_file); | |||
mgb_assert(format.valid(), | |||
"invalid model: unknown model format, please make sure input " | |||
"file is generated by GraphDumper"); | |||
auto loader = | |||
serialization::GraphLoader::make(std::move(inp_file), format.val()); | |||
auto load_ret = loader->load(mgb_config, false); | |||
ComputingGraph::OutputSpec out_spec; | |||
std::vector<HostTensorND> output_tensors(load_ret.output_var_list.size()); | |||
for (size_t i = 0; i < load_ret.output_var_list.size(); i++) { | |||
auto cb = [&output_tensors, i](const DeviceTensorND& dv) mutable { | |||
output_tensors[i].copy_from(dv); | |||
}; | |||
out_spec.emplace_back(load_ret.output_var_list[i], std::move(cb)); | |||
} | |||
auto func = load_ret.graph_compile(out_spec); | |||
auto& in = load_ret.tensor_map.find(input_name)->second; | |||
in->copy_from(*TensorHelper::implement(input) | |||
->cast_final_safe<TensorImplDft>() | |||
.host_tensor()); | |||
func->execute(); | |||
func->wait(); | |||
std::shared_ptr<Tensor> ret = std::make_shared<Tensor>( | |||
LiteDeviceType::LITE_CPU, | |||
to_lite_layout(output_tensors[0].layout())); | |||
auto mge_tensor = TensorHelper::implement(ret) | |||
->cast_final_safe<TensorImplDft>() | |||
.host_tensor(); | |||
mge_tensor->copy_from(output_tensors[0]); | |||
return ret; | |||
} | |||
} // namespace lite | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,115 @@ | |||
/** | |||
* \file test/test_misc.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "lite_build_config.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "test_common.h" | |||
#include "../src/decryption/decrypt_base.h" | |||
#include "../src/network_impl_base.h" | |||
#include "megbrain/opr/io.h" | |||
#include "megbrain/tensor.h" | |||
#include "megbrain/utils/metahelper.h" | |||
#include <gtest/gtest.h> | |||
#include <string.h> | |||
#include <chrono> | |||
#include <memory> | |||
#include <random> | |||
using namespace lite; | |||
TEST(TestMisc, DecryptionRegister) { | |||
size_t number = decryption_static_data().decryption_methods.size(); | |||
//! At least one method is registered by lite
ASSERT_GE(number, 1); | |||
DecryptionFunc func; | |||
register_decryption_and_key("AllForTest0", func, {}); | |||
ASSERT_EQ(number + 1, decryption_static_data().decryption_methods.size()); | |||
} | |||
TEST(TestMisc, DecryptionUpdate) { | |||
DecryptionFunc func; | |||
register_decryption_and_key("AllForTest1", func, {}); | |||
func = [](const void*, size_t, | |||
const std::vector<uint8_t>&) -> std::vector<uint8_t> { | |||
return {}; | |||
}; | |||
update_decryption_or_key("AllForTest1", func, {}); | |||
ASSERT_NE(decryption_static_data().decryption_methods["AllForTest1"].first, | |||
nullptr); | |||
ASSERT_EQ(decryption_static_data() | |||
.decryption_methods["AllForTest1"] | |||
.second->size(), | |||
0); | |||
update_decryption_or_key("AllForTest1", {}, {1, 2, 3}); | |||
ASSERT_EQ(decryption_static_data() | |||
.decryption_methods["AllForTest1"] | |||
.second->size(), | |||
3); | |||
} | |||
TEST(TestMisc, SharedSameDeviceTensor) { | |||
using namespace mgb; | |||
serialization::GraphLoader::LoadConfig mgb_config; | |||
mgb_config.comp_node_mapper = [](CompNode::Locator& loc) { | |||
loc = to_compnode_locator(LiteDeviceType::LITE_CPU); | |||
}; | |||
mgb_config.comp_graph = ComputingGraph::make(); | |||
std::string model_path = "./shufflenet.mge"; | |||
auto inp_file = mgb::serialization::InputFile::make_fs(model_path.c_str()); | |||
auto format = | |||
serialization::GraphLoader::identify_graph_dump_format(*inp_file); | |||
mgb_assert(format.valid(), | |||
"invalid model: unknown model format, please make sure input " | |||
"file is generated by GraphDumper"); | |||
auto loader = | |||
serialization::GraphLoader::make(std::move(inp_file), format.val()); | |||
auto load_ret_1 = loader->load(mgb_config, true); | |||
auto load_ret_2 = loader->load(mgb_config, true); | |||
ASSERT_EQ(load_ret_1.output_var_list.size(), | |||
load_ret_2.output_var_list.size()); | |||
ComputingGraph::OutputSpec out_spec_1, out_spec_2; | |||
for (size_t i = 0; i < load_ret_1.output_var_list.size(); i++) { | |||
out_spec_1.emplace_back(load_ret_1.output_var_list[i], nullptr); | |||
out_spec_2.emplace_back(load_ret_2.output_var_list[i], nullptr); | |||
} | |||
auto func_1 = load_ret_1.graph_compile(out_spec_1); | |||
auto func_2 = load_ret_2.graph_compile(out_spec_2);
std::vector<cg::OperatorNodeBase*> oprs_1, oprs_2; | |||
func_1->iter_opr_seq([&oprs_1](cg::OperatorNodeBase* opr) -> bool { | |||
if (opr->try_cast_final<opr::ImmutableTensor>()) { | |||
oprs_1.push_back(opr); | |||
} | |||
return true; | |||
}); | |||
func_1->iter_opr_seq([&oprs_2](cg::OperatorNodeBase* opr) -> bool { | |||
if (opr->try_cast_final<opr::ImmutableTensor>()) { | |||
oprs_2.push_back(opr); | |||
} | |||
return true; | |||
}); | |||
ASSERT_EQ(oprs_1.size(), oprs_2.size()); | |||
for (size_t i = 0; i < oprs_1.size(); i++) { | |||
auto tensor_1 = | |||
oprs_1[i]->try_cast_final<opr::ImmutableTensor>()->value(); | |||
auto tensor_2 = | |||
oprs_2[i]->try_cast_final<opr::ImmutableTensor>()->value(); | |||
ASSERT_EQ(tensor_1.raw_ptr(), tensor_2.raw_ptr()); | |||
} | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,895 @@ | |||
/** | |||
* \file test/test_network_c.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "../src/misc.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "../src/common.h" | |||
#include "../src/mge/network_impl.h" | |||
#include "../lite-c/src/common.h" | |||
#include "lite-c/global_c.h" | |||
#include "lite-c/network_c.h" | |||
#include "lite-c/tensor_c.h" | |||
#include "./test_common.h" | |||
#include "megbrain/tensor.h" | |||
#include <string.h> | |||
#include <chrono> | |||
#include <memory> | |||
#include <random> | |||
#include <unordered_map> | |||
namespace { | |||
int affinity_set = false; | |||
int single_thread_affinity(int) { | |||
affinity_set = true; | |||
return 0; | |||
} | |||
std::atomic_size_t m_nr_left{0}; | |||
std::atomic_size_t m_nr_allocated{0}; | |||
void* allocate(LiteDeviceType device, int, size_t size, size_t align) { | |||
LITE_ASSERT(device == LiteDeviceType::LITE_CPU); | |||
m_nr_left++; | |||
m_nr_allocated++; | |||
#ifdef WIN32 | |||
return _aligned_malloc(size, align); | |||
#elif defined(__ANDROID__) || defined(ANDROID) | |||
return memalign(align, size); | |||
#else | |||
void* ptr = nullptr; | |||
auto err = posix_memalign(&ptr, align, size); | |||
mgb_assert(!err, "failed to malloc %zu bytes with align %zu", size, align); | |||
return ptr; | |||
#endif | |||
} | |||
void free(LiteDeviceType device, int, void* ptr) { | |||
m_nr_left--; | |||
LITE_ASSERT(device == LiteDeviceType::LITE_CPU); | |||
#ifdef WIN32 | |||
_aligned_free(ptr); | |||
#else | |||
::free(ptr); | |||
#endif | |||
}; | |||
#define NUMBER_THREDS (4) | |||
std::vector<std::thread::id> thread_ids(NUMBER_THREDS); | |||
int multi_thread_affinity(int id) { | |||
thread_ids[id] = std::this_thread::get_id(); | |||
return 0; | |||
}; | |||
volatile bool finished = false; | |||
int finish_callback() { | |||
finished = true; | |||
return 0; | |||
} | |||
volatile bool start_checked = false; | |||
int start_callback(const LiteIO* inputs, const LiteTensor* input_tensors, | |||
size_t size) { | |||
start_checked = true; | |||
auto check_func = [&]() { | |||
ASSERT_EQ(size, 1); | |||
ASSERT_EQ(std::string(inputs->name), "data"); | |||
LiteLayout layout; | |||
LITE_get_tensor_layout(*input_tensors, &layout); | |||
ASSERT_EQ(layout.ndim, 4); | |||
ASSERT_EQ(layout.shapes[1], 3); | |||
ASSERT_EQ(layout.shapes[2], 224); | |||
ASSERT_EQ(layout.shapes[3], 224); | |||
}; | |||
check_func(); | |||
return 0; | |||
} | |||
volatile bool finish_checked = false; | |||
int finish_callback(const LiteIO* outputs, const LiteTensor* output_tensors, | |||
size_t size) { | |||
finish_checked = true; | |||
auto check_func = [&]() { | |||
ASSERT_EQ(size, 1); | |||
ASSERT_EQ(std::string(outputs->name), | |||
"TRUE_DIV(EXP[12065],reduce0[12067])[12077]"); | |||
LiteLayout layout; | |||
LITE_get_tensor_layout(*output_tensors, &layout); | |||
ASSERT_EQ(layout.shapes[1], 1000); | |||
}; | |||
check_func(); | |||
return 0; | |||
} | |||
} // namespace | |||
#define LITE_CAPI_CHECK(_expr) \ | |||
do { \ | |||
int _ret = (_expr); \ | |||
if (_ret) { \ | |||
LITE_THROW(LITE_get_last_error()); \ | |||
} \ | |||
} while (0) | |||
#define ForwardMgb \ | |||
lite::Config config; \ | |||
auto lite_tensor = lite::get_input_data("./input_data.npy"); \ | |||
size_t data_length_in_byte = lite_tensor->get_tensor_total_size_in_byte(); \ | |||
std::string model_path = "./shufflenet.mge"; \ | |||
auto result_mgb = mgb_lar(model_path, config, "data", lite_tensor) | |||
#define MakeNetwork \ | |||
LiteNetwork c_network; \ | |||
LITE_CAPI_CHECK(LITE_make_network(&c_network, *default_config(), \ | |||
*default_network_io())) | |||
#define LoadNetwork \ | |||
LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, model_path.c_str())) | |||
#define SetInput \ | |||
LiteTensor c_input_tensor, c_output_tensor; \ | |||
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, "data", LITE_INPUT, \ | |||
&c_input_tensor)); \ | |||
LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, \ | |||
lite_tensor->get_memory_ptr(), \ | |||
data_length_in_byte)) | |||
#define ForwardNetwork \ | |||
LITE_CAPI_CHECK(LITE_forward(c_network)); \ | |||
LITE_CAPI_CHECK(LITE_wait(c_network)) | |||
#define GetOutput \ | |||
const char* output_name; \ | |||
LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); \ | |||
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_OUTPUT, \ | |||
&c_output_tensor)); \ | |||
void* output_ptr; \ | |||
LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor, &output_ptr)) | |||
#define CompareResult \ | |||
EXPECT_TRUE(lite::compare_memory<float>( \ | |||
output_ptr, result_mgb->get_memory_ptr(), \ | |||
result_mgb->get_tensor_total_size_in_byte() / sizeof(float))) | |||
TEST(TestCapiNetWork, BasicResetInput) { | |||
ForwardMgb; | |||
LiteNetwork c_network; | |||
LITE_CAPI_CHECK(LITE_make_default_network(&c_network)); | |||
LoadNetwork; | |||
SetInput; | |||
ForwardNetwork; | |||
GetOutput; | |||
CompareResult; | |||
LITE_destroy_network(c_network); | |||
} | |||
TEST(TestCapiNetWork, GetAllName) { | |||
std::string model_path = "./shufflenet.mge"; | |||
LiteNetwork c_network; | |||
LITE_CAPI_CHECK(LITE_make_default_network(&c_network)); | |||
LoadNetwork; | |||
size_t input_size, output_size; | |||
LITE_get_all_input_name(c_network, &input_size, nullptr); | |||
LITE_get_all_output_name(c_network, &output_size, nullptr); | |||
std::vector<const char*> input_names(input_size); | |||
LITE_get_all_input_name(c_network, nullptr, input_names.data()); | |||
ASSERT_EQ(input_names.size(), 1); | |||
ASSERT_TRUE(std::string(input_names[0]) == "data"); | |||
std::vector<const char*> output_names(output_size); | |||
LITE_get_all_output_name(c_network, nullptr, output_names.data()); | |||
ASSERT_TRUE(std::string(output_names[0]) == | |||
"TRUE_DIV(EXP[12065],reduce0[12067])[12077]"); | |||
ASSERT_EQ(output_names.size(), 1); | |||
LITE_destroy_network(c_network); | |||
} | |||
#if LITE_BUILD_WITH_RKNPU | |||
static int GetTop(float* pfProb, float* pfMaxProb, uint32_t* pMaxClass, | |||
uint32_t outputCount, uint32_t topNum) { | |||
uint32_t i, j; | |||
#define MAX_TOP_NUM 20 | |||
if (topNum > MAX_TOP_NUM) | |||
return 0; | |||
memset(pfMaxProb, 0, sizeof(float) * topNum); | |||
memset(pMaxClass, 0xff, sizeof(uint32_t) * topNum);
for (j = 0; j < topNum; j++) { | |||
for (i = 0; i < outputCount; i++) { | |||
if ((i == *(pMaxClass + 0)) || (i == *(pMaxClass + 1)) || | |||
(i == *(pMaxClass + 2)) || (i == *(pMaxClass + 3)) || | |||
(i == *(pMaxClass + 4))) { | |||
continue; | |||
} | |||
if (pfProb[i] > *(pfMaxProb + j)) { | |||
*(pfMaxProb + j) = pfProb[i]; | |||
*(pMaxClass + j) = i; | |||
} | |||
} | |||
} | |||
return 1; | |||
} | |||
TEST(TestCapiNetWork, rknntest_set_info) { | |||
#define SET_INFO_SIZE 2 | |||
#define TENSOR_TYPE_UINT8 3 | |||
#define TENSOR_FORMAT_NHWC 1 | |||
LiteConfig config; | |||
config.backend = LiteBackend::LITE_RK_NPU; | |||
config.device_type = LiteDeviceType::LITE_NPU; | |||
config.bare_model_cryption_name = nullptr; | |||
auto lite_tensor = lite::get_input_data("./model/cat_224x224.npy"); | |||
auto true_tensor = lite::get_input_data("./output_data.npy"); | |||
auto rknn_model = "./model/mobilenet_v1.rknn"; | |||
LiteNetwork c_network; | |||
LITE_CAPI_CHECK(LITE_make_network_config(&c_network, config)); | |||
LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, rknn_model)); | |||
size_t input_size, output_size; | |||
LITE_get_all_input_name(c_network, &input_size, nullptr); | |||
LITE_get_all_output_name(c_network, &output_size, nullptr); | |||
std::vector<const char*> input_names(input_size); | |||
std::vector<const char*> output_names(output_size); | |||
LiteTensor c_input_tensor, c_output_tensor; | |||
LITE_get_all_input_name(c_network, nullptr, input_names.data()); | |||
LITE_get_all_output_name(c_network, nullptr, output_names.data()); | |||
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, input_names[0], LITE_IO, | |||
&c_input_tensor)); | |||
size_t input_length = 0; | |||
LITE_get_tensor_total_size_in_byte(c_input_tensor, &input_length); | |||
size_t data_length_in_byte = lite_tensor->get_tensor_total_size_in_byte(); | |||
{ | |||
LiteLayout input_layout; | |||
LITE_get_tensor_layout(c_input_tensor, &input_layout); | |||
ASSERT_TRUE(input_layout.data_type == LITE_INT8); | |||
std::vector<int> input_shape={1,224,224,3}; | |||
for (size_t i = 0; i < input_layout.ndim; i++) { | |||
ASSERT_TRUE(input_layout.shapes[i] == static_cast<size_t>(input_shape[i]));
} | |||
} | |||
{ | |||
int size_attr = 0; | |||
LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_input_tensor, nullptr, nullptr, | |||
&size_attr)); | |||
ASSERT_TRUE(size_attr > 0); | |||
const char* keys[size_attr]; | |||
void* values[size_attr]; | |||
LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_input_tensor, keys, values, | |||
&size_attr)); | |||
ASSERT_TRUE(size_attr > 5); | |||
std::unordered_map<std::string, uint32_t> result_map = { | |||
{"zp", 0}, | |||
{"index", 0}, | |||
{"size_with_stride", 150528}, | |||
{"stride", 224}, | |||
{"n_size", 150528}, | |||
{"n_elems", 150528}, | |||
{"qnt_type", 2}, | |||
{"n_dims", 4}, | |||
{"type", 2}, | |||
{"fmt", 1}, | |||
{"dims0", 1}, | |||
{"dims1", 224}, | |||
{"dims2", 224}, | |||
{"dims3", 3}, | |||
}; | |||
for (int i = 0; i < size_attr; i++) { | |||
std::string key(keys[i]); | |||
if (key == "names") { | |||
ASSERT_TRUE(std::string("input") == | |||
std::string(static_cast<const char*>(values[i]))); | |||
} else if (key == "scale") { | |||
float scale = *static_cast<float*>(values[i]); | |||
ASSERT_TRUE(std::fabs(scale - 0.007812) < 0.00001); | |||
} else if (key == "fl" || key == "pass_through") { | |||
uint8_t val = *static_cast<uint8_t*>(values[i]); | |||
if (key == "fl") { | |||
ASSERT_TRUE(val == 0); | |||
} else { | |||
ASSERT_TRUE(val == 1); | |||
} | |||
} else { | |||
uint32_t val = *static_cast<uint32_t*>(values[i]); | |||
ASSERT_TRUE(result_map[std::string(keys[i])]==val); | |||
} | |||
} | |||
} | |||
const char* keys[] = {"type", "fmt"}; | |||
int info_size = SET_INFO_SIZE; | |||
int type = TENSOR_TYPE_UINT8; | |||
int fmt = TENSOR_FORMAT_NHWC; | |||
void* values[] = {static_cast<void*>(&type), static_cast<void*>(&fmt)}; | |||
LITE_CAPI_CHECK(LITE_set_tensor_information(c_input_tensor, keys, values, | |||
info_size)); | |||
ASSERT_TRUE(std::string(output_names[0]) == | |||
std::string("MobilenetV1/Predictions/Reshape_1")); | |||
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO, | |||
&c_output_tensor)); | |||
LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, | |||
lite_tensor->get_memory_ptr(), | |||
data_length_in_byte)); | |||
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO, | |||
&c_output_tensor)); | |||
//LiteLayout tmp_output_layout; | |||
//LITE_get_tensor_layout(c_output_tensor, &tmp_output_layout); | |||
//tmp_output_layout.data_type = LiteDataType::LITE_FLOAT; | |||
//LITE_set_tensor_layout(c_output_tensor, tmp_output_layout); | |||
{ | |||
const char* keys[] = {"want_float"}; | |||
uint8_t want_float = 1; | |||
void* values[] = {static_cast<void*>(&want_float)}; | |||
LITE_CAPI_CHECK( | |||
LITE_set_tensor_information(c_output_tensor, keys, values, 1)); | |||
} | |||
LITE_CAPI_CHECK(LITE_forward(c_network)); | |||
LITE_CAPI_CHECK(LITE_wait(c_network)); | |||
ASSERT_TRUE(std::string(output_names[0]) == "MobilenetV1/Predictions/Reshape_1"); | |||
ASSERT_EQ(output_names.size(), 1); | |||
{ | |||
LiteLayout output_layout; | |||
LITE_get_tensor_layout(c_output_tensor, &output_layout); | |||
ASSERT_TRUE(output_layout.data_type == LITE_FLOAT); | |||
int size_attr = 0; | |||
LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_output_tensor, nullptr, nullptr, | |||
&size_attr)); | |||
ASSERT_TRUE(size_attr > 0); | |||
const char* keys[size_attr]; | |||
void* values[size_attr]; | |||
LITE_CAPI_CHECK(LITE_get_tensor_attribute(c_output_tensor, keys, values, | |||
&size_attr)); | |||
ASSERT_TRUE(size_attr > 5); | |||
std::unordered_map<std::string, uint32_t> result_map = { | |||
{"zp", 0}, | |||
{"index", 0}, | |||
{"size_with_stride", 2002}, | |||
{"stride", 0}, | |||
{"n_size", 2002}, | |||
{"n_elems", 1001}, | |||
{"qnt_type", 2}, | |||
{"n_dims", 2}, | |||
{"type", 0}, | |||
{"fmt", 2}, | |||
{"dims0", 1}, | |||
{"dims1", 1001}, | |||
}; | |||
for (int i = 0; i < size_attr; i++) { | |||
std::string key(keys[i]); | |||
if (key == "names") { | |||
ASSERT_TRUE("MobilenetV1/Predictions/Reshape_1" == | |||
std::string(static_cast<const char*>(values[i]))); | |||
} else if (key == "scale") { | |||
float scale = *static_cast<float*>(values[i]); | |||
ASSERT_TRUE(std::fabs(scale - 1.0) < 0.00001); | |||
} else if (key == "fl" || key == "pass_through") { | |||
uint8_t val = *static_cast<uint8_t*>(values[i]); | |||
ASSERT_TRUE(val == 0); | |||
} else { | |||
uint32_t val = *static_cast<uint32_t*>(values[i]); | |||
ASSERT_TRUE(result_map[std::string(keys[i])]==val); | |||
} | |||
} | |||
} | |||
{ | |||
uint32_t MaxClass[5]; | |||
float fMaxProb[5]; | |||
void* output_ptr; | |||
LITE_get_tensor_memory(c_output_tensor, &output_ptr); | |||
float* buffer = (float*)output_ptr; | |||
uint32_t sz = true_tensor->get_tensor_total_size_in_byte() / sizeof(float); | |||
GetTop(buffer, fMaxProb, MaxClass, sz, 5); | |||
std::vector<uint32_t> result_class = { | |||
286, 464, 282, 357, 285, | |||
}; | |||
std::vector<float> result_prob = { | |||
0.407227, 0.365723, 0.090454, 0.018051, 0.013069, | |||
}; | |||
for (int i = 0; i < 5; i++) { | |||
ASSERT_TRUE(result_class[i] == MaxClass[i]); | |||
ASSERT_TRUE(std::fabs(result_prob[i] - fMaxProb[i]) < 0.0001); | |||
} | |||
} | |||
{ | |||
float* true_data = static_cast<float*>(true_tensor->get_memory_ptr()); | |||
void* output_ptr; | |||
LITE_get_tensor_memory(c_output_tensor, &output_ptr); | |||
float* data1 = static_cast<float*>(output_ptr); | |||
size_t length = | |||
true_tensor->get_tensor_total_size_in_byte() / sizeof(float); | |||
for (size_t i = 0; i < length; i++) { | |||
ASSERT_LT(std::abs(data1[i] - true_data[i]), 1e-3); | |||
} | |||
} | |||
LITE_destroy_network(c_network); | |||
#undef SET_INFO_SIZE | |||
#undef TENSOR_FORMAT_NHWC | |||
#undef TENSOR_TYPE_UINT8 | |||
} | |||
TEST(TestCapiNetWork, rknntest_set_info_two_input) { | |||
#define SET_INFO_SIZE 2 | |||
#define TENSOR_TYPE_UINT8 3 | |||
#define TENSOR_FORMAT_NHWC 1 | |||
LiteConfig config; | |||
config.backend = LiteBackend::LITE_RK_NPU; | |||
config.device_type = LiteDeviceType::LITE_NPU; | |||
config.bare_model_cryption_name = nullptr; | |||
auto lite_tensor = lite::get_input_data("./model/cat_224x224.npy"); | |||
auto lite_tensor_dog = lite::get_input_data("./model/dog_224x224.npy"); | |||
auto true_tensor = lite::get_input_data("./output_data.npy"); | |||
auto rknn_model = "./model/mobilenet_v1.rknn"; | |||
LiteNetwork c_network; | |||
LITE_CAPI_CHECK(LITE_make_network_config(&c_network, config)); | |||
LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, rknn_model)); | |||
size_t input_size, output_size; | |||
LITE_get_all_input_name(c_network, &input_size, nullptr); | |||
LITE_get_all_output_name(c_network, &output_size, nullptr); | |||
std::vector<const char*> input_names(input_size); | |||
std::vector<const char*> output_names(output_size); | |||
LiteTensor c_input_tensor, c_output_tensor; | |||
LITE_get_all_input_name(c_network, nullptr, input_names.data()); | |||
LITE_get_all_output_name(c_network, nullptr, output_names.data()); | |||
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, input_names[0], LITE_IO, | |||
&c_input_tensor)); | |||
size_t input_length = 0; | |||
LITE_get_tensor_total_size_in_byte(c_input_tensor, &input_length); | |||
size_t data_length_in_byte = lite_tensor->get_tensor_total_size_in_byte(); | |||
{ | |||
LiteLayout input_layout; | |||
LITE_get_tensor_layout(c_input_tensor, &input_layout); | |||
ASSERT_TRUE(input_layout.data_type == LITE_INT8); | |||
std::vector<int> input_shape = {1, 224, 224, 3}; | |||
for (size_t i = 0; i < input_layout.ndim; i++) { | |||
ASSERT_TRUE(input_layout.shapes[i] == static_cast<size_t>(input_shape[i]));
} | |||
} | |||
const char* keys[] = {"type", "fmt"}; | |||
int info_size = SET_INFO_SIZE; | |||
int type = TENSOR_TYPE_UINT8; | |||
int fmt = TENSOR_FORMAT_NHWC; | |||
void* values[] = {static_cast<void*>(&type), static_cast<void*>(&fmt)}; | |||
LITE_CAPI_CHECK(LITE_set_tensor_information(c_input_tensor, keys, values, | |||
info_size)); | |||
ASSERT_TRUE(std::string(output_names[0]) == | |||
std::string("MobilenetV1/Predictions/Reshape_1")); | |||
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO, | |||
&c_output_tensor)); | |||
LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, | |||
lite_tensor->get_memory_ptr(), | |||
data_length_in_byte)); | |||
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_names[0], LITE_IO, | |||
&c_output_tensor)); | |||
{ | |||
const char* keys[] = {"want_float"}; | |||
uint8_t want_float = 1; | |||
void* values[] = {static_cast<void*>(&want_float)}; | |||
LITE_CAPI_CHECK( | |||
LITE_set_tensor_information(c_output_tensor, keys, values, 1)); | |||
} | |||
LITE_CAPI_CHECK(LITE_forward(c_network)); | |||
LITE_CAPI_CHECK(LITE_wait(c_network)); | |||
ASSERT_TRUE(std::string(output_names[0]) == | |||
"MobilenetV1/Predictions/Reshape_1"); | |||
ASSERT_EQ(output_names.size(), 1); | |||
{ | |||
uint32_t MaxClass[5]; | |||
float fMaxProb[5]; | |||
void* output_ptr; | |||
LITE_get_tensor_memory(c_output_tensor, &output_ptr); | |||
float* buffer = (float*)output_ptr; | |||
uint32_t sz = | |||
true_tensor->get_tensor_total_size_in_byte() / sizeof(float); | |||
GetTop(buffer, fMaxProb, MaxClass, sz, 5); | |||
std::vector<uint32_t> result_class = { | |||
286, 464, 282, 357, 285, | |||
}; | |||
std::vector<float> result_prob = { | |||
0.407227, 0.365723, 0.090454, 0.018051, 0.013069, | |||
}; | |||
for (int i = 0; i < 5; i++) { | |||
ASSERT_TRUE(result_class[i] == MaxClass[i]); | |||
ASSERT_TRUE(std::fabs(result_prob[i] - fMaxProb[i]) < 0.0001); | |||
} | |||
} | |||
{ | |||
float* true_data = static_cast<float*>(true_tensor->get_memory_ptr()); | |||
void* output_ptr; | |||
LITE_get_tensor_memory(c_output_tensor, &output_ptr); | |||
float* data1 = static_cast<float*>(output_ptr); | |||
size_t length = | |||
true_tensor->get_tensor_total_size_in_byte() / sizeof(float); | |||
for (size_t i = 0; i < length; i++) { | |||
ASSERT_LT(std::abs(data1[i] - true_data[i]), 1e-3); | |||
} | |||
} | |||
LITE_CAPI_CHECK(LITE_reset_tensor_memory(c_input_tensor, | |||
lite_tensor_dog->get_memory_ptr(), | |||
data_length_in_byte)); | |||
LITE_CAPI_CHECK(LITE_forward(c_network)); | |||
LITE_CAPI_CHECK(LITE_wait(c_network)); | |||
ASSERT_TRUE(std::string(output_names[0]) == | |||
"MobilenetV1/Predictions/Reshape_1"); | |||
ASSERT_EQ(output_names.size(), 1); | |||
{ | |||
uint32_t MaxClass[5]; | |||
float fMaxProb[5]; | |||
void* output_ptr; | |||
LITE_get_tensor_memory(c_output_tensor, &output_ptr); | |||
float* buffer = (float*)output_ptr; | |||
uint32_t sz = | |||
true_tensor->get_tensor_total_size_in_byte() / sizeof(float); | |||
GetTop(buffer, fMaxProb, MaxClass, sz, 5); | |||
std::vector<float> result_prob = { | |||
0.407227, 0.365723, 0.090454, 0.018051, 0.013069, | |||
}; | |||
for (int i = 0; i < 5; i++) { | |||
ASSERT_FALSE(std::fabs(result_prob[i] - fMaxProb[i]) < 0.0001); | |||
} | |||
} | |||
LITE_destroy_network(c_network); | |||
#undef SET_INFO_SIZE | |||
#undef TENSOR_FORMAT_NHWC | |||
#undef TENSOR_TYPE_UINT8 | |||
} | |||
#endif | |||
TEST(TestCapiNetWork, BasicResetOutput) { | |||
ForwardMgb; | |||
LiteNetwork c_network; | |||
LITE_CAPI_CHECK(LITE_make_default_network(&c_network)); | |||
LoadNetwork; | |||
SetInput; | |||
LiteLayout output_layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}; | |||
std::shared_ptr<float> ptr(new float[1000], | |||
[](float* ptr) { delete[] ptr; }); | |||
const char* output_name; | |||
LITE_CAPI_CHECK(LITE_get_output_name(c_network, 0, &output_name)); | |||
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network, output_name, LITE_IO, | |||
&c_output_tensor)); | |||
LITE_CAPI_CHECK( | |||
LITE_reset_tensor(c_output_tensor, output_layout, ptr.get())); | |||
ForwardNetwork; | |||
EXPECT_TRUE(lite::compare_memory<float>( | |||
ptr.get(), result_mgb->get_memory_ptr(), | |||
result_mgb->get_tensor_total_size_in_byte() / sizeof(float))); | |||
LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
} | |||
TEST(TestCapiNetWork, BasicInplaceAndSingleThreadAffinity) { | |||
ForwardMgb; | |||
MakeNetwork; | |||
//! configure the network with CPU inplace mode
LITE_CAPI_CHECK(LITE_set_cpu_inplace_mode(c_network)); | |||
LoadNetwork; | |||
//! set the single-thread affinity callback
LITE_CAPI_CHECK(LITE_set_runtime_thread_affinity(c_network, | |||
single_thread_affinity)); | |||
SetInput; | |||
ForwardNetwork; | |||
ASSERT_EQ(affinity_set, true); | |||
affinity_set = false; | |||
GetOutput; | |||
CompareResult; | |||
LITE_destroy_network(c_network); | |||
} | |||
TEST(TestCapiNetWork, UserAllocator) { | |||
ForwardMgb; | |||
MakeNetwork; | |||
LITE_CAPI_CHECK(LITE_set_memory_allocator(c_network, allocate, free)); | |||
LoadNetwork; | |||
SetInput; | |||
ForwardNetwork; | |||
ASSERT_GE(m_nr_allocated, 1); | |||
GetOutput; | |||
CompareResult; | |||
LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
ASSERT_EQ(m_nr_left, 0); | |||
} | |||
TEST(TestCapiNetWork, BasicMultiThread) { | |||
ForwardMgb; | |||
MakeNetwork; | |||
LITE_CAPI_CHECK(LITE_set_cpu_threads_number(c_network, NUMBER_THREDS)); | |||
LoadNetwork; | |||
LITE_CAPI_CHECK( | |||
LITE_set_runtime_thread_affinity(c_network, multi_thread_affinity)); | |||
SetInput; | |||
ForwardNetwork; | |||
for (size_t i = 0; i < NUMBER_THREDS; i++) { | |||
for (size_t j = i + 1; j < NUMBER_THREDS; j++) { | |||
ASSERT_NE(thread_ids[i], thread_ids[j]); | |||
} | |||
} | |||
for (size_t i = 0; i < NUMBER_THREDS; i++) { | |||
thread_ids[i] = std::thread::id(); | |||
} | |||
GetOutput; | |||
CompareResult; | |||
LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
} | |||
TEST(TestCapiNetWork, DeviceIO) { | |||
ForwardMgb; | |||
LiteNetwork c_network; | |||
LiteIO input_io = default_io; | |||
input_io.is_host = true; | |||
input_io.name = "data"; | |||
LiteNetworkIO network_io = *default_network_io(); | |||
network_io.inputs = &input_io; | |||
network_io.input_size = 1; | |||
LITE_CAPI_CHECK(LITE_make_network(&c_network, *default_config(), network_io)); | |||
LoadNetwork; | |||
SetInput; | |||
ForwardNetwork; | |||
GetOutput; | |||
CompareResult; | |||
LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
} | |||
TEST(TestCapiNetWork, StartCallBack) { | |||
ForwardMgb; | |||
MakeNetwork; | |||
LoadNetwork; | |||
LITE_CAPI_CHECK(LITE_set_start_callback(c_network, start_callback)); | |||
SetInput; | |||
ForwardNetwork; | |||
GetOutput; | |||
CompareResult; | |||
ASSERT_TRUE(start_checked); | |||
LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
} | |||
TEST(TestCapiNetWork, FinishCallBack) { | |||
ForwardMgb; | |||
MakeNetwork; | |||
LoadNetwork; | |||
LITE_CAPI_CHECK(LITE_set_finish_callback(c_network, finish_callback)); | |||
SetInput; | |||
ForwardNetwork; | |||
GetOutput; | |||
CompareResult; | |||
ASSERT_TRUE(finish_checked); | |||
LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
} | |||
TEST(TestCapiNetWork, BasicCryptAes) { | |||
ForwardMgb; | |||
LiteConfig c_config = *default_config(); | |||
c_config.bare_model_cryption_name = "AES_default"; | |||
LiteNetwork c_network; | |||
LITE_CAPI_CHECK( | |||
LITE_make_network(&c_network, c_config, *default_network_io())); | |||
std::string model_crypt_path = "./shufflenet_crypt_aes.mge"; | |||
LITE_CAPI_CHECK( | |||
LITE_load_model_from_path(c_network, model_crypt_path.c_str())); | |||
SetInput; | |||
ForwardNetwork; | |||
GetOutput; | |||
CompareResult; | |||
LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
} | |||
TEST(TestCapiNetWork, PackedCryptRc4) { | |||
ForwardMgb; | |||
MakeNetwork; | |||
std::string model_crypt_path = "./test_packed_model_rc4.lite"; | |||
LITE_CAPI_CHECK( | |||
LITE_load_model_from_path(c_network, model_crypt_path.c_str())); | |||
SetInput; | |||
ForwardNetwork; | |||
GetOutput; | |||
CompareResult; | |||
LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
} | |||
TEST(TestCapiNetWork, AsyncExec) { | |||
finished = false; | |||
ForwardMgb; | |||
LiteNetwork c_network; | |||
LiteConfig c_config = *default_config(); | |||
c_config.options.var_sanity_check_first_run = false; | |||
LITE_CAPI_CHECK( | |||
LITE_make_network(&c_network, c_config, *default_network_io())); | |||
LITE_CAPI_CHECK(LITE_set_async_callback(c_network, finish_callback)); | |||
LoadNetwork; | |||
SetInput; | |||
LITE_forward(c_network); | |||
size_t count = 0; | |||
while (finished == false) { | |||
count++; | |||
} | |||
ASSERT_GT(count, 0); | |||
finished = false; | |||
GetOutput; | |||
CompareResult; | |||
LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
} | |||
TEST(TestCapiNetWork, OutputShapeOnly) { | |||
ForwardMgb; | |||
LiteNetwork c_network; | |||
LiteNetworkIO c_network_io = *default_network_io(); | |||
LiteIO io_output = default_io; | |||
io_output.io_type = LiteIOType::LITE_IO_SHAPE; | |||
io_output.name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]"; | |||
c_network_io.outputs = &io_output; | |||
c_network_io.output_size = 1; | |||
LITE_CAPI_CHECK( | |||
LITE_make_network(&c_network, *default_config(), c_network_io)); | |||
LoadNetwork; | |||
SetInput; | |||
ForwardNetwork; | |||
GetOutput; | |||
size_t length = 0; | |||
LITE_CAPI_CHECK( | |||
LITE_get_tensor_total_size_in_byte(c_output_tensor, &length)); | |||
ASSERT_EQ(length / sizeof(float), 1000); | |||
LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
} | |||
TEST(TestCapiNetWork, ProfileIOdump) { | |||
ForwardMgb; | |||
MakeNetwork; | |||
LITE_CAPI_CHECK( | |||
LITE_enable_profile_performance(c_network, "./profile.json")); | |||
LoadNetwork; | |||
SetInput; | |||
ForwardNetwork; | |||
ASSERT_TRUE(fopen("./profile.json", "r")); | |||
LITE_CAPI_CHECK(LITE_enable_io_txt_dump(c_network, "./io_txt_dump.txt")); | |||
ForwardNetwork; | |||
ASSERT_TRUE(fopen("./io_txt_dump.txt", "r")); | |||
GetOutput; | |||
CompareResult; | |||
LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
} | |||
TEST(TestCapiNetWork, GetDeviceType) { | |||
lite::Config config; | |||
auto lite_tensor = lite::get_input_data("./input_data.npy"); | |||
std::string model_path = "./shufflenet.mge"; | |||
MakeNetwork; | |||
LoadNetwork; | |||
LiteDeviceType devicetype; | |||
LITE_CAPI_CHECK(LITE_get_device_type(c_network, &devicetype)); | |||
ASSERT_TRUE(devicetype == LiteDeviceType::LITE_CPU); | |||
LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
} | |||
TEST(TestCapiNetWork, GetModelExtraInfo) { | |||
lite::Config config; | |||
std::string model_path = "./track_640_320_pack_model_rc4_with_info.lite"; | |||
MakeNetwork; | |||
LITE_load_model_from_path(c_network, model_path.c_str()); | |||
const char* info = nullptr; | |||
int info_size = 0; | |||
LITE_CAPI_CHECK(LITE_get_model_extra_info(c_network, &info, &info_size)); | |||
ASSERT_TRUE(info_size > 0); | |||
printf("info %s \n", info); | |||
LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
} | |||
TEST(TestCapiNetWork, TestWorkSpaceLimit) { | |||
lite::Config config; | |||
auto lite_tensor = lite::get_input_data("./input_data.npy"); | |||
size_t data_length_in_byte = lite_tensor->get_tensor_total_size_in_byte(); | |||
std::string model_path = "./shufflenet.mge"; | |||
MakeNetwork; | |||
LoadNetwork; | |||
printf("go to config workspace limit\n"); | |||
LITE_CAPI_CHECK(LITE_set_network_algo_workspace_limit(c_network, 1000)); | |||
SetInput; | |||
ForwardNetwork; | |||
GetOutput; | |||
LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
} | |||
TEST(TestCapiNetWork, TestShareWeights) { | |||
ForwardMgb; | |||
MakeNetwork; | |||
LoadNetwork; | |||
SetInput; | |||
ForwardNetwork; | |||
GetOutput; | |||
CompareResult; | |||
LiteNetwork c_network2; | |||
LITE_CAPI_CHECK( | |||
LITE_make_network(&c_network2, *default_config(), *default_network_io())); | |||
LITE_CAPI_CHECK(LITE_set_cpu_inplace_mode(c_network2)); | |||
LITE_CAPI_CHECK(LITE_shared_weight_with_network(c_network2, c_network)); | |||
int is_cpu_inplace_mode = false; | |||
LITE_CAPI_CHECK(LITE_is_cpu_inplace_mode(c_network2, &is_cpu_inplace_mode)); | |||
ASSERT_EQ(is_cpu_inplace_mode, true); | |||
LiteTensor c_input_tensor2, c_output_tensor2; | |||
LITE_CAPI_CHECK( | |||
LITE_get_io_tensor(c_network2, "data", LITE_IO, &c_input_tensor2)); | |||
LITE_CAPI_CHECK(LITE_reset_tensor_memory( | |||
c_input_tensor2, lite_tensor->get_memory_ptr(), | |||
lite_tensor->get_tensor_total_size_in_byte())); | |||
LITE_CAPI_CHECK(LITE_forward(c_network2)); | |||
LITE_CAPI_CHECK(LITE_wait(c_network2)); | |||
LITE_CAPI_CHECK(LITE_get_io_tensor(c_network2, output_name, LITE_IO, | |||
&c_output_tensor2)); | |||
void* output_ptr2; | |||
LITE_CAPI_CHECK(LITE_get_tensor_memory(c_output_tensor2, &output_ptr2)); | |||
EXPECT_TRUE(lite::compare_memory<float>( | |||
output_ptr2, result_mgb->get_memory_ptr(), | |||
result_mgb->get_tensor_total_size_in_byte() / sizeof(float))); | |||
LITE_CAPI_CHECK(LITE_destroy_network(c_network)); | |||
LITE_CAPI_CHECK(LITE_destroy_network(c_network2)); | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,351 @@ | |||
/** | |||
* \file test/test_network_options.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "lite_build_config.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "../src/common.h" | |||
#include "../src/misc.h" | |||
#include "../src/mge/network_impl.h" | |||
#include "lite/global.h" | |||
#include "megbrain/tensor.h" | |||
#include "test_common.h" | |||
#include <string.h> | |||
#include <chrono> | |||
#include <memory> | |||
#include <random> | |||
using namespace lite; | |||
TEST(TestNetWorkOptions, no_var_sanity_check_and_record) { | |||
Config config; | |||
auto tensor = get_input_data("./input_data.npy"); | |||
std::string model_path = "./shufflenet.mge"; | |||
std::string input_name = "data"; | |||
auto result_mgb = mgb_lar(model_path, config, input_name, tensor); | |||
config.options.var_sanity_check_first_run = false; | |||
config.options.comp_node_seq_record_level = 1; | |||
std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
network->load_model(model_path); | |||
std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name); | |||
auto src_ptr = tensor->get_memory_ptr(); | |||
auto src_layout = tensor->get_layout(); | |||
input_tensor->reset(src_ptr, src_layout); | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
auto result_tensor = std::make_shared<Tensor>( | |||
LiteDeviceType::LITE_CPU, | |||
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
void* out_data = result_tensor->get_memory_ptr(); | |||
output_tensor->reset(out_data, result_tensor->get_layout()); | |||
network->forward(); | |||
network->wait(); | |||
compare_lite_tensor<float>(output_tensor, result_mgb); | |||
} | |||
TEST(TestNetWorkOptions, const_shape) { | |||
Config config; | |||
auto tensor = get_input_data("./input_data.npy"); | |||
std::string model_path = "./shufflenet.mge"; | |||
std::string input_name = "data"; | |||
auto result_mgb = mgb_lar(model_path, config, input_name, tensor); | |||
config.options.var_sanity_check_first_run = false; | |||
config.options.const_shape = true; | |||
std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
network->load_model(model_path); | |||
std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name); | |||
auto src_ptr = tensor->get_memory_ptr(); | |||
auto src_layout = tensor->get_layout(); | |||
input_tensor->reset(src_ptr, src_layout); | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
auto result_tensor = std::make_shared<Tensor>( | |||
LiteDeviceType::LITE_CPU, | |||
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
void* out_data = result_tensor->get_memory_ptr(); | |||
output_tensor->reset(out_data, result_tensor->get_layout()); | |||
network->forward(); | |||
network->wait(); | |||
compare_lite_tensor<float>(output_tensor, result_mgb); | |||
} | |||
TEST(TestNetWorkOptions, NCHW44) { | |||
Config config; | |||
auto tensor = get_input_data("./input_data.npy"); | |||
std::string model_path = "./shufflenet.mge"; | |||
std::string input_name = "data"; | |||
auto result_mgb = mgb_lar(model_path, config, input_name, tensor); | |||
config.options.var_sanity_check_first_run = false; | |||
config.options.enable_nchw44 = true; | |||
std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
Runtime::set_network_algo_policy( | |||
network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE | | |||
LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE); | |||
network->load_model(model_path); | |||
std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name); | |||
auto src_ptr = tensor->get_memory_ptr(); | |||
auto src_layout = tensor->get_layout(); | |||
input_tensor->reset(src_ptr, src_layout); | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
auto result_tensor = std::make_shared<Tensor>( | |||
LiteDeviceType::LITE_CPU, | |||
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
void* out_data = result_tensor->get_memory_ptr(); | |||
output_tensor->reset(out_data, result_tensor->get_layout()); | |||
network->forward(); | |||
network->wait(); | |||
compare_lite_tensor<float>(output_tensor, result_mgb); | |||
} | |||
TEST(TestNetWorkOptions, test_cache) { | |||
Config config; | |||
auto tensor = get_input_data("./input_data.npy"); | |||
std::string model_path = "./shufflenet.mge"; | |||
std::string input_name = "data"; | |||
auto result_mgb = mgb_lar(model_path, config, input_name, tensor); | |||
std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
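//! enable a file-backed persistent cache: the profiled algo policies are
//! dumped to the file below and then reloaded for the second forward pass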
set_persistent_cache("./algo_cache.txt", true); | |||
network->load_model(model_path); | |||
Runtime::set_network_algo_policy( | |||
network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE | | |||
LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE); | |||
std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name); | |||
auto src_ptr = tensor->get_memory_ptr(); | |||
auto src_layout = tensor->get_layout(); | |||
input_tensor->reset(src_ptr, src_layout); | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
auto result_tensor = std::make_shared<Tensor>( | |||
LiteDeviceType::LITE_CPU, | |||
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
void* out_data = result_tensor->get_memory_ptr(); | |||
output_tensor->reset(out_data, result_tensor->get_layout()); | |||
network->forward(); | |||
network->wait(); | |||
compare_lite_tensor<float>(output_tensor, result_mgb); | |||
dump_persistent_cache("./algo_cache.txt"); | |||
ASSERT_TRUE(fopen("./algo_cache.txt", "r")); | |||
set_persistent_cache("./algo_cache.txt"); | |||
network->forward(); | |||
network->wait(); | |||
compare_lite_tensor<float>(output_tensor, result_mgb); | |||
} | |||
TEST(TestNetWorkOptions, FastRunIgnorBatch) { | |||
Config config; | |||
auto tensor = get_input_data("./input_data.npy"); | |||
std::string model_path = "./shufflenet.mge"; | |||
std::string input_name = "data"; | |||
auto result_mgb = mgb_lar(model_path, config, input_name, tensor); | |||
std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
set_persistent_cache("./algo_cache.txt"); | |||
network->load_model(model_path); | |||
Runtime::set_network_algo_policy( | |||
network, | |||
LiteAlgoSelectStrategy::LITE_ALGO_PROFILE | | |||
LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE, | |||
1, true); | |||
std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name); | |||
auto src_ptr = tensor->get_memory_ptr(); | |||
auto src_layout = tensor->get_layout(); | |||
input_tensor->reset(src_ptr, src_layout); | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
auto result_tensor = std::make_shared<Tensor>( | |||
LiteDeviceType::LITE_CPU, | |||
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
void* out_data = result_tensor->get_memory_ptr(); | |||
output_tensor->reset(out_data, result_tensor->get_layout()); | |||
network->forward(); | |||
network->wait(); | |||
compare_lite_tensor<float>(output_tensor, result_mgb); | |||
dump_persistent_cache("./algo_cache.txt"); | |||
ASSERT_TRUE(fopen("./algo_cache.txt", "r")); | |||
} | |||
#if LITE_WITH_CUDA | |||
TEST(TestNetWorkOptions, NCHW4) { | |||
Config config; | |||
config.device_type = LiteDeviceType::LITE_CUDA; | |||
auto tensor = get_input_data("./input_data.npy"); | |||
std::string model_path = "./shufflenet.mge"; | |||
std::string input_name = "data"; | |||
auto result_mgb = mgb_lar(model_path, config, input_name, tensor); | |||
config.options.enable_nchw4 = 1; | |||
std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
network->load_model(model_path); | |||
std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name); | |||
auto src_ptr = tensor->get_memory_ptr(); | |||
auto src_layout = tensor->get_layout(); | |||
input_tensor->reset(src_ptr, src_layout); | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
auto result_tensor = std::make_shared<Tensor>( | |||
LiteDeviceType::LITE_CPU, | |||
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
void* out_data = result_tensor->get_memory_ptr(); | |||
output_tensor->reset(out_data, result_tensor->get_layout()); | |||
network->forward(); | |||
network->wait(); | |||
compare_lite_tensor<float>(output_tensor, result_mgb); | |||
} | |||
TEST(TestNetWorkOptions, NCHW32) { | |||
Config config; | |||
config.device_type = LiteDeviceType::LITE_CUDA; | |||
auto tensor = get_input_data("./input_data.npy"); | |||
std::string model_path = "./shufflenet.mge"; | |||
std::string input_name = "data"; | |||
auto result_mgb = mgb_lar(model_path, config, input_name, tensor); | |||
config.options.enable_nchw32 = 1; | |||
std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
Runtime::set_network_algo_policy( | |||
network, LiteAlgoSelectStrategy::LITE_ALGO_PROFILE | | |||
LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE); | |||
network->load_model(model_path); | |||
std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name); | |||
auto src_ptr = tensor->get_memory_ptr(); | |||
auto src_layout = tensor->get_layout(); | |||
input_tensor->reset(src_ptr, src_layout); | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
auto result_tensor = std::make_shared<Tensor>( | |||
LiteDeviceType::LITE_CPU, | |||
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
void* out_data = result_tensor->get_memory_ptr(); | |||
output_tensor->reset(out_data, result_tensor->get_layout()); | |||
network->forward(); | |||
network->wait(); | |||
compare_lite_tensor<float>(output_tensor, result_mgb); | |||
} | |||
TEST(TestNetWorkOptions, jit_level) { | |||
Config config; | |||
config.device_type = LiteDeviceType::LITE_CUDA; | |||
auto tensor = get_input_data("./input_data.npy"); | |||
std::string model_path = "./shufflenet.mge"; | |||
std::string input_name = "data"; | |||
auto result_mgb = mgb_lar(model_path, config, input_name, tensor); | |||
config.options.jit_level = 1; | |||
std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
network->load_model(model_path); | |||
std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name); | |||
auto src_ptr = tensor->get_memory_ptr(); | |||
auto src_layout = tensor->get_layout(); | |||
input_tensor->reset(src_ptr, src_layout); | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
auto result_tensor = std::make_shared<Tensor>( | |||
LiteDeviceType::LITE_CPU, | |||
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
void* out_data = result_tensor->get_memory_ptr(); | |||
output_tensor->reset(out_data, result_tensor->get_layout()); | |||
network->forward(); | |||
network->wait(); | |||
compare_lite_tensor<float>(output_tensor, result_mgb); | |||
} | |||
#endif | |||
#if MGB_ENABLE_TENSOR_RT && LITE_WITH_CUDA | |||
TEST(TestNetWorkOptions, TensorRT) { | |||
Config config; | |||
config.device_type = LiteDeviceType::LITE_CUDA; | |||
auto tensor = get_input_data("./input_data.npy"); | |||
std::string model_path = "./shufflenet.mge"; | |||
std::string input_name = "data"; | |||
auto result_mgb = mgb_lar(model_path, config, input_name, tensor); | |||
std::shared_ptr<Network> network = std::make_shared<Network>(config); | |||
Runtime::use_tensorrt(network); | |||
set_tensor_rt_cache("./tensorrt_cache.txt"); | |||
network->load_model(model_path); | |||
std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name); | |||
auto src_ptr = tensor->get_memory_ptr(); | |||
auto src_layout = tensor->get_layout(); | |||
input_tensor->reset(src_ptr, src_layout); | |||
std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0); | |||
auto result_tensor = std::make_shared<Tensor>( | |||
LiteDeviceType::LITE_CPU, | |||
Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT}); | |||
void* out_data = result_tensor->get_memory_ptr(); | |||
output_tensor->reset(out_data, result_tensor->get_layout()); | |||
network->forward(); | |||
network->wait(); | |||
dump_tensor_rt_cache(); | |||
ASSERT_TRUE(fopen("./tensorrt_cache.txt", "r")); | |||
compare_lite_tensor<float>(output_tensor, result_mgb); | |||
} | |||
#endif | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,589 @@ | |||
/** | |||
* \file test/test_tensor.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "lite_build_config.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "../src/misc.h" | |||
#include "../src/mge/common.h" | |||
#include "../src/mge/network_impl.h" | |||
#include "lite/tensor.h" | |||
#include <gtest/gtest.h> | |||
#include <string.h> | |||
#include <cmath>
#include <memory>
using namespace lite; | |||
TEST(TestTensor, Basic) { | |||
Layout layout{{1, 3, 224, 224}, 4}; | |||
Tensor tensor1(LiteDeviceType::LITE_CPU); | |||
Tensor tensor2(LiteDeviceType::LITE_CPU, layout); | |||
Tensor tensor3(LiteDeviceType::LITE_CPU, layout); | |||
//! check that the underlying mge tensors have been created
ASSERT_TRUE(TensorHelper::implement(&tensor1)); | |||
ASSERT_TRUE(TensorHelper::implement(&tensor2)); | |||
ASSERT_TRUE(TensorHelper::implement(&tensor3)); | |||
//! check member | |||
ASSERT_EQ(tensor2.get_device_type(), LiteDeviceType::LITE_CPU); | |||
ASSERT_EQ(tensor2.get_layout(), layout); | |||
ASSERT_EQ(tensor3.get_layout(), layout); | |||
//! check the real tensor | |||
ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4); | |||
ASSERT_EQ(tensor3.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4); | |||
ASSERT_TRUE(TensorHelper::implement(&tensor1) | |||
->cast_final_safe<TensorImplDft>() | |||
.host_tensor()); | |||
ASSERT_FALSE(TensorHelper::implement(&tensor1) | |||
->cast_final_safe<TensorImplDft>() | |||
.dev_tensor()); | |||
ASSERT_FALSE(TensorHelper::implement(&tensor1) | |||
->cast_final_safe<TensorImplDft>() | |||
.dev_tensor()); | |||
ASSERT_TRUE(TensorHelper::implement(&tensor1) | |||
->cast_final_safe<TensorImplDft>() | |||
.host_tensor()); | |||
} | |||
TEST(TestTensor, SetLayoutReAlloc) { | |||
Layout layout{{1, 3, 224, 224}, 4}; | |||
Tensor tensor1; | |||
Tensor tensor2(LiteDeviceType::LITE_CPU, layout); | |||
Tensor tensor3(LiteDeviceType::LITE_CPU, layout); | |||
auto old_ptr2 = tensor2.get_memory_ptr(); | |||
auto old_ptr3 = tensor3.get_memory_ptr(); | |||
//! set a new, smaller layout; it should propagate to the underlying tensor
Layout layout1{{1, 3, 100, 100}, 4, LiteDataType::LITE_INT8}; | |||
tensor1.set_layout(layout1); | |||
tensor2.set_layout(layout1); | |||
tensor3.set_layout(layout1); | |||
ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 100 * 100); | |||
ASSERT_EQ(tensor3.get_tensor_total_size_in_byte(), 1 * 3 * 100 * 100); | |||
auto layout2 = TensorHelper::implement(&tensor2) | |||
->cast_final_safe<TensorImplDft>() | |||
.host_tensor() | |||
->layout(); | |||
auto layout3 = TensorHelper::implement(&tensor3) | |||
->cast_final_safe<TensorImplDft>() | |||
.host_tensor() | |||
->layout(); | |||
ASSERT_EQ(to_lite_layout(layout2), layout1); | |||
ASSERT_EQ(to_lite_layout(layout3), layout1); | |||
auto new_ptr2 = tensor2.get_memory_ptr(); | |||
auto new_ptr3 = tensor3.get_memory_ptr(); | |||
ASSERT_EQ(old_ptr2, new_ptr2); | |||
ASSERT_EQ(old_ptr3, new_ptr3); | |||
} | |||
TEST(TestTensor, Reset) { | |||
Layout layout{{3, 20}, 2, LiteDataType::LITE_FLOAT}; | |||
Tensor tensor1; | |||
Tensor tensor2(LiteDeviceType::LITE_CPU, layout); | |||
Tensor tensor3(LiteDeviceType::LITE_CPU, layout); | |||
auto old_ptr2 = tensor2.get_memory_ptr(); | |||
auto old_ptr3 = tensor3.get_memory_ptr(); | |||
//! make sure memory is allocated
ASSERT_NO_THROW(memcpy(old_ptr2, old_ptr3, 3 * 20 * 2)); | |||
std::shared_ptr<float> new_ptr2(new float[3 * 20], | |||
[](float* ptr) { delete[] ptr; }); | |||
std::shared_ptr<float> new_ptr3(new float[3 * 20], | |||
[](float* ptr) { delete[] ptr; }); | |||
tensor1.reset(new_ptr2.get(), layout); | |||
tensor2.reset(new_ptr2.get(), 3 * 20 * 4); | |||
tensor3.reset(new_ptr3.get(), 3 * 20 * 4); | |||
//! After reset the original mem is freed | |||
/*ASSERT_EXIT((memcpy(old_ptr2, old_ptr3, 3 * 20 * 2), exit(0)), | |||
::testing::KilledBySignal(SIGSEGV), ".*");*/ | |||
ASSERT_EQ(tensor2.get_memory_ptr(), new_ptr2.get()); | |||
ASSERT_EQ(tensor3.get_memory_ptr(), new_ptr3.get()); | |||
ASSERT_NO_THROW(memcpy(new_ptr2.get(), new_ptr3.get(), 3 * 20 * 2)); | |||
Layout layout1{{6, 20}, 2, LiteDataType::LITE_FLOAT}; | |||
std::shared_ptr<float> ptr2(new float[6 * 20], | |||
[](float* ptr) { delete[] ptr; }); | |||
std::shared_ptr<float> ptr3(new float[6 * 20], | |||
[](float* ptr) { delete[] ptr; }); | |||
tensor2.reset(ptr2.get(), layout1); | |||
tensor3.reset(ptr3.get(), layout1); | |||
//! memory is not freed by Tensor reset | |||
ASSERT_NO_THROW(memcpy(new_ptr2.get(), new_ptr3.get(), 3 * 20 * 2)); | |||
auto host_layout2 = TensorHelper::implement(&tensor2) | |||
->cast_final_safe<TensorImplDft>() | |||
.host_tensor() | |||
->layout(); | |||
auto host_layout3 = TensorHelper::implement(&tensor3) | |||
->cast_final_safe<TensorImplDft>() | |||
.host_tensor() | |||
->layout(); | |||
ASSERT_EQ(to_lite_layout(host_layout2), layout1); | |||
ASSERT_EQ(to_lite_layout(host_layout3), layout1); | |||
} | |||
TEST(TestTensor, CrossCNCopy) { | |||
Layout layout{{1, 3, 224, 224}, 4}; | |||
Tensor tensor1(LiteDeviceType::LITE_CPU); | |||
Tensor tensor2(LiteDeviceType::LITE_CPU, layout); | |||
Tensor tensor3(LiteDeviceType::LITE_CPU, layout); | |||
tensor2.copy_from(tensor3); | |||
tensor3.copy_from(tensor2); | |||
auto old_ptr2 = tensor2.get_memory_ptr(); | |||
auto old_ptr3 = tensor3.get_memory_ptr(); | |||
//! copying from an empty source tensor should throw
ASSERT_THROW(tensor2.copy_from(tensor1), std::exception); | |||
tensor1.copy_from(tensor2); | |||
tensor2.copy_from(tensor3); | |||
tensor3.copy_from(tensor2); | |||
ASSERT_EQ(tensor2.get_memory_ptr(), old_ptr2); | |||
ASSERT_EQ(tensor3.get_memory_ptr(), old_ptr3); | |||
} | |||
TEST(TestTensor, SharedTensorMemory) { | |||
Layout layout{{1, 3, 224, 224}, 4}; | |||
Tensor tensor1(LiteDeviceType::LITE_CPU); | |||
{ | |||
Tensor tensor2(LiteDeviceType::LITE_CPU, layout); | |||
tensor1.share_memory_with(tensor2); | |||
auto ptr1 = tensor1.get_memory_ptr(); | |||
auto ptr2 = tensor2.get_memory_ptr(); | |||
ASSERT_EQ(ptr1, ptr2); | |||
} | |||
// check that after tensor2 is destroyed, tensor1 can still access the shared memory
auto ptr1 = static_cast<float*>(tensor1.get_memory_ptr()); | |||
size_t length = tensor1.get_tensor_total_size_in_byte() / | |||
tensor1.get_layout().get_elem_size(); | |||
for (size_t i = 0; i < length; i++) { | |||
ptr1[i] = i; | |||
} | |||
} | |||
TEST(TestTensor, Reshape) { | |||
Layout layout{{1, 3, 224, 224}, 4}; | |||
Tensor tensor2(LiteDeviceType::LITE_CPU, layout); | |||
auto ptr = tensor2.get_memory_ptr(); | |||
//! invalid reshape arguments should throw
ASSERT_THROW(tensor2.reshape({-1, -1, 3 * 224 * 224}), std::exception); | |||
ASSERT_THROW(tensor2.reshape({-1, 3, 3 * 224 * 224}), std::exception); | |||
ASSERT_THROW(tensor2.reshape({1, 3, 3 * 224 * 224}), std::exception); | |||
ASSERT_THROW(tensor2.reshape({3, 3, 3 * 224 * 224}), std::exception); | |||
tensor2.reshape({3 * 224 * 224}); | |||
ASSERT_EQ(tensor2.get_layout().ndim, 1); | |||
ASSERT_EQ(tensor2.get_layout().data_type, LiteDataType::LITE_FLOAT); | |||
ASSERT_EQ(tensor2.get_layout().shapes[0], 3 * 224 * 224); | |||
tensor2.reshape({-1, 224, 224}); | |||
ASSERT_EQ(tensor2.get_layout().ndim, 3); | |||
ASSERT_EQ(tensor2.get_layout().shapes[0], 3); | |||
ASSERT_EQ(tensor2.get_layout().shapes[1], 224); | |||
ASSERT_EQ(tensor2.get_memory_ptr(), ptr); | |||
} | |||
TEST(TestTensor, Slice) { | |||
Layout layout{{20, 20}, 2}; | |||
Tensor tensor2(LiteDeviceType::LITE_CPU, layout); | |||
auto ptr = tensor2.get_memory_ptr(); | |||
//! slice arguments with mismatched dimensions should throw
ASSERT_THROW(tensor2.slice({5, 10, 10}, {10, 15}), std::exception); | |||
ASSERT_THROW(tensor2.slice({5, 10}, {10, 15}, {5}), std::exception); | |||
ASSERT_THROW(tensor2.slice({5, 10}, {10, 15, 10}), std::exception); | |||
for (int i = 0; i < 20 * 20; i++) { | |||
*(static_cast<float*>(ptr) + i) = i; | |||
} | |||
auto check = [&](size_t start, size_t end, size_t step) { | |||
Tensor tensor3; | |||
tensor3.copy_from( | |||
*tensor2.slice({start, start}, {end, end}, {step, step})); | |||
float* new_ptr = static_cast<float*>(tensor3.get_memory_ptr()); | |||
for (size_t i = start; i < end; i += step) { | |||
for (size_t j = start; j < end; j += step) { | |||
ASSERT_EQ(float(i * 20 + j), *new_ptr); | |||
++new_ptr; | |||
} | |||
} | |||
}; | |||
check(5, 10, 1); | |||
check(5, 11, 2); | |||
check(2, 18, 4); | |||
Tensor tensor3; | |||
tensor3.copy_from(*tensor2.slice({3}, {9}, {2})); | |||
float* new_ptr = static_cast<float*>(tensor3.get_memory_ptr()); | |||
for (size_t i = 3; i < 9; i += 2) { | |||
for (size_t j = 0; j < 20; j++) { | |||
ASSERT_EQ(float(i * 20 + j), *new_ptr); | |||
++new_ptr; | |||
} | |||
} | |||
} | |||
TEST(TestTensor, SliceCopy) { | |||
Layout layout{{20, 20}, 2}; | |||
Tensor tensor(LiteDeviceType::LITE_CPU, layout); | |||
//! alloc memory | |||
auto ptr = static_cast<float*>(tensor.get_memory_ptr()); | |||
Layout layout_slice{{20, 10}, 2}; | |||
Tensor tensor0(LiteDeviceType::LITE_CPU, layout_slice); | |||
auto ptr0 = tensor0.get_memory_ptr(); | |||
for (int i = 0; i < 10 * 20; i++) { | |||
*(static_cast<float*>(ptr0) + i) = i; | |||
} | |||
Tensor tensor1(LiteDeviceType::LITE_CPU, layout_slice); | |||
auto ptr1 = tensor1.get_memory_ptr(); | |||
for (int i = 0; i < 10 * 20; i++) { | |||
*(static_cast<float*>(ptr1) + i) = i + 200; | |||
} | |||
auto slice0 = tensor.slice({0, 0}, {20, 10}); | |||
auto slice1 = tensor.slice({0, 10}, {20, 20}); | |||
slice0->copy_from(tensor0); | |||
slice1->copy_from(tensor1); | |||
ASSERT_FALSE(slice0->is_continue_memory()); | |||
ASSERT_FALSE(slice1->is_continue_memory()); | |||
for (size_t i = 0; i < 20; i++) { | |||
for (size_t j = 0; j < 10; j++) { | |||
ASSERT_EQ(float(i * 10 + j), *ptr); | |||
++ptr; | |||
} | |||
for (size_t j = 0; j < 10; j++) { | |||
ASSERT_EQ(float(i * 10 + j + 200), *ptr); | |||
++ptr; | |||
} | |||
} | |||
slice0->fill_zero(); | |||
Tensor tmp; | |||
tmp.copy_from(*slice0); | |||
float* tmp_ptr = static_cast<float*>(tmp.get_memory_ptr()); | |||
for (size_t i = 0; i < 20; i++) { | |||
for (size_t j = 0; j < 10; j++) { | |||
ASSERT_EQ(float(0), *tmp_ptr); | |||
++tmp_ptr; | |||
} | |||
} | |||
} | |||
TEST(TestTensor, GetPtrOffset) { | |||
Layout layout{{20, 20}, 2}; | |||
Tensor tensor(LiteDeviceType::LITE_CPU, layout); | |||
//! alloc memory | |||
auto ptr = static_cast<float*>(tensor.get_memory_ptr()); | |||
auto ptr_offset = tensor.get_memory_ptr({10, 10}); | |||
ASSERT_EQ(ptr_offset, ptr + 10 * 20 + 10); | |||
auto slice0 = tensor.slice({0, 0}, {20, 10}); | |||
auto slice1 = tensor.slice({0, 10}, {20, 20}); | |||
ASSERT_FALSE(slice0->is_continue_memory()); | |||
ASSERT_FALSE(slice1->is_continue_memory()); | |||
auto ptr_offset_slice0 = slice0->get_memory_ptr({6, 5}); | |||
auto ptr_offset_slice1 = slice1->get_memory_ptr({2, 5}); | |||
ASSERT_EQ(ptr_offset_slice0, ptr + 6 * 20 + 5); | |||
ASSERT_EQ(ptr_offset_slice1, ptr + 2 * 20 + 10 + 5); | |||
} | |||
TEST(TestTensor, Concat) { | |||
Layout layout{{5, 5, 5}, 3}; | |||
std::vector<Tensor> tensors; | |||
for (int i = 0; i < 4; i++) { | |||
Tensor tensor(LiteDeviceType::LITE_CPU, layout); | |||
auto ptr = static_cast<float*>(tensor.get_memory_ptr()); | |||
for (int n = 0; n < 5 * 5 * 5; n++) { | |||
ptr[n] = i; | |||
} | |||
tensors.push_back(tensor); | |||
} | |||
auto check = [&](int dim) { | |||
auto new_tensor = TensorUtils::concat(tensors, dim); | |||
auto ptr = static_cast<float*>(new_tensor->get_memory_ptr()); | |||
size_t stride = std::pow(5, (3 - dim)); | |||
for (int i = 0; i < 4; i++) { | |||
for (size_t j = 0; j < stride; j++) { | |||
ASSERT_EQ(ptr[i * stride + j], i); | |||
} | |||
} | |||
}; | |||
check(0); | |||
check(1); | |||
check(2); | |||
} | |||
#if LITE_WITH_CUDA | |||
TEST(TestTensor, BasicDevice) { | |||
Layout layout{{1, 3, 224, 224}, 4}; | |||
Tensor tensor1(LiteDeviceType::LITE_CUDA, layout); | |||
Tensor tensor2(LiteDeviceType::LITE_CPU, layout); | |||
//! check that the underlying mge tensors have been created
ASSERT_TRUE(TensorHelper::implement(&tensor1)); | |||
ASSERT_TRUE(TensorHelper::implement(&tensor2)); | |||
//! check member | |||
ASSERT_EQ(tensor1.get_device_type(), LiteDeviceType::LITE_CUDA); | |||
ASSERT_EQ(tensor2.get_device_type(), LiteDeviceType::LITE_CPU); | |||
ASSERT_EQ(tensor2.get_layout(), layout); | |||
//! check the real tensor | |||
ASSERT_EQ(tensor1.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4); | |||
ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 224 * 224 * 4); | |||
ASSERT_TRUE(TensorHelper::implement(&tensor2) | |||
->cast_final_safe<TensorImplDft>() | |||
.host_tensor()); | |||
ASSERT_FALSE(TensorHelper::implement(&tensor2) | |||
->cast_final_safe<TensorImplDft>() | |||
.dev_tensor()); | |||
ASSERT_TRUE(TensorHelper::implement(&tensor1) | |||
->cast_final_safe<TensorImplDft>() | |||
.dev_tensor()); | |||
ASSERT_FALSE(TensorHelper::implement(&tensor1) | |||
->cast_final_safe<TensorImplDft>() | |||
.host_tensor()); | |||
} | |||
TEST(TestTensor, SetLayoutReAllocDevice) { | |||
Layout layout{{1, 3, 224, 224}, 4}; | |||
Tensor tensor2(LiteDeviceType::LITE_CUDA, layout); | |||
auto old_ptr2 = tensor2.get_memory_ptr(); | |||
//! set a new, smaller layout; it should propagate to the underlying tensor
Layout layout1{{1, 3, 100, 100}, 4, LiteDataType::LITE_INT8}; | |||
tensor2.set_layout(layout1); | |||
ASSERT_EQ(tensor2.get_tensor_total_size_in_byte(), 1 * 3 * 100 * 100); | |||
auto layout2 = TensorHelper::implement(&tensor2) | |||
->cast_final_safe<TensorImplDft>() | |||
.dev_tensor() | |||
->layout(); | |||
ASSERT_EQ(to_lite_layout(layout2), layout1); | |||
auto new_ptr2 = tensor2.get_memory_ptr(); | |||
ASSERT_EQ(old_ptr2, new_ptr2); | |||
} | |||
TEST(TestTensor, CrossCNCopyDevice) { | |||
Layout layout{{1, 3, 224, 224}, 4}; | |||
Tensor tensor0; | |||
Tensor tensor1(LiteDeviceType::LITE_CPU); | |||
Tensor tensor2(LiteDeviceType::LITE_CPU, layout); | |||
Tensor tensor3(LiteDeviceType::LITE_CUDA, layout); | |||
tensor2.copy_from(tensor3); | |||
tensor3.copy_from(tensor2); | |||
auto old_ptr2 = tensor2.get_memory_ptr(); | |||
auto old_ptr3 = tensor3.get_memory_ptr(); | |||
ASSERT_THROW(tensor3.copy_from(tensor1), std::exception); | |||
tensor1.copy_from(tensor3); | |||
tensor0.copy_from(tensor3); | |||
tensor2.copy_from(tensor3); | |||
tensor3.copy_from(tensor2); | |||
ASSERT_EQ(tensor2.get_memory_ptr(), old_ptr2); | |||
ASSERT_EQ(tensor3.get_memory_ptr(), old_ptr3); | |||
} | |||
TEST(TestTensor, PinnedHostMem) { | |||
Layout layout{{1, 3, 224, 224}, 4}; | |||
Tensor tensor1(LiteDeviceType::LITE_CPU); | |||
bool is_pinned_host = true; | |||
Tensor tensor2(LiteDeviceType::LITE_CUDA, layout, is_pinned_host); | |||
Tensor tensor3(LiteDeviceType::LITE_CUDA, layout); | |||
tensor2.copy_from(tensor3); | |||
tensor3.copy_from(tensor2); | |||
ASSERT_EQ(tensor2.is_pinned_host(), true); | |||
ASSERT_EQ(tensor3.is_pinned_host(), false); | |||
auto old_ptr2 = tensor2.get_memory_ptr(); | |||
auto old_ptr3 = tensor3.get_memory_ptr(); | |||
//! copying from an empty source tensor should throw
ASSERT_THROW(tensor2.copy_from(tensor1), std::exception); | |||
tensor1.copy_from(tensor2); | |||
tensor2.copy_from(tensor3); | |||
tensor3.copy_from(tensor2); | |||
ASSERT_EQ(tensor2.get_memory_ptr(), old_ptr2); | |||
ASSERT_EQ(tensor3.get_memory_ptr(), old_ptr3); | |||
} | |||
TEST(TestTensor, DeviceId) { | |||
if(get_device_count(LITE_CUDA) <= 1) | |||
return; | |||
Layout layout{{1, 3, 224, 224}, 4}; | |||
Tensor tensor2(0, LiteDeviceType::LITE_CUDA, layout); | |||
Tensor tensor3(1, LiteDeviceType::LITE_CUDA, layout); | |||
tensor2.copy_from(tensor3); | |||
tensor3.copy_from(tensor2); | |||
Tensor tensor1; | |||
tensor1.copy_from(tensor2); | |||
tensor1.copy_from(tensor3); | |||
} | |||
TEST(TestTensor, SliceDevice) { | |||
Layout layout{{20, 20}, 2}; | |||
Tensor host_tensor0; | |||
Tensor dev_tensor0(LiteDeviceType::LITE_CUDA, layout); | |||
host_tensor0.copy_from(dev_tensor0); | |||
auto ptr = host_tensor0.get_memory_ptr(); | |||
for (int i = 0; i < 20 * 20; i++) { | |||
*(static_cast<float*>(ptr) + i) = i; | |||
} | |||
dev_tensor0.copy_from(host_tensor0); | |||
auto check = [&](size_t start, size_t end, size_t step) { | |||
Tensor host_tensor; | |||
host_tensor.copy_from( | |||
*dev_tensor0.slice({start, start}, {end, end}, {step, step})); | |||
float* new_ptr = static_cast<float*>(host_tensor.get_memory_ptr()); | |||
for (size_t i = start; i < end; i += step) { | |||
for (size_t j = start; j < end; j += step) { | |||
ASSERT_EQ(float(i * 20 + j), *new_ptr); | |||
++new_ptr; | |||
} | |||
} | |||
}; | |||
check(5, 10, 1); | |||
check(5, 11, 2); | |||
check(2, 18, 4); | |||
} | |||
TEST(TestTensor, MemSetDevice) { | |||
Layout layout{{20, 20}, 2, LiteDataType::LITE_INT8}; | |||
Tensor host_tensor0(LiteDeviceType::LITE_CPU, layout); | |||
Tensor dev_tensor0(LiteDeviceType::LITE_CUDA, layout); | |||
auto check = [&](uint8_t val, const Tensor& tensor) { | |||
auto ptr = static_cast<uint8_t*>(tensor.get_memory_ptr()); | |||
for (int i = 0; i < 20 * 20; i++) { | |||
ASSERT_EQ(val, *(ptr + i)); | |||
} | |||
}; | |||
host_tensor0.fill_zero(); | |||
check(0, host_tensor0); | |||
Tensor host_tensor1; | |||
dev_tensor0.fill_zero(); | |||
host_tensor1.copy_from(dev_tensor0); | |||
check(0, host_tensor1); | |||
} | |||
TEST(TestTensor, DeviceSliceCopy) { | |||
Layout layout{{20, 20}, 2}; | |||
Tensor tensor(LiteDeviceType::LITE_CUDA, layout); | |||
//! alloc memory | |||
tensor.get_memory_ptr(); | |||
Layout layout_slice{{20, 10}, 2}; | |||
Tensor tensor0(LiteDeviceType::LITE_CPU, layout_slice); | |||
auto ptr0 = tensor0.get_memory_ptr(); | |||
for (int i = 0; i < 10 * 20; i++) { | |||
*(static_cast<float*>(ptr0) + i) = i; | |||
} | |||
Tensor tensor1(LiteDeviceType::LITE_CPU, layout_slice); | |||
auto ptr1 = tensor1.get_memory_ptr(); | |||
for (int i = 0; i < 10 * 20; i++) { | |||
*(static_cast<float*>(ptr1) + i) = i + 200; | |||
} | |||
auto slice0 = tensor.slice({0, 0}, {20, 10}); | |||
auto slice1 = tensor.slice({0, 10}, {20, 20}); | |||
slice0->copy_from(tensor0); | |||
slice1->copy_from(tensor1); | |||
ASSERT_FALSE(slice0->is_continue_memory()); | |||
ASSERT_FALSE(slice1->is_continue_memory()); | |||
Tensor host_tensor; | |||
host_tensor.copy_from(tensor); | |||
auto ptr = static_cast<float*>(host_tensor.get_memory_ptr()); | |||
for (size_t i = 0; i < 20; i++) { | |||
for (size_t j = 0; j < 10; j++) { | |||
ASSERT_EQ(float(i * 10 + j), *ptr); | |||
++ptr; | |||
} | |||
for (size_t j = 0; j < 10; j++) { | |||
ASSERT_EQ(float(i * 10 + j + 200), *ptr); | |||
++ptr; | |||
} | |||
} | |||
slice0->fill_zero(); | |||
Tensor tmp; | |||
tmp.copy_from(*slice0); | |||
float* tmp_ptr = static_cast<float*>(tmp.get_memory_ptr()); | |||
for (size_t i = 0; i < 20; i++) { | |||
for (size_t j = 0; j < 10; j++) { | |||
ASSERT_EQ(float(0), *tmp_ptr); | |||
++tmp_ptr; | |||
} | |||
} | |||
} | |||
TEST(TestTensor, ConcatDevice) { | |||
Layout layout{{5, 5, 5}, 3}; | |||
std::vector<Tensor> tensors; | |||
for (int i = 0; i < 4; i++) { | |||
Tensor tensor(LiteDeviceType::LITE_CPU, layout); | |||
auto ptr = static_cast<float*>(tensor.get_memory_ptr()); | |||
for (int n = 0; n < 5 * 5 * 5; n++) { | |||
ptr[n] = i; | |||
} | |||
tensors.push_back(tensor); | |||
} | |||
auto check = [&](int dim) { | |||
auto new_tensor = | |||
TensorUtils::concat(tensors, dim, LiteDeviceType::LITE_CUDA, 0); | |||
Tensor tensor(LiteDeviceType::LITE_CPU); | |||
tensor.copy_from(*new_tensor); | |||
auto ptr = static_cast<float*>(tensor.get_memory_ptr()); | |||
size_t stride = std::pow(5, (3 - dim)); | |||
for (int i = 0; i < 4; i++) { | |||
for (size_t j = 0; j < stride; j++) { | |||
ASSERT_EQ(ptr[i * stride + j], i); | |||
} | |||
} | |||
ASSERT_EQ(new_tensor->get_device_type(), LiteDeviceType::LITE_CUDA); | |||
ASSERT_EQ(new_tensor->get_device_id(), 0); | |||
}; | |||
check(0); | |||
check(1); | |||
check(2); | |||
} | |||
#endif | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,316 @@ | |||
/** | |||
* \file test/test_tensor_c.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include "lite_build_config.h" | |||
#if LITE_BUILD_WITH_MGE | |||
#include "../src/misc.h" | |||
#include "lite-c/global_c.h" | |||
#include "lite-c/tensor_c.h" | |||
#include <gtest/gtest.h> | |||
#include <memory> | |||
TEST(TestCapiTensor, Basic) { | |||
LiteTensor c_tensor0, c_tensor1; | |||
LiteTensorDesc description = default_desc; | |||
LITE_make_tensor(description, &c_tensor0); | |||
int is_pinned_host = false; | |||
LITE_is_pinned_host(c_tensor0, &is_pinned_host); | |||
ASSERT_FALSE(is_pinned_host); | |||
LiteDeviceType device_type; | |||
LITE_get_tensor_device_type(c_tensor0, &device_type); | |||
ASSERT_EQ(device_type, LiteDeviceType::LITE_CPU); | |||
size_t length = 0; | |||
LITE_get_tensor_total_size_in_byte(c_tensor0, &length); | |||
ASSERT_EQ(length, 0); | |||
LiteLayout layout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; | |||
description.device_type = LiteDeviceType::LITE_CPU; | |||
description.layout = layout; | |||
description.is_pinned_host = true; | |||
LITE_make_tensor(description, &c_tensor1); | |||
LITE_is_pinned_host(c_tensor1, &is_pinned_host); | |||
ASSERT_TRUE(is_pinned_host); | |||
LITE_get_tensor_total_size_in_byte(c_tensor1, &length); | |||
ASSERT_EQ(length, 1 * 3 * 224 * 224 * 4); | |||
LiteLayout get_layout; | |||
LITE_get_tensor_layout(c_tensor1, &get_layout); | |||
ASSERT_EQ(get_layout.ndim, layout.ndim); | |||
ASSERT_EQ(get_layout.data_type, layout.data_type); | |||
ASSERT_EQ(get_layout.shapes[0], layout.shapes[0]); | |||
ASSERT_EQ(get_layout.shapes[1], layout.shapes[1]); | |||
ASSERT_EQ(get_layout.shapes[2], layout.shapes[2]); | |||
ASSERT_EQ(get_layout.shapes[3], layout.shapes[3]); | |||
//! test error | |||
ASSERT_EQ(LITE_is_pinned_host(c_tensor0, nullptr), -1); | |||
ASSERT_NE(strlen(LITE_get_last_error()), 0); | |||
printf("The last error is: %s\n", LITE_get_last_error()); | |||
LITE_destroy_tensor(c_tensor0); | |||
LITE_destroy_tensor(c_tensor1); | |||
} | |||
TEST(TestCapiTensor, SetLayoutReAlloc) { | |||
LiteTensor c_tensor0; | |||
LiteTensorDesc description = default_desc; | |||
description.layout = | |||
LiteLayout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; | |||
LITE_make_tensor(description, &c_tensor0); | |||
void *old_ptr, *new_ptr; | |||
LITE_get_tensor_memory(c_tensor0, &old_ptr); | |||
LiteLayout new_layout = | |||
LiteLayout{{1, 3, 100, 100}, 4, LiteDataType::LITE_INT8}; | |||
LITE_set_tensor_layout(c_tensor0, new_layout); | |||
LITE_get_tensor_memory(c_tensor0, &new_ptr); | |||
size_t length = 0; | |||
LITE_get_tensor_total_size_in_byte(c_tensor0, &length); | |||
ASSERT_EQ(length, 1 * 3 * 100 * 100); | |||
ASSERT_EQ(old_ptr, new_ptr);
LITE_destroy_tensor(c_tensor0);
}
TEST(TestCapiTensor, Reset) { | |||
LiteTensor c_tensor0, c_tensor1; | |||
LiteTensorDesc description = default_desc; | |||
description.layout = LiteLayout{{3, 20}, 2, LiteDataType::LITE_FLOAT}; | |||
LITE_make_tensor(description, &c_tensor0); | |||
LITE_make_tensor(description, &c_tensor1); | |||
void *old_ptr0, *old_ptr1; | |||
LITE_get_tensor_memory(c_tensor0, &old_ptr0); | |||
LITE_get_tensor_memory(c_tensor1, &old_ptr1); | |||
//! make sure memory is allocated
ASSERT_NO_THROW(memcpy(old_ptr0, old_ptr1, 3 * 20 * 4)); | |||
std::shared_ptr<float> new_ptr0(new float[3 * 20], | |||
[](float* ptr) { delete[] ptr; }); | |||
std::shared_ptr<float> new_ptr1(new float[3 * 20], | |||
[](float* ptr) { delete[] ptr; }); | |||
LITE_reset_tensor_memory(c_tensor0, new_ptr0.get(), 3 * 20 * 4); | |||
LITE_reset_tensor_memory(c_tensor1, new_ptr1.get(), 3 * 20 * 4); | |||
void *tmp_ptr0, *tmp_ptr1; | |||
LITE_get_tensor_memory(c_tensor0, &tmp_ptr0); | |||
LITE_get_tensor_memory(c_tensor1, &tmp_ptr1); | |||
ASSERT_EQ(tmp_ptr0, new_ptr0.get()); | |||
ASSERT_EQ(tmp_ptr1, new_ptr1.get()); | |||
ASSERT_NO_THROW(memcpy(new_ptr0.get(), new_ptr1.get(), 3 * 20 * 4)); | |||
LiteLayout layout1{{6, 20}, 2, LiteDataType::LITE_FLOAT}; | |||
std::shared_ptr<float> ptr2(new float[6 * 20], | |||
[](float* ptr) { delete[] ptr; }); | |||
std::shared_ptr<float> ptr3(new float[6 * 20], | |||
[](float* ptr) { delete[] ptr; }); | |||
LITE_reset_tensor(c_tensor0, layout1, ptr2.get());
LITE_reset_tensor(c_tensor1, layout1, ptr3.get());
//! memory is not freed by Tensor reset | |||
ASSERT_NO_THROW(memcpy(new_ptr0.get(), new_ptr1.get(), 3 * 20 * 4)); | |||
LiteLayout tmp_layout0, tmp_layout1; | |||
LITE_get_tensor_layout(c_tensor0, &tmp_layout0); | |||
LITE_get_tensor_layout(c_tensor1, &tmp_layout1); | |||
ASSERT_EQ(tmp_layout0.ndim, tmp_layout1.ndim); | |||
ASSERT_EQ(tmp_layout0.data_type, tmp_layout1.data_type); | |||
ASSERT_EQ(tmp_layout0.shapes[0], tmp_layout1.shapes[0]); | |||
ASSERT_EQ(tmp_layout0.shapes[1], tmp_layout1.shapes[1]); | |||
LITE_destroy_tensor(c_tensor0); | |||
LITE_destroy_tensor(c_tensor1); | |||
} | |||
TEST(TestCapiTensor, CrossCNCopy) { | |||
LiteTensor c_tensor0, c_tensor1, c_tensor2; | |||
LiteTensorDesc description = default_desc; | |||
LITE_make_tensor(description, &c_tensor0); | |||
description.layout = | |||
LiteLayout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; | |||
LITE_make_tensor(description, &c_tensor1); | |||
LITE_make_tensor(description, &c_tensor2); | |||
LITE_tensor_copy(c_tensor1, c_tensor2); | |||
LITE_tensor_copy(c_tensor2, c_tensor1); | |||
void *old_ptr1, *old_ptr2, *new_ptr1, *new_ptr2; | |||
LITE_get_tensor_memory(c_tensor1, &old_ptr1); | |||
LITE_get_tensor_memory(c_tensor2, &old_ptr2); | |||
//! copying from an empty source tensor should return an error
ASSERT_EQ(LITE_tensor_copy(c_tensor1, c_tensor0), -1); | |||
ASSERT_NE(strlen(LITE_get_last_error()), 0); | |||
printf("The last error is: %s\n", LITE_get_last_error()); | |||
LITE_tensor_copy(c_tensor0, c_tensor1); | |||
LITE_tensor_copy(c_tensor1, c_tensor2); | |||
LITE_tensor_copy(c_tensor2, c_tensor0); | |||
LITE_get_tensor_memory(c_tensor1, &new_ptr1); | |||
LITE_get_tensor_memory(c_tensor2, &new_ptr2); | |||
ASSERT_EQ(old_ptr1, new_ptr1); | |||
ASSERT_EQ(old_ptr2, new_ptr2); | |||
LITE_destroy_tensor(c_tensor0); | |||
LITE_destroy_tensor(c_tensor1); | |||
LITE_destroy_tensor(c_tensor2); | |||
} | |||
TEST(TestCapiTensor, ShareMemoryWith) { | |||
LiteTensor c_tensor0, c_tensor1; | |||
LiteTensorDesc description = default_desc; | |||
LITE_make_tensor(description, &c_tensor0); | |||
description.layout = | |||
LiteLayout{{1, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; | |||
LITE_make_tensor(description, &c_tensor1); | |||
ASSERT_EQ(LITE_tensor_share_memory_with(c_tensor1, c_tensor0), -1); | |||
LITE_tensor_share_memory_with(c_tensor0, c_tensor1); | |||
void *ptr0, *ptr1; | |||
LITE_get_tensor_memory(c_tensor0, &ptr0); | |||
LITE_get_tensor_memory(c_tensor1, &ptr1); | |||
ASSERT_EQ(ptr0, ptr1); | |||
LITE_destroy_tensor(c_tensor0); | |||
LITE_destroy_tensor(c_tensor1); | |||
} | |||
TEST(TestCapiTensor, Reshape) { | |||
LiteTensor c_tensor0; | |||
LiteTensorDesc description = default_desc; | |||
description.layout = | |||
LiteLayout{{8, 8, 100, 100}, 4, LiteDataType::LITE_FLOAT}; | |||
LITE_make_tensor(description, &c_tensor0); | |||
void* old_ptr; | |||
LITE_get_tensor_memory(c_tensor0, &old_ptr); | |||
auto check = [&](std::vector<size_t> expect, const LiteTensor& tensor) { | |||
LiteLayout get_layout; | |||
LITE_get_tensor_layout(tensor, &get_layout); | |||
ASSERT_EQ(get_layout.ndim, expect.size()); | |||
for (size_t i = 0; i < expect.size(); i++) { | |||
ASSERT_EQ(get_layout.shapes[i], expect[i]); | |||
} | |||
void* new_ptr; | |||
LITE_get_tensor_memory(tensor, &new_ptr); | |||
ASSERT_EQ(old_ptr, new_ptr); | |||
}; | |||
{ | |||
int shape[2] = {-1, 50}; | |||
LITE_tensor_reshape(c_tensor0, shape, 2); | |||
check({8 * 8 * 100 * 2, 50}, c_tensor0); | |||
} | |||
{ | |||
int shape[3] = {64, 100, 100}; | |||
LITE_tensor_reshape(c_tensor0, shape, 3); | |||
check({8 * 8, 100, 100}, c_tensor0); | |||
} | |||
{ | |||
int shape[3] = {16, 100, -1}; | |||
LITE_tensor_reshape(c_tensor0, shape, 3); | |||
check({16, 100, 400}, c_tensor0); | |||
} | |||
LITE_destroy_tensor(c_tensor0); | |||
} | |||
TEST(TestCapiTensor, Slice) { | |||
LiteTensor c_tensor0; | |||
LiteTensorDesc description = default_desc; | |||
description.layout = LiteLayout{{20, 20}, 2, LiteDataType::LITE_FLOAT}; | |||
LITE_make_tensor(description, &c_tensor0); | |||
void* old_ptr; | |||
LITE_get_tensor_memory(c_tensor0, &old_ptr); | |||
for (size_t i = 0; i < 20 * 20; i++) { | |||
*(static_cast<float*>(old_ptr) + i) = i; | |||
} | |||
auto check = [&](size_t start, size_t end, size_t step, bool have_step) { | |||
LiteTensor tensor, slice_tensor; | |||
LITE_make_tensor(default_desc, &tensor); | |||
size_t start_ptr[2] = {start, start}; | |||
size_t end_ptr[2] = {end, end}; | |||
size_t step_ptr[2] = {step, step}; | |||
if (have_step) { | |||
LITE_tensor_slice(c_tensor0, start_ptr, end_ptr, step_ptr, 2, | |||
&slice_tensor); | |||
} else { | |||
LITE_tensor_slice(c_tensor0, start_ptr, end_ptr, nullptr, 2, | |||
&slice_tensor); | |||
} | |||
int is_continue = true; | |||
LITE_is_memory_continue(slice_tensor, &is_continue); | |||
ASSERT_FALSE(is_continue); | |||
LITE_tensor_copy(tensor, slice_tensor); | |||
void* new_ptr; | |||
LITE_get_tensor_memory(tensor, &new_ptr); | |||
float* ptr = static_cast<float*>(new_ptr); | |||
for (size_t i = start; i < end; i += step) { | |||
for (size_t j = start; j < end; j += step) { | |||
ASSERT_EQ(float(i * 20 + j), *ptr); | |||
++ptr; | |||
} | |||
} | |||
LITE_destroy_tensor(tensor); | |||
}; | |||
check(1, 8, 1, true); | |||
check(1, 8, 1, false); | |||
check(2, 10, 2, true); | |||
check(10, 18, 4, true); | |||
check(10, 18, 1, false); | |||
LITE_destroy_tensor(c_tensor0); | |||
} | |||
TEST(TestCapiTensor, Memset) { | |||
LiteTensor c_tensor0; | |||
LiteTensorDesc description = default_desc; | |||
description.layout = LiteLayout{{20, 20}, 2, LiteDataType::LITE_FLOAT}; | |||
LITE_make_tensor(description, &c_tensor0); | |||
void* ptr; | |||
uint8_t* uint8_ptr; | |||
LITE_get_tensor_memory(c_tensor0, &ptr); | |||
LITE_tensor_fill_zero(c_tensor0); | |||
uint8_ptr = static_cast<uint8_t*>(ptr); | |||
for (size_t i = 0; i < 20 * 20; i++) { | |||
ASSERT_EQ(0, *uint8_ptr); | |||
uint8_ptr++; | |||
} | |||
LITE_destroy_tensor(c_tensor0); | |||
} | |||
TEST(TestCapiTensor, GetMemoryByIndex) { | |||
LiteTensor c_tensor0; | |||
LiteTensorDesc description = default_desc; | |||
description.layout = LiteLayout{{20, 20}, 2, LiteDataType::LITE_FLOAT}; | |||
LITE_make_tensor(description, &c_tensor0); | |||
void *ptr0, *ptr1, *ptr2, *ptr3; | |||
LITE_get_tensor_memory(c_tensor0, &ptr0); | |||
size_t index0[] = {3, 4}; | |||
LITE_get_tensor_memory_with_index(c_tensor0, &index0[0], 2, &ptr1); | |||
size_t index1[] = {5, 7}; | |||
LITE_get_tensor_memory_with_index(c_tensor0, &index1[0], 2, &ptr2); | |||
size_t index2[] = {5}; | |||
LITE_get_tensor_memory_with_index(c_tensor0, &index2[0], 1, &ptr3); | |||
ASSERT_EQ(ptr1, static_cast<float*>(ptr0) + 3 * 20 + 4); | |||
ASSERT_EQ(ptr2, static_cast<float*>(ptr0) + 5 * 20 + 7); | |||
ASSERT_EQ(ptr3, static_cast<float*>(ptr0) + 5 * 20); | |||
LITE_destroy_tensor(c_tensor0); | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -0,0 +1,26 @@ | |||
#! /bin/bash -e | |||
set -e | |||
if [ $# -lt 2 ] ; then | |||
echo "USAGE: $0 src dst" | |||
echo " e.g.: $0 ~/xxx.mdl ~/xxx.encrypted.mdl" | |||
echo " e.g.: $0 ~/xxx.mdl ~/xxx.encrypted.mdl key" | |||
exit 1; | |||
fi | |||
IV=`openssl rand -hex 16` | |||
Key=000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F | |||
if [ $# == 3 ] ; then | |||
Key=$3 | |||
fi | |||
# get file size | |||
size=`wc -c $1` | |||
echo "encrypt aes-256-cbc ..." | |||
openssl enc -e -aes-256-cbc -in $1 -out $1.tmp -K $Key -iv $IV | |||
echo $IV | xxd -r -p | cat - $1.tmp > $2 | |||
# write size into file | |||
printf "%016x" ${size%\ *} | xxd -r -p >> $2 | |||
rm -f $1.tmp |
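# Rough decryption sketch for reference only (assumes GNU coreutils head/tail
# and the same Key as above; normally the lite runtime decrypts the model itself):
#   IV=`head -c 16 $2 | xxd -p`
#   tail -c +17 $2 | head -c -8 > $2.cipher
#   openssl enc -d -aes-256-cbc -in $2.cipher -out $2.decrypted -K $Key -iv $IV
#   rm -f $2.cipher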
@@ -0,0 +1,134 @@ | |||
#!/usr/bin/env mdl | |||
# -*- coding: utf-8 -*- | |||
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
# | |||
# Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
# | |||
# Unless required by applicable law or agreed to in writing, | |||
# software distributed under the License is distributed on an | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
from megskull.graph import NodeFilter, FpropEnv | |||
from megskull.opr.all import AssertEqual, DataProvider, BatchNormalization | |||
from megskull.utils.logconf import get_logger | |||
from meghair.utils import io | |||
import megbrain as mgb | |||
import argparse | |||
import struct | |||
import re | |||
import os | |||
import numpy as np | |||
import cv2 | |||
logger = get_logger(__name__) | |||
def optimize_for_inference(args, outputs): | |||
args_map = { | |||
'enable_io16xc32': 'f16_io_f32_comp', | |||
'enable_ioc16': 'f16_io_comp', | |||
'enable_hwcd4': 'use_nhwcd4', | |||
'enable_nchw4': 'use_nchw4', | |||
'enable_nchw88': 'use_nchw88', | |||
'enable_nchw44': 'use_nchw44', | |||
'enable_nchw44_dot': 'use_nchw44_dot', | |||
'enable_nchw32': 'use_nchw32', | |||
'enable_chwn4': 'use_chwn4', | |||
'enable_fuse_conv_bias_nonlinearity': 'fuse_conv_bias_nonlinearity', | |||
'enable_fuse_conv_bias_with_z': 'fuse_conv_bias_with_z', | |||
} | |||
kwargs = {} | |||
for k, v in args_map.items(): | |||
if getattr(args, k): | |||
assert args.optimize_for_inference, ( | |||
'optimize_for_inference should be set when {} is given'.format( | |||
k)) | |||
kwargs[v] = True | |||
if args.optimize_for_inference: | |||
return mgb.optimize_for_inference(outputs, **kwargs) | |||
return outputs | |||
def main(): | |||
parser = argparse.ArgumentParser(
description='Dump a Python MegBrain model to a C++ loadable model, '
'optionally optimizing it for inference',
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument('input', help='input pkl model file ') | |||
parser.add_argument('-o', '--output', help='output file', required=True) | |||
parser.add_argument('--init-bn', action='store_true', | |||
help='initialize untrained batch-normalization, to ' | |||
'avoid NaN or Inf results') | |||
parser.add_argument('--silent', action='store_true', | |||
help='set verbose to False in AssertEqual opr') | |||
parser.add_argument('--optimize-for-inference', action='store_true',
help='enable optimizations for inference')
parser.add_argument('--discard-var-name', action='store_true', | |||
help='discard variable and param names in the ' | |||
'generated output') | |||
parser.add_argument('--output-strip-info', action='store_true', | |||
help='output code strip information') | |||
parser.add_argument('--enable-io16xc32', action='store_true',
help='transform the model to use float16 io and float32 compute')
parser.add_argument('--enable-ioc16', action='store_true',
help='transform the dtype of the model to float16 for both '
'io and compute')
parser.add_argument('--enable-fuse-conv-bias-nonlinearity',
action='store_true',
help='fuse convolution, bias and nonlinearity oprs into a '
'single conv_bias opr')
parser.add_argument('--enable-hwcd4', action='store_true', | |||
help='transform the model format from NCHW to NHWCD4 ' | |||
'for inference; you may need to disable CUDA and set ' | |||
'MGB_USE_MEGDNN_DBG=2') | |||
parser.add_argument('--enable-nchw4', action='store_true', | |||
help='transform the model format from NCHW to NCHW4 ' | |||
'for inference') | |||
parser.add_argument('--enable-nchw88', action='store_true', | |||
help='transform the model format from NCHW to NCHW88 ' | |||
'for inference') | |||
parser.add_argument('--enable-nchw44', action='store_true', | |||
help='transform the model format from NCHW to NCHW44 ' | |||
'for inference') | |||
parser.add_argument('--enable-nchw44-dot', action='store_true', | |||
help='transform the model format from NCHW to NCHW44_DOT ' | |||
'for optimizing armv8.2 dot in inference') | |||
parser.add_argument('--enable-chwn4', action='store_true',
help='transform the model format to CHWN4 '
'for inference, mainly used for NVIDIA TensorCore')
parser.add_argument('--enable-nchw32', action='store_true',
help='transform the model format from NCHW4 to NCHW32 '
'for inference on NVIDIA TensorCore')
parser.add_argument('--enable-fuse-conv-bias-with-z', action='store_true', | |||
help='fuse conv_bias with z input for inference on ' | |||
'nvidia GPU (this optimization pass will result in mismatch ' | |||
'of the precision of output of training and inference)') | |||
args = parser.parse_args() | |||
env = FpropEnv(verbose_fprop=False) | |||
outputs = io.load_network(args.input).outputs | |||
output_mgbvars = list(map(env.get_mgbvar, outputs)) | |||
output_mgbvars = optimize_for_inference(args, output_mgbvars) | |||
if args.discard_var_name: | |||
sereg_kwargs = dict(keep_var_name=0, keep_param_name=False) | |||
else: | |||
sereg_kwargs = dict(keep_var_name=2, keep_param_name=True) | |||
stat = mgb.serialize_comp_graph_to_file( | |||
args.output, output_mgbvars, append=False, | |||
output_strip_info=args.output_strip_info, | |||
**sereg_kwargs) | |||
logger.info('graph dump sizes: tot_size={:.3f}KiB overhead={:.3f}KiB'. | |||
format(stat.tot_bytes / 1024, | |||
(stat.tot_bytes - stat.tensor_value_bytes) / 1024)) | |||
if __name__ == '__main__': | |||
main() |
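# Example invocation (script and file names are illustrative only):
#   ./dump_model.py net.pkl -o net.mge --optimize-for-inference --enable-io16xc32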
@@ -0,0 +1,75 @@ | |||
#!/usr/bin/env bash | |||
set -e | |||
function usage() { | |||
echo "$0 args1 args2 .." | |||
echo "available args detail:" | |||
echo "-i info.json : input info.json file" | |||
echo "-m model: model name" | |||
echo "-e encryption mode: encryption mode rc4 encrypt_predefined_rc4 " | |||
echo "-o output name: output name" | |||
echo "-n input model name: input model name match with info.json" | |||
echo "-h : show usage" | |||
exit -1 | |||
} | |||
while getopts "i:m:e:o:n:h" arg | |||
do | |||
case $arg in | |||
i) | |||
INFO_NAME=$OPTARG | |||
;; | |||
m) | |||
MODEL_NAME=$OPTARG | |||
;; | |||
n) | |||
INPUT_MODEL_NAME=$OPTARG | |||
;; | |||
e) | |||
ENCRYPT_MODE=$OPTARG | |||
;; | |||
o) | |||
OUTPUT_NAME=$OPTARG | |||
;; | |||
h) | |||
usage | |||
;; | |||
\?) | |||
echo "show usage" | |||
usage | |||
;; | |||
esac | |||
done | |||
echo "----------------------------------------------------" | |||
echo "commad args summary:" | |||
echo "INFO_NAME: $INFO_NAME" | |||
echo "MODEL_NAME: $MODEL_NAME" | |||
echo "ENCRYPT_MODE: $ENCRYPT_MODE" | |||
echo "OUTPUT_NAME: $OUTPUT_NAME" | |||
echo "INPUT_MODEL_NAME: $INPUT_MODEL_NAME" | |||
echo "----------------------------------------------------" | |||
if [[ $INFO_NAME == '' ]]; then | |||
echo "INFO_NAME is NULL,exit now..." | |||
exit -1 | |||
fi | |||
if [[ $MODEL_NAME == '' ]]; then | |||
echo "MODEL_NAME is NULL,exit now..." | |||
exit -1 | |||
fi | |||
if [[ $INPUT_MODEL_NAME == '' ]]; then | |||
echo "INPUT_MODEL_NAME is NULL,exit now..." | |||
exit -1 | |||
fi | |||
if [[ $OUTPUT_NAME == '' ]]; then | |||
echo "OUTPUT_NAME is NULL,exit now..." | |||
exit -1 | |||
fi | |||
ENCRYPT_INFO_NAME=$INFO_NAME.pr_rc4.emod
ENCRYPT_MODEL_NAME=$MODEL_NAME.pr_rc4.emod
./rc4_encryptor $ENCRYPT_MODE $INFO_NAME $ENCRYPT_INFO_NAME
./rc4_encryptor $ENCRYPT_MODE $MODEL_NAME $ENCRYPT_MODEL_NAME
python3 pack_model_and_info.py --input-model=$ENCRYPT_MODEL_NAME --model-name=$INPUT_MODEL_NAME --model-cryption="RC4_default" --info-cryption="RC4_default" --input-info=$ENCRYPT_INFO_NAME --info-parser="LITE_default" -o $OUTPUT_NAME |
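# Example invocation (file names are illustrative; expects ./rc4_encryptor and
# pack_model_and_info.py to be available in the working directory):
#   bash pack_model.sh -i info.json -m model.mge -e encrypt_predefined_rc4 -n model_name -o packed_model.lite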
@@ -0,0 +1,135 @@ | |||
#!/usr/bin/python3 | |||
# -*- coding: utf-8 -*- | |||
# | |||
# This file is part of MegEngine, a deep learning framework developed by | |||
# Megvii. | |||
# | |||
# copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
import argparse | |||
import struct | |||
import os | |||
import subprocess | |||
import flatbuffers | |||
def generate_flatbuffer(): | |||
status, path = subprocess.getstatusoutput('which flatc') | |||
if not status: | |||
cwd = os.path.dirname(os.path.dirname(__file__)) | |||
fbs_file = os.path.abspath(os.path.join(cwd, | |||
"../../src/parse_model/pack_model.fbs")) | |||
cmd = path + ' -p -b '+fbs_file | |||
ret, _ = subprocess.getstatusoutput(str(cmd)) | |||
if ret: | |||
raise Exception("flatc generate error!") | |||
else: | |||
raise Exception('no flatc in current environment, please build flatc ' | |||
'and put in the system PATH!') | |||
def main(): | |||
parser = argparse.ArgumentParser(
description='load an encrypted or unencrypted model together with a '
'JSON-format file describing the model, and pack them into a file '
'which can be loaded by lite.')
parser.add_argument('--input-model', help='input an encrypted or unencrypted model')
parser.add_argument('--input-info', help='input an encrypted or unencrypted '
'JSON-format info file.')
parser.add_argument('--model-name', help='the model name, this must match '
'the model name in the model info', default = 'NONE')
parser.add_argument('--model-cryption', help='the model encryption method '
'name, this is used to find the right decryption method, e.g. '
'--model-cryption="AES_default". default is NONE.', default =
'NONE')
parser.add_argument('--info-cryption', help='the info encryption method '
'name, this is used to find the right decryption method, e.g. '
'--info-cryption="AES_default". default is NONE.', default =
'NONE')
parser.add_argument('--info-parser', help='the information parser method name, '
'default is "LITE_default".', default = 'LITE_default')
parser.add_argument('--append', '-a', help='append another model to a ' | |||
'packed model.') | |||
parser.add_argument('--output', '-o', help='output file of packed model.') | |||
args = parser.parse_args() | |||
generate_flatbuffer() | |||
assert not args.append, ('--append is not supported yet')
assert args.input_model, ('--input_model must be given') | |||
with open(args.input_model, 'rb') as fin: | |||
raw_model = fin.read() | |||
model_length = len(raw_model) | |||
if args.input_info: | |||
with open(args.input_info, 'rb') as fin: | |||
raw_info = fin.read() | |||
info_length = len(raw_info) | |||
else: | |||
raw_info = None | |||
info_length = 0 | |||
# Generated by `flatc`. | |||
from model_parse import Model, ModelData, ModelHeader, ModelInfo, PackModel | |||
builder = flatbuffers.Builder(1024) | |||
model_name = builder.CreateString(args.model_name) | |||
model_cryption = builder.CreateString(args.model_cryption) | |||
info_cryption = builder.CreateString(args.info_cryption) | |||
info_parser = builder.CreateString(args.info_parser) | |||
info_data = builder.CreateByteVector(raw_info) | |||
arr_data = builder.CreateByteVector(raw_model) | |||
    # model header | |||
ModelHeader.ModelHeaderStart(builder) | |||
ModelHeader.ModelHeaderAddName(builder, model_name) | |||
ModelHeader.ModelHeaderAddModelDecryptionMethod(builder, model_cryption) | |||
ModelHeader.ModelHeaderAddInfoDecryptionMethod(builder, info_cryption) | |||
ModelHeader.ModelHeaderAddInfoParseMethod(builder, info_parser) | |||
model_header = ModelHeader.ModelHeaderEnd(builder) | |||
    # model info | |||
ModelInfo.ModelInfoStart(builder) | |||
ModelInfo.ModelInfoAddData(builder, info_data) | |||
model_info = ModelInfo.ModelInfoEnd(builder) | |||
    # model data | |||
ModelData.ModelDataStart(builder) | |||
ModelData.ModelDataAddData(builder, arr_data) | |||
model_data = ModelData.ModelDataEnd(builder) | |||
Model.ModelStart(builder) | |||
Model.ModelAddHeader(builder, model_header) | |||
Model.ModelAddData(builder, model_data) | |||
Model.ModelAddInfo(builder, model_info) | |||
model = Model.ModelEnd(builder) | |||
PackModel.PackModelStartModelsVector(builder, 1) | |||
builder.PrependUOffsetTRelative(model) | |||
models = builder.EndVector(1) | |||
PackModel.PackModelStart(builder) | |||
PackModel.PackModelAddModels(builder, models) | |||
packed_model = PackModel.PackModelEnd(builder) | |||
builder.Finish(packed_model) | |||
buff = builder.Output() | |||
    # prefix the flatbuffers payload with the fixed ASCII tag "packed_model" | |||
    result = struct.pack(str(len("packed_model")) + 's', "packed_model".encode('ascii')) | |||
result += buff | |||
assert args.output, ('--output must be given') | |||
    with open(args.output, 'wb') as fout: | |||
        fout.write(result) | |||
print("Model packaged successfully!!!") | |||
print("model name is: {}.".format(args.model_name)) | |||
print("model encryption method is: {}. ".format(args.model_cryption)) | |||
print("model json infomation encryption method is: {}. ".format(args.info_cryption)) | |||
print("model json infomation parse method is: {}. ".format(args.info_parser)) | |||
print("packed model is write to {} ".format(args.output)) | |||
if __name__ == '__main__': | |||
main() |
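A minimal sanity check for the output format, offered as a sketch: pack_model_and_info.py always writes the ASCII tag "packed_model" in front of the flatbuffers payload, so a consumer can recognize a packed file by its first bytes. The file name shufflenet.packed below is only an illustration, not something the script produces by default.

import os

PACK_TAG = b"packed_model"

def is_packed_model(path):
    # the packer prefixes the flatbuffers buffer with the fixed ASCII tag,
    # so comparing the first len(PACK_TAG) bytes is enough to recognize the format
    with open(path, "rb") as f:
        head = f.read(len(PACK_TAG))
    return head == PACK_TAG

if __name__ == "__main__":
    # hypothetical file name, used only for illustration
    if os.path.exists("shufflenet.packed"):
        print(is_packed_model("shufflenet.packed"))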
@@ -0,0 +1,211 @@ | |||
/** \file tools/rc4_encrypt.cpp | |||
* | |||
* This file is part of MegEngine, a deep learning framework developed by | |||
* Megvii. | |||
* | |||
* \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved. | |||
*/ | |||
#include <stdio.h> | |||
#include <algorithm> | |||
#include <string> | |||
#include <unordered_map> | |||
#include <vector> | |||
#include <memory> | |||
#include "../src/decryption/rc4/rc4_cryption_base.h" | |||
#include "../src/decryption/rc4_cryption.h" | |||
using namespace lite; | |||
std::shared_ptr<void> read_file(std::string file_path, size_t& size) { | |||
FILE* fin = fopen(file_path.c_str(), "rb"); | |||
    if (!fin) { | |||
        printf("failed to open %s.\n", file_path.c_str()); | |||
        size = 0; | |||
        return nullptr; | |||
    } | |||
fseek(fin, 0, SEEK_END); | |||
size = ftell(fin); | |||
fseek(fin, 0, SEEK_SET); | |||
void* ptr = malloc(size); | |||
std::shared_ptr<void> buf{ptr, ::free}; | |||
fread(buf.get(), 1, size, fin); | |||
fclose(fin); | |||
return buf; | |||
} | |||
void write_file(std::string file_path, const std::vector<uint8_t>& data) { | |||
    FILE* fout = fopen(file_path.c_str(), "wb"); | |||
    if (!fout) { | |||
        printf("failed to open %s.\n", file_path.c_str()); | |||
        return; | |||
    } | |||
    fwrite(data.data(), 1, data.size(), fout); | |||
    fclose(fout); | |||
} | |||
typedef int (*CommandHandler)(int, char**); | |||
const char* usage = | |||
"Usage:\n" | |||
" rc4_encryptor encrypt_predefined_rc4 <input file> <output file>\n" | |||
" rc4_encryptor encrypt_rc4 <hash key> <enc key> <input file> <output " | |||
"file>\n" | |||
" rc4_encryptor encrypt_predefined_sfrc4 <input file> <output file>\n" | |||
" rc4_encryptor encrypt_sfrc4 <hash key> <enc key> <input file> " | |||
"<output " | |||
"file>\n" | |||
" rc4_encryptor hash <input file>\n"; | |||
int command_encrypt_predefined_rc4(int argc, char** argv) { | |||
if (argc != 4) { | |||
printf("Invalid encrypt_predefined_rc4 arguments.\n"); | |||
return 1; | |||
} | |||
const char* input_file_path = argv[2]; | |||
const char* output_file_path = argv[3]; | |||
    size_t size = 0; | |||
    auto keys = RC4::get_decrypt_key(); | |||
    printf("Reading input file ...\n"); | |||
    auto input = read_file(input_file_path, size); | |||
    if (!input) { | |||
        return 1; | |||
    } | |||
    printf("Encrypting ...\n"); | |||
    auto output = RC4::encrypt_model(input.get(), size, keys); | |||
write_file(output_file_path, output); | |||
printf("Done.\n"); | |||
return 0; | |||
} | |||
int command_encrypt_rc4(int argc, char** argv) { | |||
if (argc != 6) { | |||
printf("Invalid encrypt_rc4 arguments.\n"); | |||
return 1; | |||
} | |||
uint64_t hash_key = std::stoull(argv[2], 0, 0); | |||
uint64_t enc_key = std::stoull(argv[3], 0, 0); | |||
const char* input_file_path = argv[4]; | |||
const char* output_file_path = argv[5]; | |||
std::vector<uint8_t> keys(128, 0); | |||
uint64_t* data = reinterpret_cast<uint64_t*>(keys.data()); | |||
data[0] = hash_key; | |||
data[1] = enc_key; | |||
size_t size = 0; | |||
    printf("Reading input file ...\n"); | |||
    auto input = read_file(input_file_path, size); | |||
    if (!input) { | |||
        return 1; | |||
    } | |||
    printf("Encrypting ...\n"); | |||
    auto output = RC4::encrypt_model(input.get(), size, keys); | |||
write_file(output_file_path, output); | |||
printf("Done.\n"); | |||
return 0; | |||
} | |||
int command_encrypt_predefined_sfrc4(int argc, char** argv) { | |||
if (argc != 4) { | |||
printf("Invalid encrypt_predefined_rc4 arguments.\n"); | |||
return 1; | |||
} | |||
const char* input_file_path = argv[2]; | |||
const char* output_file_path = argv[3]; | |||
    size_t size = 0; | |||
    auto keys = SimpleFastRC4::get_decrypt_key(); | |||
    printf("Reading input file ...\n"); | |||
    auto input = read_file(input_file_path, size); | |||
    if (!input) { | |||
        return 1; | |||
    } | |||
    printf("Encrypting ...\n"); | |||
    auto output = SimpleFastRC4::encrypt_model(input.get(), size, keys); | |||
write_file(output_file_path, output); | |||
printf("Done.\n"); | |||
return 0; | |||
} | |||
int command_encrypt_sfrc4(int argc, char** argv) { | |||
if (argc != 6) { | |||
printf("Invalid encrypt_rc4 arguments.\n"); | |||
return 1; | |||
} | |||
uint64_t hash_key = std::stoull(argv[2], 0, 0); | |||
uint64_t enc_key = std::stoull(argv[3], 0, 0); | |||
const char* input_file_path = argv[4]; | |||
const char* output_file_path = argv[5]; | |||
std::vector<uint8_t> keys(128, 0); | |||
uint64_t* data = reinterpret_cast<uint64_t*>(keys.data()); | |||
data[0] = hash_key; | |||
data[1] = enc_key; | |||
size_t size = 0; | |||
    printf("Reading input file ...\n"); | |||
    auto input = read_file(input_file_path, size); | |||
    if (!input) { | |||
        return 1; | |||
    } | |||
    printf("Encrypting ...\n"); | |||
    auto output = SimpleFastRC4::encrypt_model(input.get(), size, keys); | |||
write_file(output_file_path, output); | |||
printf("Done.\n"); | |||
return 0; | |||
} | |||
int command_hash(int argc, char** argv) { | |||
if (argc != 3) { | |||
printf("Invalid hash arguments.\n"); | |||
return 1; | |||
} | |||
    const char* input_file_path = argv[2]; | |||
    size_t len = 0; | |||
    auto input = read_file(input_file_path, len); | |||
    if (!input) { | |||
        return 1; | |||
    } | |||
rc4::FastHash64 hasher(rc4::key_gen_hash_key()); | |||
auto start = static_cast<const char*>(input.get()); | |||
auto ptr = reinterpret_cast<const uint64_t*>(start); | |||
while (reinterpret_cast<const char*>(ptr + 1) <= start + len) { | |||
hasher.feed(*ptr); | |||
++ptr; | |||
} | |||
auto cptr = reinterpret_cast<const char*>(ptr); | |||
if (cptr < start + len) { | |||
uint64_t v = 0; | |||
std::copy(cptr, start + len, reinterpret_cast<char*>(&v)); | |||
hasher.feed(v); | |||
} | |||
printf("%llx\n", static_cast<unsigned long long>(hasher.get())); | |||
return 0; | |||
} | |||
std::unordered_map<std::string, CommandHandler> commands = { | |||
{"encrypt_predefined_rc4", command_encrypt_predefined_rc4}, | |||
{"encrypt_rc4", command_encrypt_rc4}, | |||
{"encrypt_predefined_sfrc4", command_encrypt_predefined_sfrc4}, | |||
{"encrypt_sfrc4", command_encrypt_sfrc4}, | |||
{"hash", command_hash}, | |||
}; | |||
int main(int argc, char** argv) { | |||
if (argc == 1) { | |||
printf("%s", usage); | |||
return 1; | |||
} | |||
auto it = commands.find(argv[1]); | |||
if (it == commands.end()) { | |||
printf("Invalid command arguments.\n"); | |||
printf("%s", usage); | |||
return 1; | |||
} | |||
return it->second(argc, argv); | |||
} | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
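For completeness, a hedged Python sketch of driving the encryptor from a packaging script, in the same spirit as the shell helper above that calls ./rc4_encryptor; the binary path and the file names model.mge / model.pr_rc4.emod are assumptions for illustration, and only subcommands listed in the usage string are used.

import subprocess

def rc4_encrypt_predefined(input_path, output_path, encryptor="./rc4_encryptor"):
    # "encrypt_predefined_rc4 <input file> <output file>" uses the built-in key pair
    subprocess.run([encryptor, "encrypt_predefined_rc4", input_path, output_path],
                   check=True)

def rc4_hash(input_path, encryptor="./rc4_encryptor"):
    # "hash <input file>" prints the FastHash64 digest as hex on stdout
    proc = subprocess.run([encryptor, "hash", input_path],
                          check=True, capture_output=True, text=True)
    return proc.stdout.strip()

if __name__ == "__main__":
    # hypothetical file names, shown only to illustrate the call sequence
    rc4_encrypt_predefined("model.mge", "model.pr_rc4.emod")
    print(rc4_hash("model.pr_rc4.emod"))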
@@ -209,6 +209,35 @@ function do_build() { | |||
echo "comapt whl name: ${compat_whl_name}" | |||
cp ${BUILD_DIR}/staging/dist/Meg*.whl ${MACOS_WHL_HOME}/${compat_whl_name} | |||
# handle megenginelite | |||
cd ${BUILD_DIR} | |||
rm -rf lite_staging | |||
mkdir -p lite_staging/megenginelite | |||
cp ${SRC_DIR}/lite/pylite/megenginelite/* lite_staging/megenginelite/ | |||
cp ${SRC_DIR}/lite/pylite/setup.py lite_staging/ | |||
cp ${SRC_DIR}/lite/pylite/requires.txt lite_staging/ | |||
VER_FILE=${SRC_DIR}/imperative/python/megengine/version.py | |||
if [ -f ${VER_FILE} ];then | |||
cp ${VER_FILE} lite_staging/megenginelite | |||
else | |||
echo "ERROR: can not find version file" | |||
exit -1 | |||
fi | |||
mkdir -p ${BUILD_DIR}/lite_staging/megenginelite/libs | |||
LITE_LIB=${BUILD_DIR}/lite_staging/megenginelite/libs/liblite_shared.dylib | |||
cp ${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_OFF/MGE_INFERENCE_ONLY_OFF/Release/build/lite/liblite_shared.dylib ${LITE_LIB} | |||
llvm-strip -s ${LITE_LIB} | |||
cd ${BUILD_DIR}/lite_staging/ | |||
${PYTHON_DIR}/bin/python3 setup.py bdist_wheel | |||
cd ${BUILD_DIR}/lite_staging/dist/ | |||
org_whl_name=`ls Meg*.whl` | |||
index=`awk -v a="${org_whl_name}" -v b="-macosx" 'BEGIN{print index(a,b)}'` | |||
compat_whl_name=`echo ${org_whl_name} |cut -b -$index`macosx_10_14_x86_64.whl | |||
echo "megenginelite org whl name: ${org_whl_name}" | |||
echo "megenginelite comapt whl name: ${compat_whl_name}" | |||
cp ${BUILD_DIR}/lite_staging/dist/Meg*.whl ${MACOS_WHL_HOME}/${compat_whl_name} | |||
cd ${SRC_DIR} | |||
echo "" | |||
echo "##############################################################################################" | |||
@@ -155,6 +155,33 @@ do | |||
echo "comapt whl name: ${compat_whl_name}" | |||
mv ${org_whl_name} ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${SDK_NAME}/${compat_whl_name} | |||
# handle megenginelite | |||
cd ${BUILD_DIR} | |||
rm -rf lite_staging | |||
mkdir -p lite_staging/megenginelite | |||
cp ${SRC_DIR}/lite/pylite/megenginelite/* lite_staging/megenginelite/ | |||
cp ${SRC_DIR}/lite/pylite/setup.py lite_staging/ | |||
cp ${SRC_DIR}/lite/pylite/requires.txt lite_staging/ | |||
VER_FILE=${SRC_DIR}/imperative/python/megengine/version.py | |||
if [ -f ${VER_FILE} ];then | |||
cp ${VER_FILE} lite_staging/megenginelite | |||
else | |||
echo "ERROR: can not find version file" | |||
exit -1 | |||
fi | |||
patch_elf_depend_lib_megenginelite | |||
cd ${BUILD_DIR}/lite_staging/ | |||
${PYTHON_DIR}/bin/python setup.py bdist_wheel | |||
cd /home/output | |||
mkdir -p ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${SDK_NAME} | |||
cd ${BUILD_DIR}/lite_staging/dist/ | |||
org_whl_name=`ls Meg*${ver}*.whl` | |||
compat_whl_name=`echo ${org_whl_name} | sed 's/linux/manylinux2014/'` | |||
echo "megenginelite org whl name: ${org_whl_name}" | |||
echo "megenginelite comapt whl name: ${compat_whl_name}" | |||
mv ${org_whl_name} ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${SDK_NAME}/${compat_whl_name} | |||
cd /home/output | |||
chown -R ${UID}.${UID} . | |||
# compat for root-less docker env to remove output at host side | |||