From 4e0c9ad3c669489a95c53dc63ef7efafd959b65a Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Mon, 8 Jun 2020 06:35:41 +0000
Subject: [PATCH] feat(mgb/external): extern-c-opr dumper and loader for MACE

GitOrigin-RevId: bfe5420b1de97eae9ee943c4d3adaf652d795e03
---
 ACKNOWLEDGMENTS                        |   2 +
 sdk/c-opr-loaders/mace/Makefile        |  43 ++++++
 sdk/c-opr-loaders/mace/README.md       |  63 ++++++++
 sdk/c-opr-loaders/mace/dump_model.py   | 115 +++++++++++++++
 sdk/c-opr-loaders/mace/extern_c_opr.h  |   1 +
 sdk/c-opr-loaders/mace/mace_loader.cpp | 260 +++++++++++++++++++++++++++++++++
 6 files changed, 484 insertions(+)
 create mode 100644 sdk/c-opr-loaders/mace/Makefile
 create mode 100644 sdk/c-opr-loaders/mace/README.md
 create mode 100644 sdk/c-opr-loaders/mace/dump_model.py
 create mode 120000 sdk/c-opr-loaders/mace/extern_c_opr.h
 create mode 100644 sdk/c-opr-loaders/mace/mace_loader.cpp

diff --git a/ACKNOWLEDGMENTS b/ACKNOWLEDGMENTS
index f2a476b9..3045a7f5 100644
--- a/ACKNOWLEDGMENTS
+++ b/ACKNOWLEDGMENTS
@@ -727,6 +727,8 @@ This software has been modified by Megvii Inc.
 
 4. FlatBuffers
 Copyright 2014 Google Inc. All rights reserved.
 
+5. MACE
+Copyright 2018 Xiaomi Inc. All rights reserved.
 
 Terms of Apache License Version 2.0
diff --git a/sdk/c-opr-loaders/mace/Makefile b/sdk/c-opr-loaders/mace/Makefile
new file mode 100644
index 00000000..c054cab7
--- /dev/null
+++ b/sdk/c-opr-loaders/mace/Makefile
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+ifeq ($(SDKPATH),)
+	CURDIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+	SDKPATH = $(CURDIR)/arm64-v8a
+endif
+
+TARGET := libmace_loader.so
+all: $(TARGET)
+
+CXX = aarch64-linux-android-g++
+
+CXXFLAGS = -std=c++14 -I$(SDKPATH)/include \
+	-Dmgb_mace_loader_init=mgb_c_opr_init
+
+DEBUG ?= 0
+ifeq ($(DEBUG), 1)
+	CXXFLAGS += -O0 -g
+else
+	CXXFLAGS += -O2
+endif
+
+LDFLAGS = -L$(SDKPATH)/lib \
+	-lmace_static -lcore -lgenerated_version \
+	-lport_android -lport_linux_base -lport_posix -lport_base \
+	-lproto -lutils -lgenerated_opencl_kernel \
+	-lops -lcore -lprotobuf-lite -llog \
+	-lpthread -l:rpcmem.a
+
+LDLIBS = -shared -fPIC
+
+$(TARGET): mace_loader.cpp extern_c_opr.h
+	$(CXX) $(CXXFLAGS) $(LDLIBS) -o $@ $< $(LDFLAGS)
+
+clean:
+	@rm -f $(TARGET)
diff --git a/sdk/c-opr-loaders/mace/README.md b/sdk/c-opr-loaders/mace/README.md
new file mode 100644
index 00000000..fdf5e581
--- /dev/null
+++ b/sdk/c-opr-loaders/mace/README.md
@@ -0,0 +1,63 @@
+# Extern-C-Opr with MACE
+
+### Build MegEngine `load_and_run` for arm64-v8a
+
+```bash
+cd $MEGENGINE_HOME
+./scripts/cmake-build/cross_build_android_arm_inference.sh -a arm64-v8a
+```
+
+After a successful build, `load_and_run` should be in `$MEGENGINE_HOME/build_dir/android/arm64-v8a/Release/install/bin`.
+
+### Build MACE libraries for arm64-v8a with GPU runtime
+
+```bash
+cd $MACE_HOME
+RUNTIME=GPU bash tools/cmake/cmake-build-arm64-v8a.sh
+
+cp -r $MACE_HOME/build/cmake-build/arm64-v8a/install $MEGENGINE_HOME/sdk/c-opr-loaders/mace/arm64-v8a
+```
+
+### Build the MACE loader for MegEngine
+
+```
+SDKPATH=/path/to/mace-sdk make
+```
+
+If `SDKPATH` is not set, it defaults to `./arm64-v8a`.
+
+You can build in debug mode (by adding `DEBUG=1` to the make command), which prints more information at runtime.
+
+### Prepare a MACE model (for example, resnet_50) and wrap it with a MegEngine extern c opr
+
+```
+python3 dump_model.py path/to/resnet_50.pb path/to/resnet_50.data path/to/resnet_50.mdl path/to/resnet_50.yml
+```
+
+The `*.pb` file holds the model structure; the `*.data` file holds the model parameters.
+
+Check [here](https://github.com/XiaoMi/mace-models) to learn how to write yml files for MACE.
+
+### Run with load-and-run
+
+First of all, send all the files to the target device:
+
+- load_and_run
+- resnet_50.mdl
+- libmace_loader.so
+- the OpenCL library (something like libOpenCL.so, libmali.so or libEGL.so ...) if you want to run on GPU
+
+```
+RUNTIME=GPU OPENCLPATH=/path/to/opencl DATAFORMAT=NCHW /path/to/load_and_run /path/to/resnet_50.mdl --c-opr-lib /path/to/libmace_loader.so
+```
+
+RUNTIME candidates:
+
+- CPU
+- GPU
+
+Running with the GPU runtime on Android needs an OpenCL library; set its path with the `OPENCLPATH` environment variable.
+
+We mainly use the NCHW data format; if you have an NHWC model, set `DATAFORMAT=NHWC` in the environment.
+
+If you want to run with the HEXAGON runtime, more effort is needed; please check [here](https://mace.readthedocs.io/en/latest/faq.html#why-is-mace-not-working-on-dsp).
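The yml consumed by `dump_model.py` (next file) needs only a small subset of MACE's deployment-file schema: for each model, the first entry of `subgraphs` must carry `input_tensors`/`input_shapes` and either `check_tensors`/`check_shapes` or `output_tensors`/`output_shapes`. Below is a minimal sketch that writes such a config with Python; the model name, tensor names and shapes are illustrative placeholders, not values from a real resnet_50 deployment file.

```python
import yaml

# minimal structure read by dump_model.py; any other field of a full
# MACE deployment file is ignored by the script
config = {
    "models": {
        "some_model": {  # hypothetical model name
            "subgraphs": [
                {
                    "input_tensors": ["input"],       # tensor names in the .pb
                    "input_shapes": ["1,3,224,224"],  # comma-separated dims
                    "output_tensors": ["output"],
                    "output_shapes": ["1,1000"],
                    # "check_tensors"/"check_shapes", when present, take
                    # precedence over output_tensors/output_shapes
                }
            ]
        }
    }
}

with open("some_model.yml", "w") as f:
    yaml.safe_dump(config, f)
```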
+# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import argparse + +import megengine._internal as mgb +import numpy as np +import yaml + + +# "1,3,224,224" -> (1,3,224,224) +def str2tuple(x): + x = x.split(",") + x = [int(a) for a in x] + x = tuple(x) + return x + + +def main(): + parser = argparse.ArgumentParser( + description="load a .pb model and convert to corresponding " + "load-and-run model" + ) + parser.add_argument("input", help="mace model file") + parser.add_argument("param", help="mace param file") + parser.add_argument( + "output", help="converted model that can be fed to dump_with_testcase_mge.py" + ) + parser.add_argument("config", help="config file with yaml format") + args = parser.parse_args() + + with open(args.config, "r") as f: + configs = yaml.load(f) + + for model_name in configs["models"]: + # ignore several sub models currently + sub_model = configs["models"][model_name]["subgraphs"][0] + + # input/output shapes + isizes = [str2tuple(x) for x in sub_model["input_shapes"]] + + # input/output names + input_names = sub_model["input_tensors"] + if "check_tensors" in sub_model: + output_names = sub_model["check_tensors"] + osizes = [str2tuple(x) for x in sub_model["check_shapes"]] + else: + output_names = sub_model["output_tensors"] + osizes = [str2tuple(x) for x in sub_model["output_shapes"]] + + with open(args.input, "rb") as fin: + raw_model = fin.read() + with open(args.param, "rb") as fin: + raw_param = fin.read() + + model_size = (len(raw_model)).to_bytes(4, byteorder="little") + param_size = (len(raw_param)).to_bytes(4, byteorder="little") + + n_inputs = (len(input_names)).to_bytes(4, byteorder="little") + n_outputs = (len(output_names)).to_bytes(4, byteorder="little") + + names_buffer = n_inputs + n_outputs + for iname in input_names: + names_buffer += (len(iname)).to_bytes(4, byteorder="little") + names_buffer += str.encode(iname) + for oname in output_names: + names_buffer += (len(oname)).to_bytes(4, byteorder="little") + names_buffer += str.encode(oname) + + shapes_buffer = n_outputs + for oshape in osizes: + shapes_buffer += (len(oshape)).to_bytes(4, byteorder="little") + for oi in oshape: + shapes_buffer += oi.to_bytes(4, byteorder="little") + + # raw content contains: + # input/output names + output shapes + model buffer + param buffer + wk_raw_content = ( + names_buffer + + shapes_buffer + + model_size + + raw_model + + param_size + + raw_param + ) + + # cn not ensured + cn = mgb.comp_node("xpux") + cg = mgb.comp_graph() + + inp = [ + mgb.make_shared( + comp_node=cn, + comp_graph=cg, + shape=isizes[i], + name=input_names[i], + dtype=np.float32, + ) + for i in range(len(isizes)) + ] + + oup = mgb.opr.extern_c_opr_placeholder( + inp, osizes, dump_name="mace", dump_data=wk_raw_content, + ) + + mgb.serialize_comp_graph_to_file(args.output, oup) + + +if __name__ == "__main__": + main() diff --git a/sdk/c-opr-loaders/mace/extern_c_opr.h b/sdk/c-opr-loaders/mace/extern_c_opr.h new file mode 120000 index 00000000..a957b9b1 --- /dev/null +++ b/sdk/c-opr-loaders/mace/extern_c_opr.h @@ -0,0 +1 @@ +../../../src/serialization/include/megbrain/serialization/extern_c_opr.h \ No newline at end of file diff --git a/sdk/c-opr-loaders/mace/mace_loader.cpp b/sdk/c-opr-loaders/mace/mace_loader.cpp new file mode 100644 index 00000000..7395a871 --- /dev/null +++ b/sdk/c-opr-loaders/mace/mace_loader.cpp @@ -0,0 
diff --git a/sdk/c-opr-loaders/mace/extern_c_opr.h b/sdk/c-opr-loaders/mace/extern_c_opr.h
new file mode 120000
index 00000000..a957b9b1
--- /dev/null
+++ b/sdk/c-opr-loaders/mace/extern_c_opr.h
@@ -0,0 +1 @@
+../../../src/serialization/include/megbrain/serialization/extern_c_opr.h
\ No newline at end of file
diff --git a/sdk/c-opr-loaders/mace/mace_loader.cpp b/sdk/c-opr-loaders/mace/mace_loader.cpp
new file mode 100644
index 00000000..7395a871
--- /dev/null
+++ b/sdk/c-opr-loaders/mace/mace_loader.cpp
@@ -0,0 +1,260 @@
+/**
+ * \file sdk/c-opr-loaders/mace/mace_loader.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <functional>
+#include <map>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "mace/public/mace.h"
+#include "extern_c_opr.h"
+
+#define ASSERT(x, msg)                                                       \
+    do {                                                                     \
+        if (!(x)) {                                                          \
+            printf("error at %s:%d %s\n", __FILE__, __LINE__, __FUNCTION__); \
+            printf("%s\n", msg);                                             \
+            __builtin_trap();                                                \
+        }                                                                    \
+    } while (0)
+
+class MGBOprDescImpl {
+    struct UserData {
+        std::shared_ptr<mace::MaceEngine> engine;
+        size_t nr_inputs, nr_outputs;
+        std::vector<std::vector<int64_t>> output_shapes;
+        std::vector<std::string> input_names, output_names;
+    };
+
+    static UserData* user_data(const MGBOprDesc* self) {
+        return static_cast<UserData*>(self->user_data);
+    }
+
+    static void release(MGBOprDesc* self) {
+        // free all data buffers
+        delete user_data(self);
+        delete self;
+    }
+
+    static size_t hash(const MGBOprDesc* self) {
+        return reinterpret_cast<size_t>(self);
+    }
+
+    static int is_same(const MGBOprDesc* self, const MGBOprDesc* rhs) {
+        return self == rhs;
+    }
+
+    static void infer_shape(const MGBOprDesc* self, const MGBTensorShape* input,
+                            MGBTensorShape* output) {
+        auto ud = user_data(self);
+
+        // infer output shapes from the shapes recorded in user data
+        for (size_t i = 0; i < ud->nr_outputs; i++) {
+            output[i].ndim = ud->output_shapes[i].size();
+            for (size_t j = 0; j < output[i].ndim; j++) {
+                output[i].shape[j] = ud->output_shapes[i][j];
+            }
+        }
+    }
+
+    static void infer_dtype(const MGBOprDesc*, const MGBDType* input,
+                            MGBDType* output) {
+        // only float32 input/output is supported
+        ASSERT(input[0] == MGB_DTYPE_FLOAT32, "Input dtype is not float32");
+        output[0] = MGB_DTYPE_FLOAT32;
+    }
+
+    static void execute(const MGBOprDesc* self, const MGBTensor* input,
+                        const MGBTensor* output) {
+        auto ud = user_data(self);
+
+        // create input and output tensor buffers
+        std::map<std::string, mace::MaceTensor> mace_inputs;
+        std::map<std::string, mace::MaceTensor> mace_outputs;
+
+        // default to NCHW; DATAFORMAT=NHWC switches the layout tag
+        auto mace_data_format = mace::DataFormat::NCHW;
+        char* data_format = getenv("DATAFORMAT");
+        if (data_format && !strcmp(data_format, "NHWC")) {
+            mace_data_format = mace::DataFormat::NHWC;
+        }
+
+        for (size_t i = 0; i < ud->nr_inputs; ++i) {
+            // allocate an input buffer and copy the MGB tensor into it
+            uint32_t ndim = input[i].layout.shape.ndim;
+            auto input_shape =
+                    std::vector<int64_t>(input[i].layout.shape.shape,
+                                         input[i].layout.shape.shape + ndim);
+
+            int64_t input_size =
+                    std::accumulate(input_shape.begin(), input_shape.end(), 1,
+                                    std::multiplies<int64_t>());
+            auto buffer_in = std::shared_ptr<float>(
+                    new float[input_size], std::default_delete<float[]>());
+            memcpy(buffer_in.get(), input[i].data, input_size * sizeof(float));
+            mace_inputs[ud->input_names[i]] =
+                    mace::MaceTensor(input_shape, buffer_in, mace_data_format);
+        }
+
+        for (size_t i = 0; i < ud->nr_outputs; ++i) {
+            // allocate an output buffer for MACE to write into
+            uint32_t ndim = output[i].layout.shape.ndim;
+            auto output_shape =
+                    std::vector<int64_t>(output[i].layout.shape.shape,
+                                         output[i].layout.shape.shape + ndim);
+
+            int64_t output_size =
+                    std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                    std::multiplies<int64_t>());
+            auto buffer_out = std::shared_ptr<float>(
+                    new float[output_size], std::default_delete<float[]>());
+            mace_outputs[ud->output_names[i]] =
+                    mace::MaceTensor(output_shape, buffer_out, mace_data_format);
+        }
+
+        // run the model
+        auto status = (ud->engine)->Run(mace_inputs, &mace_outputs);
+        ASSERT(status == mace::MaceStatus::MACE_SUCCESS,
+               "Error in running mace engine");
+
+        // copy the computed outputs back into the MGB tensors; iterate over
+        // output_names so the order matches MGB's outputs instead of the
+        // lexicographic order of the std::map
+        for (size_t i = 0; i < ud->nr_outputs; ++i) {
+            auto& out = mace_outputs.at(ud->output_names[i]);
+            int64_t output_size =
+                    std::accumulate(out.shape().begin(), out.shape().end(),
+                                    int64_t(1), std::multiplies<int64_t>());
+            memcpy(output[i].data, out.data().get(),
+                   output_size * sizeof(float));
+        }
+    }
+
+public:
+    static MGBOprDesc* make(size_t nr_input, const void* buf, size_t buf_len) {
+        auto ud = std::make_unique<UserData>();
+
+        std::shared_ptr<mace::MaceEngine> engine;
+        mace::DeviceType device_type;
+
+        // RUNTIME=GPU selects the GPU runtime; anything else falls back to CPU
+        char* runtime_mode = getenv("RUNTIME");
+        if (runtime_mode && !strcmp(runtime_mode, "GPU")) {
+            device_type = mace::DeviceType::GPU;
+        } else {
+            device_type = mace::DeviceType::CPU;
+        }
+        mace::MaceEngineConfig config(device_type);
+
+        // set up the GPU context, mainly the opencl path
+        if (device_type == mace::DeviceType::GPU) {
+            std::shared_ptr<mace::GPUContext> gpu_context;
+
+            char* opencl_path = getenv("OPENCLPATH");
+            ASSERT(opencl_path, "Please set the opencl library path");
+            std::string storage_path(opencl_path);
+            gpu_context = mace::GPUContextBuilder()
+                                  .SetStoragePath(storage_path)
+                                  .Finalize();
+
+            config.SetGPUContext(gpu_context);
+            config.SetGPUHints(mace::GPUPerfHint::PERF_HIGH,
+                               mace::GPUPriorityHint::PRIORITY_HIGH);
+        }
+
+        std::vector<std::string> input_names, output_names;
+
+        // extract all information from buf; the payload packed by
+        // dump_model.py is laid out as:
+        //   n_inputs(u32) n_outputs(u32)
+        //   {len(u32) name} * n_inputs, {len(u32) name} * n_outputs
+        //   n_outputs(u32) {ndim(u32) dim(u32) * ndim} * n_outputs
+        //   model_len(u32) model, param_len(u32) param
+        // all integers are little-endian
+
+        void* buffer = const_cast<void*>(buf);
+
+        ud->nr_inputs = *reinterpret_cast<uint32_t*>(buffer);
+        ud->nr_outputs = *(reinterpret_cast<uint32_t*>(buffer) + 1);
+
+        // interpret input names
+        char* name_buf = reinterpret_cast<char*>(buffer) + 8;
+        for (size_t i = 0; i < ud->nr_inputs; i++) {
+            size_t ilen = *reinterpret_cast<uint32_t*>(name_buf);
+            input_names.push_back(std::string(name_buf + 4, ilen));
+            name_buf += (ilen + 4);
+        }
+
+        // interpret output names
+        for (size_t i = 0; i < ud->nr_outputs; i++) {
+            size_t olen = *reinterpret_cast<uint32_t*>(name_buf);
+            output_names.push_back(std::string(name_buf + 4, olen));
+            name_buf += (olen + 4);
+        }
+
+        ud->input_names = input_names;
+        ud->output_names = output_names;
+
+        // interpret output shapes; the leading word repeats n_outputs, skip it
+        uint32_t* shape_buf = reinterpret_cast<uint32_t*>(name_buf) + 1;
+        for (size_t i = 0; i < ud->nr_outputs; i++) {
+            size_t olen = *shape_buf;
+            ud->output_shapes.push_back(
+                    std::vector<int64_t>(shape_buf + 1, shape_buf + olen + 1));
+            shape_buf += (olen + 1);
+        }
+
+        const size_t model_buf_len = *shape_buf;
+        unsigned char* model_buf =
+                reinterpret_cast<unsigned char*>(shape_buf) + 4;
+
+        const size_t param_buf_len =
+                *reinterpret_cast<uint32_t*>(model_buf + model_buf_len);
+        unsigned char* param_buf = model_buf + model_buf_len + 4;
+
+        // create the mace engine
+        auto create_engine_status = mace::CreateMaceEngineFromProto(
+                model_buf, model_buf_len, param_buf, param_buf_len,
+                input_names, output_names, config, &engine);
+        ASSERT(create_engine_status == mace::MaceStatus::MACE_SUCCESS,
+               "Error in creating mace engine");
+
+        ud->engine = engine;
+
+        auto ret = std::make_unique<MGBOprDesc>();
+        mgb_init_opr_desc(ret.get(), ud->nr_outputs, "mace");
+#define a(n) ret->n = &n;
+        MGB_OPR_DESC_FOREACH_MEM_FN(a);
+        a(infer_dtype);
+#undef a
+        ret->user_data = ud.release();
+        return ret.release();
+    }
+};
+
+class MGBOprLoaderImpl {
+    static MGBOprDesc* create_desc(size_t nr_input, const void* buf,
+                                   size_t buf_len) {
+        return MGBOprDescImpl::make(nr_input, buf, buf_len);
+    }
+
+public:
+    static MGBOprLoader make() { return {"mace", create_desc}; }
+};
+
+extern "C" {
+
+// public interface: called when the loader library is loaded
+__attribute__((visibility("default"))) void MGB_C_OPR_INIT_FUNC(
+        const MGBExternCOprApi* (*get_api)(int)) {
+    const MGBExternCOprApi* api = get_api(MGB_EXTERN_C_OPR_VERSION);
+    ASSERT(api, "Failed to get extern c opr api (version mismatch?)");
+    MGBOprLoader loader = MGBOprLoaderImpl::make();
+    api->register_loader(&loader);
+}
+
+}  // extern "C"
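A closing note on `DATAFORMAT`: the loader copies buffers verbatim and only tags each MACE tensor with the chosen layout (see `execute` above), so the tag must match how the data is actually laid out in memory. For readers unfamiliar with the two layouts, an illustrative numpy snippet with arbitrary example shapes:

```python
import numpy as np

# one 3-channel 224x224 image in NCHW layout
x_nchw = np.random.rand(1, 3, 224, 224).astype(np.float32)

# the same data reordered to NHWC: channels move to the last axis
x_nhwc = np.transpose(x_nchw, (0, 2, 3, 1))
assert x_nhwc.shape == (1, 224, 224, 3)
```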