From 4e0c9ad3c669489a95c53dc63ef7efafd959b65a Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Mon, 8 Jun 2020 06:35:41 +0000
Subject: [PATCH] feat(mgb/external): extern-c-opr dumper and loader for MACE

GitOrigin-RevId: bfe5420b1de97eae9ee943c4d3adaf652d795e03
---
 ACKNOWLEDGMENTS                        |   2 +
 sdk/c-opr-loaders/mace/Makefile        |  43 ++++++
 sdk/c-opr-loaders/mace/README.md       |  63 ++++++++
 sdk/c-opr-loaders/mace/dump_model.py   | 115 +++++++++++++++
 sdk/c-opr-loaders/mace/extern_c_opr.h  |   1 +
 sdk/c-opr-loaders/mace/mace_loader.cpp | 260 +++++++++++++++++++++++++++++++++
 6 files changed, 484 insertions(+)
 create mode 100644 sdk/c-opr-loaders/mace/Makefile
 create mode 100644 sdk/c-opr-loaders/mace/README.md
 create mode 100644 sdk/c-opr-loaders/mace/dump_model.py
 create mode 120000 sdk/c-opr-loaders/mace/extern_c_opr.h
 create mode 100644 sdk/c-opr-loaders/mace/mace_loader.cpp

diff --git a/ACKNOWLEDGMENTS b/ACKNOWLEDGMENTS
index f2a476b9..3045a7f5 100644
--- a/ACKNOWLEDGMENTS
+++ b/ACKNOWLEDGMENTS
@@ -727,6 +727,8 @@ This software has been modified by Megvii Inc.
 
 4. FlatBuffers
 Copyright 2014 Google Inc. All rights reserved.
 
+5. MACE
+Copyright 2018 Xiaomi Inc. All rights reserved.
 
 Terms of Apache License Version 2.0
diff --git a/sdk/c-opr-loaders/mace/Makefile b/sdk/c-opr-loaders/mace/Makefile
new file mode 100644
index 00000000..c054cab7
--- /dev/null
+++ b/sdk/c-opr-loaders/mace/Makefile
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+ifeq ($(SDKPATH),)
+	CURDIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+	SDKPATH = $(CURDIR)/arm64-v8a
+endif
+
+TARGET := libmace_loader.so
+all: $(TARGET)
+
+CXX = aarch64-linux-android-g++
+
+CXXFLAGS = -std=c++14 -I$(SDKPATH)/include \
+	-Dmgb_mace_loader_init=mgb_c_opr_init
+
+DEBUG ?= 0
+ifeq ($(DEBUG), 1)
+	CXXFLAGS += -O0 -g
+else
+	CXXFLAGS += -O2
+endif
+
+LDFLAGS = -L$(SDKPATH)/lib \
+	-lmace_static -lcore -lgenerated_version \
+	-lport_android -lport_linux_base -lport_posix -lport_base \
+	-lproto -lutils -lgenerated_opencl_kernel \
+	-lops -lcore -lprotobuf-lite -llog \
+	-lpthread -l:rpcmem.a
+
+LDLIBS = -shared -fPIC
+
+$(TARGET): mace_loader.cpp extern_c_opr.h
+	$(CXX) $(CXXFLAGS) $(LDLIBS) -o $@ $< $(LDFLAGS)
+
+clean:
+	@rm -f $(TARGET)
diff --git a/sdk/c-opr-loaders/mace/README.md b/sdk/c-opr-loaders/mace/README.md
new file mode 100644
index 00000000..fdf5e581
--- /dev/null
+++ b/sdk/c-opr-loaders/mace/README.md
@@ -0,0 +1,63 @@
+# Extern-C-Opr with MACE
+
+### Build MegEngine `load_and_run` for arm64-v8a
+
+```bash
+cd $MEGENGINE_HOME
+./scripts/cmake-build/cross_build_android_arm_inference.sh -a arm64-v8a
+```
+
+After a successful build, `load_and_run` should be in `$MEGENGINE_HOME/build_dir/android/arm64-v8a/Release/install/bin`.
+
+### Build MACE libraries for arm64-v8a with GPU runtime
+
+```bash
+cd $MACE_HOME
+RUNTIME=GPU bash tools/cmake/cmake-build-arm64-v8a.sh
+
+cp -r $MACE_HOME/build/cmake-build/arm64-v8a/install $MEGENGINE_HOME/sdk/c-opr-loaders/mace/arm64-v8a
+```
+
+### Build the MACE loader for MegEngine
+
+```
+SDKPATH=/path/to/mace-sdk make
+```
+
+If `SDKPATH` is not set, it defaults to `./arm64-v8a`.
+
+You can build in debug mode (by adding `DEBUG=1` to the make command), which prints more information at runtime.
+
+### Prepare a MACE model (for example, resnet_50) and wrap it with a MegEngine extern c opr
+
+```
+python3 dump_model.py path/to/resnet_50.pb path/to/resnet_50.data path/to/resnet_50.mdl path/to/resnet_50.yml
+```
+
+The `*.pb` file holds the model structure; the `*.data` file holds the model parameters.
+
+Check [here](https://github.com/XiaoMi/mace-models) to learn how to write yml files for MACE.
+
+### Run with load-and-run
+
+First of all, send all the files to the target device:
+
+- load_and_run
+- resnet_50.mdl
+- libmace_loader.so
+- the OpenCL library (something like libOpenCL.so, libmali.so or libEGL.so ...) if you want to run on GPU
+
+```
+RUNTIME=GPU OPENCLPATH=/path/to/opencl DATAFORMAT=NCHW /path/to/load_and_run /path/to/resnet_50.mdl --c-opr-lib /path/to/libmace_loader.so
+```
+
+RUNTIME candidates:
+
+- CPU
+- GPU
+
+Running with the GPU runtime on Android needs an OpenCL library; set its path with the `OPENCLPATH` environment variable.
+
+We mainly use the NCHW data format; if you have an NHWC model, set `DATAFORMAT=NHWC` in the environment.
+
+If you want to run with the HEXAGON runtime, more effort is needed; please check [here](https://mace.readthedocs.io/en/latest/faq.html#why-is-mace-not-working-on-dsp).
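The yml consumed by `dump_model.py` (next file) needs only a small subset of MACE's deployment-file schema: for each model, the first entry of `subgraphs` must carry `input_tensors`/`input_shapes` and either `check_tensors`/`check_shapes` or `output_tensors`/`output_shapes`. Below is a minimal sketch that writes such a config with Python; the model name, tensor names and shapes are illustrative placeholders, not values from a real resnet_50 deployment file.

```python
import yaml

# minimal structure read by dump_model.py; any other field of a full
# MACE deployment file is ignored by the script
config = {
    "models": {
        "some_model": {  # hypothetical model name
            "subgraphs": [
                {
                    "input_tensors": ["input"],       # tensor names in the .pb
                    "input_shapes": ["1,3,224,224"],  # comma-separated dims
                    "output_tensors": ["output"],
                    "output_shapes": ["1,1000"],
                    # "check_tensors"/"check_shapes", when present, take
                    # precedence over output_tensors/output_shapes
                }
            ]
        }
    }
}

with open("some_model.yml", "w") as f:
    yaml.safe_dump(config, f)
```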
+# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import argparse + +import megengine._internal as mgb +import numpy as np +import yaml + + +# "1,3,224,224" -> (1,3,224,224) +def str2tuple(x): + x = x.split(",") + x = [int(a) for a in x] + x = tuple(x) + return x + + +def main(): + parser = argparse.ArgumentParser( + description="load a .pb model and convert to corresponding " + "load-and-run model" + ) + parser.add_argument("input", help="mace model file") + parser.add_argument("param", help="mace param file") + parser.add_argument( + "output", help="converted model that can be fed to dump_with_testcase_mge.py" + ) + parser.add_argument("config", help="config file with yaml format") + args = parser.parse_args() + + with open(args.config, "r") as f: + configs = yaml.load(f) + + for model_name in configs["models"]: + # ignore several sub models currently + sub_model = configs["models"][model_name]["subgraphs"][0] + + # input/output shapes + isizes = [str2tuple(x) for x in sub_model["input_shapes"]] + + # input/output names + input_names = sub_model["input_tensors"] + if "check_tensors" in sub_model: + output_names = sub_model["check_tensors"] + osizes = [str2tuple(x) for x in sub_model["check_shapes"]] + else: + output_names = sub_model["output_tensors"] + osizes = [str2tuple(x) for x in sub_model["output_shapes"]] + + with open(args.input, "rb") as fin: + raw_model = fin.read() + with open(args.param, "rb") as fin: + raw_param = fin.read() + + model_size = (len(raw_model)).to_bytes(4, byteorder="little") + param_size = (len(raw_param)).to_bytes(4, byteorder="little") + + n_inputs = (len(input_names)).to_bytes(4, byteorder="little") + n_outputs = (len(output_names)).to_bytes(4, byteorder="little") + + names_buffer = n_inputs + n_outputs + for iname in input_names: + names_buffer += (len(iname)).to_bytes(4, byteorder="little") + names_buffer += str.encode(iname) + for oname in output_names: + names_buffer += (len(oname)).to_bytes(4, byteorder="little") + names_buffer += str.encode(oname) + + shapes_buffer = n_outputs + for oshape in osizes: + shapes_buffer += (len(oshape)).to_bytes(4, byteorder="little") + for oi in oshape: + shapes_buffer += oi.to_bytes(4, byteorder="little") + + # raw content contains: + # input/output names + output shapes + model buffer + param buffer + wk_raw_content = ( + names_buffer + + shapes_buffer + + model_size + + raw_model + + param_size + + raw_param + ) + + # cn not ensured + cn = mgb.comp_node("xpux") + cg = mgb.comp_graph() + + inp = [ + mgb.make_shared( + comp_node=cn, + comp_graph=cg, + shape=isizes[i], + name=input_names[i], + dtype=np.float32, + ) + for i in range(len(isizes)) + ] + + oup = mgb.opr.extern_c_opr_placeholder( + inp, osizes, dump_name="mace", dump_data=wk_raw_content, + ) + + mgb.serialize_comp_graph_to_file(args.output, oup) + + +if __name__ == "__main__": + main() diff --git a/sdk/c-opr-loaders/mace/extern_c_opr.h b/sdk/c-opr-loaders/mace/extern_c_opr.h new file mode 120000 index 00000000..a957b9b1 --- /dev/null +++ b/sdk/c-opr-loaders/mace/extern_c_opr.h @@ -0,0 +1 @@ +../../../src/serialization/include/megbrain/serialization/extern_c_opr.h \ No newline at end of file diff --git a/sdk/c-opr-loaders/mace/mace_loader.cpp b/sdk/c-opr-loaders/mace/mace_loader.cpp new file mode 100644 index 00000000..7395a871 --- /dev/null +++ b/sdk/c-opr-loaders/mace/mace_loader.cpp @@ -0,0 
diff --git a/sdk/c-opr-loaders/mace/extern_c_opr.h b/sdk/c-opr-loaders/mace/extern_c_opr.h
new file mode 120000
index 00000000..a957b9b1
--- /dev/null
+++ b/sdk/c-opr-loaders/mace/extern_c_opr.h
@@ -0,0 +1 @@
+../../../src/serialization/include/megbrain/serialization/extern_c_opr.h
\ No newline at end of file
diff --git a/sdk/c-opr-loaders/mace/mace_loader.cpp b/sdk/c-opr-loaders/mace/mace_loader.cpp
new file mode 100644
index 00000000..7395a871
--- /dev/null
+++ b/sdk/c-opr-loaders/mace/mace_loader.cpp
@@ -0,0 +1,260 @@
+/**
+ * \file sdk/c-opr-loaders/mace/mace_loader.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <functional>
+#include <map>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "mace/public/mace.h"
+#include "extern_c_opr.h"
+
+#define ASSERT(x, msg)                                                       \
+    do {                                                                     \
+        if (!(x)) {                                                          \
+            printf("error at %s:%d %s\n", __FILE__, __LINE__, __FUNCTION__); \
+            printf("%s\n", msg);                                             \
+            __builtin_trap();                                                \
+        }                                                                    \
+    } while (0)
+
+class MGBOprDescImpl {
+    struct UserData {
+        std::shared_ptr<mace::MaceEngine> engine;
+        size_t nr_inputs, nr_outputs;
+        std::vector<std::vector<int64_t>> output_shapes;
+        std::vector<std::string> input_names, output_names;
+    };
+
+    static UserData* user_data(const MGBOprDesc* self) {
+        return static_cast<UserData*>(self->user_data);
+    }
+
+    static void release(MGBOprDesc* self) {
+        // free all data buffers
+        delete user_data(self);
+        delete self;
+    }
+
+    static size_t hash(const MGBOprDesc* self) {
+        return reinterpret_cast<size_t>(self);
+    }
+
+    static int is_same(const MGBOprDesc* self, const MGBOprDesc* rhs) {
+        return self == rhs;
+    }
+
+    static void infer_shape(const MGBOprDesc* self, const MGBTensorShape* input,
+                            MGBTensorShape* output) {
+        auto ud = user_data(self);
+
+        // infer output shapes from the shapes recorded in user data
+        for (size_t i = 0; i < ud->nr_outputs; i++) {
+            output[i].ndim = ud->output_shapes[i].size();
+            for (size_t j = 0; j < output[i].ndim; j++) {
+                output[i].shape[j] = ud->output_shapes[i][j];
+            }
+        }
+    }
+
+    static void infer_dtype(const MGBOprDesc*, const MGBDType* input,
+                            MGBDType* output) {
+        // only float32 input/output is supported
+        ASSERT(input[0] == MGB_DTYPE_FLOAT32, "Input dtype is not float32");
+        output[0] = MGB_DTYPE_FLOAT32;
+    }
+
+    static void execute(const MGBOprDesc* self, const MGBTensor* input,
+                        const MGBTensor* output) {
+        auto ud = user_data(self);
+
+        // create input and output tensor buffers
+        std::map<std::string, mace::MaceTensor> mace_inputs;
+        std::map<std::string, mace::MaceTensor> mace_outputs;
+
+        // default to NCHW; DATAFORMAT=NHWC switches the layout tag
+        auto mace_data_format = mace::DataFormat::NCHW;
+        char* data_format = getenv("DATAFORMAT");
+        if (data_format && !strcmp(data_format, "NHWC")) {
+            mace_data_format = mace::DataFormat::NHWC;
+        }
+
+        for (size_t i = 0; i < ud->nr_inputs; ++i) {
+            // allocate an input buffer and copy the MGB tensor into it
+            uint32_t ndim = input[i].layout.shape.ndim;
+            auto input_shape =
+                    std::vector<int64_t>(input[i].layout.shape.shape,
+                                         input[i].layout.shape.shape + ndim);
+
+            int64_t input_size =
+                    std::accumulate(input_shape.begin(), input_shape.end(), 1,
+                                    std::multiplies<int64_t>());
+            auto buffer_in = std::shared_ptr<float>(
+                    new float[input_size], std::default_delete<float[]>());
+            memcpy(buffer_in.get(), input[i].data, input_size * sizeof(float));
+            mace_inputs[ud->input_names[i]] =
+                    mace::MaceTensor(input_shape, buffer_in, mace_data_format);
+        }
+
+        for (size_t i = 0; i < ud->nr_outputs; ++i) {
+            // allocate an output buffer for MACE to write into
+            uint32_t ndim = output[i].layout.shape.ndim;
+            auto output_shape =
+                    std::vector<int64_t>(output[i].layout.shape.shape,
+                                         output[i].layout.shape.shape + ndim);
+
+            int64_t output_size =
+                    std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                    std::multiplies<int64_t>());
+            auto buffer_out = std::shared_ptr<float>(
+                    new float[output_size], std::default_delete<float[]>());
+            mace_outputs[ud->output_names[i]] =
+                    mace::MaceTensor(output_shape, buffer_out, mace_data_format);
+        }
+
+        // run the model
+        auto status = (ud->engine)->Run(mace_inputs, &mace_outputs);
+        ASSERT(status == mace::MaceStatus::MACE_SUCCESS,
+               "Error in running mace engine");
+
+        // copy the computed outputs back into the MGB tensors; iterate over
+        // output_names so the order matches MGB's outputs instead of the
+        // lexicographic order of the std::map
+        for (size_t i = 0; i < ud->nr_outputs; ++i) {
+            auto& out = mace_outputs.at(ud->output_names[i]);
+            int64_t output_size =
+                    std::accumulate(out.shape().begin(), out.shape().end(),
+                                    int64_t(1), std::multiplies<int64_t>());
+            memcpy(output[i].data, out.data().get(),
+                   output_size * sizeof(float));
+        }
+    }
+
+public:
+    static MGBOprDesc* make(size_t nr_input, const void* buf, size_t buf_len) {
+        auto ud = std::make_unique<UserData>();
+
+        std::shared_ptr<mace::MaceEngine> engine;
+        mace::DeviceType device_type;
+
+        // RUNTIME=GPU selects the GPU runtime; anything else falls back to CPU
+        char* runtime_mode = getenv("RUNTIME");
+        if (runtime_mode && !strcmp(runtime_mode, "GPU")) {
+            device_type = mace::DeviceType::GPU;
+        } else {
+            device_type = mace::DeviceType::CPU;
+        }
+        mace::MaceEngineConfig config(device_type);
+
+        // set up the GPU context, mainly the opencl path
+        if (device_type == mace::DeviceType::GPU) {
+            std::shared_ptr<mace::GPUContext> gpu_context;
+
+            char* opencl_path = getenv("OPENCLPATH");
+            ASSERT(opencl_path, "Please set the opencl library path");
+            std::string storage_path(opencl_path);
+            gpu_context = mace::GPUContextBuilder()
+                                  .SetStoragePath(storage_path)
+                                  .Finalize();
+
+            config.SetGPUContext(gpu_context);
+            config.SetGPUHints(mace::GPUPerfHint::PERF_HIGH,
+                               mace::GPUPriorityHint::PRIORITY_HIGH);
+        }
+
+        std::vector<std::string> input_names, output_names;
+
+        // extract all information from buf; the payload packed by
+        // dump_model.py is laid out as:
+        //   n_inputs(u32) n_outputs(u32)
+        //   {len(u32) name} * n_inputs, {len(u32) name} * n_outputs
+        //   n_outputs(u32) {ndim(u32) dim(u32) * ndim} * n_outputs
+        //   model_len(u32) model, param_len(u32) param
+        // all integers are little-endian
+
+        void* buffer = const_cast<void*>(buf);
+
+        ud->nr_inputs = *reinterpret_cast<uint32_t*>(buffer);
+        ud->nr_outputs = *(reinterpret_cast<uint32_t*>(buffer) + 1);
+
+        // interpret input names
+        char* name_buf = reinterpret_cast<char*>(buffer) + 8;
+        for (size_t i = 0; i < ud->nr_inputs; i++) {
+            size_t ilen = *reinterpret_cast<uint32_t*>(name_buf);
+            input_names.push_back(std::string(name_buf + 4, ilen));
+            name_buf += (ilen + 4);
+        }
+
+        // interpret output names
+        for (size_t i = 0; i < ud->nr_outputs; i++) {
+            size_t olen = *reinterpret_cast<uint32_t*>(name_buf);
+            output_names.push_back(std::string(name_buf + 4, olen));
+            name_buf += (olen + 4);
+        }
+
+        ud->input_names = input_names;
+        ud->output_names = output_names;
+
+        // interpret output shapes; the leading word repeats n_outputs, skip it
+        uint32_t* shape_buf = reinterpret_cast<uint32_t*>(name_buf) + 1;
+        for (size_t i = 0; i < ud->nr_outputs; i++) {
+            size_t olen = *shape_buf;
+            ud->output_shapes.push_back(
+                    std::vector<int64_t>(shape_buf + 1, shape_buf + olen + 1));
+            shape_buf += (olen + 1);
+        }
+
+        const size_t model_buf_len = *shape_buf;
+        unsigned char* model_buf =
+                reinterpret_cast<unsigned char*>(shape_buf) + 4;
+
+        const size_t param_buf_len =
+                *reinterpret_cast<uint32_t*>(model_buf + model_buf_len);
+        unsigned char* param_buf = model_buf + model_buf_len + 4;
+
+        // create the mace engine
+        auto create_engine_status = mace::CreateMaceEngineFromProto(
+                model_buf, model_buf_len, param_buf, param_buf_len,
+                input_names, output_names, config, &engine);
+        ASSERT(create_engine_status == mace::MaceStatus::MACE_SUCCESS,
+               "Error in creating mace engine");
+
+        ud->engine = engine;
+
+        auto ret = std::make_unique<MGBOprDesc>();
+        mgb_init_opr_desc(ret.get(), ud->nr_outputs, "mace");
+#define a(n) ret->n = &n;
+        MGB_OPR_DESC_FOREACH_MEM_FN(a);
+        a(infer_dtype);
+#undef a
+        ret->user_data = ud.release();
+        return ret.release();
+    }
+};
+
+class MGBOprLoaderImpl {
+    static MGBOprDesc* create_desc(size_t nr_input, const void* buf,
+                                   size_t buf_len) {
+        return MGBOprDescImpl::make(nr_input, buf, buf_len);
+    }
+
+public:
+    static MGBOprLoader make() { return {"mace", create_desc}; }
+};
+
+extern "C" {
+
+// public interface: called when the loader library is loaded
+__attribute__((visibility("default"))) void MGB_C_OPR_INIT_FUNC(
+        const MGBExternCOprApi* (*get_api)(int)) {
+    const MGBExternCOprApi* api = get_api(MGB_EXTERN_C_OPR_VERSION);
+    ASSERT(api, "Failed to get extern c opr api (version mismatch?)");
+    MGBOprLoader loader = MGBOprLoaderImpl::make();
+    api->register_loader(&loader);
+}
+
+}  // extern "C"
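A closing note on `DATAFORMAT`: the loader copies buffers verbatim and only tags each MACE tensor with the chosen layout (see `execute` above), so the tag must match how the data is actually laid out in memory. For readers unfamiliar with the two layouts, an illustrative numpy snippet with arbitrary example shapes:

```python
import numpy as np

# one 3-channel 224x224 image in NCHW layout
x_nchw = np.random.rand(1, 3, 224, 224).astype(np.float32)

# the same data reordered to NHWC: channels move to the last axis
x_nhwc = np.transpose(x_nchw, (0, 2, 3, 1))
assert x_nhwc.shape == (1, 224, 224, 3)
```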