From b8f810ee31bdd6139e977e5cc4fcfc1c7857f7c8 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Mon, 25 Oct 2021 19:57:49 +0800
Subject: [PATCH] feat(megbrain): add mc20 runtime opr

GitOrigin-RevId: 2ee07b213bbdf465621842b072f3b450165c286a
---
 src/core/impl/comp_node/comp_node.cpp          |  10 +-
 src/opr/impl/mc20_runtime_op.cpp               | 379 +++++++++++++++++++++++++
 src/opr/impl/mc20_runtime_op.oprdecl           |  17 ++
 src/opr/impl/mc20_runtime_op.sereg.h           |  72 +++++
 src/opr/include/megbrain/opr/mc20_runtime_op.h |  83 ++++++
 5 files changed, 556 insertions(+), 5 deletions(-)
 create mode 100644 src/opr/impl/mc20_runtime_op.cpp
 create mode 100644 src/opr/impl/mc20_runtime_op.oprdecl
 create mode 100644 src/opr/impl/mc20_runtime_op.sereg.h
 create mode 100644 src/opr/include/megbrain/opr/mc20_runtime_op.h

diff --git a/src/core/impl/comp_node/comp_node.cpp b/src/core/impl/comp_node/comp_node.cpp
index c37c6247..af2d13df 100644
--- a/src/core/impl/comp_node/comp_node.cpp
+++ b/src/core/impl/comp_node/comp_node.cpp
@@ -173,11 +173,11 @@ CompNode::Locator CompNode::Locator::parse(const std::string& id) {
         dev_type = DeviceType::CAMBRICON;
         ptr += 9;
     } else if (ptr[0] == 'm') {
-            if (strncmp(ptr, "multithread", 11)) {
-                err();
-            }
-            dev_type = DeviceType::MULTITHREAD;
-            ptr += 11;
+        if (strncmp(ptr, "multithread", 11)) {
+            err();
+        }
+        dev_type = DeviceType::MULTITHREAD;
+        ptr += 11;
     } else {
         if (ptr[1] != 'p' || ptr[2] != 'u') {
diff --git a/src/opr/impl/mc20_runtime_op.cpp b/src/opr/impl/mc20_runtime_op.cpp
new file mode 100644
index 00000000..79ce214f
--- /dev/null
+++ b/src/opr/impl/mc20_runtime_op.cpp
@@ -0,0 +1,379 @@
+/**
+ * \file src/opr/impl/mc20_runtime_op.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+
+#include "megbrain/opr/mc20_runtime_op.h"
+#include "megbrain/common.h"
+#include "megbrain/graph/event.h"
+#include "megdnn/dtype.h"
+
+#include <memory>
+#include <type_traits>
+
+#if MGB_MC20
+
+using namespace mgb;
+using namespace opr;
+
+namespace {
+TensorShape mc20_shape_to_mgb_shape(AX_NPU_SDK_EX_TENSOR_META_T tensor_meta) {
+    TensorShape ret;
+    ret.ndim = tensor_meta.nShapeNDim;
+    for (size_t i = 0; i < ret.ndim; ++i) {
+        ret[i] = tensor_meta.pShape[i];
+    }
+    return ret;
+}
+DType mc20_dtype_to_mgb_dtype(AX_NPU_SDK_EX_ADV_TENSOR_DTYPE data_type) {
+    switch (data_type) {
+        case AX_NPU_TDT_UINT8:
+            return dtype::Uint8();
+        case AX_NPU_TDT_FLOAT32:
+            return dtype::Float32();
+        case AX_NPU_TDT_INT16:
+            return dtype::Int16();
+        case AX_NPU_TDT_INT32:
+            return dtype::Int32();
+        default:
+            mgb_throw(
+                    MegBrainError, "MC20DataType %d is not supported by MegBrain.",
+                    static_cast<int>(data_type));
+    }
+}
+
+}  // namespace
+
+constexpr AX_NPU_SDK_EX_HANDLE_T MC20RuntimeOpr::INVALID_MODEL_HANDLE;
+
+/* ====================== MC20RuntimeOpr ==================== */
+MGB_DYN_TYPE_OBJ_FINAL_IMPL(MC20RuntimeOpr);
+MC20RuntimeOpr::MC20RuntimeOpr(
+        SharedBuffer buf, AX_NPU_SDK_EX_HANDLE_T model_handle,
+        const VarNodeArray& inputs, const OperatorNodeConfig& config)
+        : Super(inputs[0]->owner_graph(), config, "mc20_runtime", inputs),
+          m_buffer{std::move(buf)},
+          m_model_handle(model_handle) {
+    mgb_assert(
+            inputs[0]->comp_node().device_type() == CompNode::DeviceType::MC20,
+            "MC20RuntimeOpr can only be used on mc20 comp node; "
+            "got %s",
+            inputs[0]->comp_node().to_string().c_str());
+
+    for (auto i : inputs) {
+        add_input({i});
+    }
+    if (m_model_handle == INVALID_MODEL_HANDLE) {
+        MGB_MC20_CHECK(AX_NPU_SDK_EX_Create_handle(
+                &m_model_handle, m_buffer.data(), m_buffer.size()));
+        m_is_model_holder = true;
+    }
+
+    const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
+            AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
+
+    size_t nr_outputs = io_info->nOutputSize;
+    bool has_workspace = false;
+    if (nr_outputs == 1) {
+        const auto& tensor_meta = *(io_info->pOutputs[0].pTensorMeta);
+        add_output(std::string(reinterpret_cast<const char*>(tensor_meta.pName)));
+        if (tensor_meta.eMemoryType == AX_NPU_MT_VIRTUAL) {
+            mgb_assert(tensor_meta.nInnerSize > 0);
+            has_workspace = true;
+        }
+
+    } else {
+        for (size_t i = 0; i < nr_outputs; ++i) {
+            const auto& tensor_meta = *(io_info->pOutputs[i].pTensorMeta);
+            add_output(std::string(reinterpret_cast<const char*>(tensor_meta.pName)));
+            if (tensor_meta.eMemoryType == AX_NPU_MT_VIRTUAL) {
+                mgb_assert(tensor_meta.nInnerSize > 0);
+                has_workspace = true;
+            }
+        }
+    }
+    mgb_assert(has_workspace, "Currently only support model with cpu tail");
+
+    //! \warning There is no interface in MC20 to query the batch size of a
+    //! model. MC20 supports multi-batch by changing an n-batch input into n
+    //! 1-batch inputs.
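+    //!
+    //! For example (numbers are illustrative only): a .neu model compiled from
+    //! a single-input network at batch 4 exposes 4 one-batch inputs, so with
+    //! one megbrain input var m_model_batch below becomes 4 and execute_mc20()
+    //! walks the graph batch in strides of 4.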
+    mgb_assert(
+            io_info->nInputSize % inputs.size() == 0,
+            "The number of inputs in the neu model should be a multiple of "
+            "the number of inputs in megbrain, but got %zu(neu model) vs "
+            "%zu(mgb model)",
+            io_info->nInputSize, inputs.size());
+    m_model_batch = io_info->nInputSize / inputs.size();
+
+    add_equivalence_component<ScalarHash<const void*>>(m_buffer.data());
+    cg::add_workspace_output(this);
+}
+
+MC20RuntimeOpr::~MC20RuntimeOpr() {
+    if (m_is_model_holder) {
+        MGB_MC20_CHECK(AX_NPU_SDK_EX_Destroy_handle(m_model_handle));
+    }
+}
+
+void MC20RuntimeOpr::execute_mc20() {
+    auto&& mc20_env = CompNodeEnv::from_comp_node(input(0)->comp_node()).mc20_env();
+    mc20_env.activate();
+
+    const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
+            AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
+
+    AX_NPU_SDK_EX_IO_T npu_io;
+    memset(&npu_io, 0, sizeof(npu_io));
+    size_t batch_size = input(0)->dev_tensor().layout().shape[0];
+    for (size_t batch_idx = 0; batch_idx < batch_size; batch_idx += m_model_batch) {
+        //! prepare input
+        npu_io.nInputSize = io_info->nInputSize;
+        // allocate SDK io buffers; the element type is taken from the pInputs field
+        auto inputs = std::make_unique<std::remove_pointer_t<decltype(npu_io.pInputs)>[]>(
+                npu_io.nInputSize);
+        npu_io.pInputs = inputs.get();
+        for (size_t i = 0; i < npu_io.nInputSize; i++) {
+            // get input addr info
+            size_t inp_idx = i / m_model_batch;
+            AX_VOID* p_virtual_addr = input(inp_idx)->dev_tensor().raw_ptr();
+            AX_U64 phy_addr = MC20MemoryManager::Instance().get_phyaddr(p_virtual_addr);
+            auto nr_bytes_per_batch =
+                    input(inp_idx)->layout().span().dist_byte() / batch_size;
+            // add batch offset
+            p_virtual_addr = reinterpret_cast<AX_VOID*>(
+                    reinterpret_cast<uint8_t*>(p_virtual_addr) +
+                    nr_bytes_per_batch * (batch_idx + i % m_model_batch));
+            phy_addr += nr_bytes_per_batch * (batch_idx + i % m_model_batch);
+
+            MGB_MC20_CHECK(AX_NPU_SDK_EX_ADV_Make_io_buffer(
+                    phy_addr, p_virtual_addr, nr_bytes_per_batch, phy_addr,
+                    p_virtual_addr, nr_bytes_per_batch, &npu_io.pInputs[i]));
+        }
+
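+        // Note on the outputs prepared next (sketch of the layout): cpu-tail
+        // ("virtual") outputs are redirected into one contiguous inner buffer
+        // carved out of the trailing workspace var; `offset` walks that
+        // buffer, so e.g. two virtual outputs with nInnerSize 1024 and 2048
+        // would occupy [0, 1024) and [1024, 3072) (sizes are illustrative).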
+        //! prepare output
+        npu_io.nOutputSize = io_info->nOutputSize;
+        auto outputs = std::make_unique<std::remove_pointer_t<decltype(npu_io.pOutputs)>[]>(
+                npu_io.nOutputSize);
+        npu_io.pOutputs = outputs.get();
+        AX_U32 offset = 0;
+        AX_VOID* inner_virtual_addr_start = nullptr;
+        AX_U64 inner_phy_addr_start = 0;
+        // get inner addr from workspace
+        inner_virtual_addr_start = output(npu_io.nOutputSize)->dev_tensor().raw_ptr();
+        inner_phy_addr_start =
+                MC20MemoryManager::Instance().get_phyaddr(inner_virtual_addr_start);
+        for (size_t i = 0; i < npu_io.nOutputSize; i++) {
+            // get output addr info
+            AX_VOID* p_virtual_addr = output(i)->dev_tensor().raw_ptr();
+            AX_U64 phy_addr = 0;
+            auto nr_bytes_per_batch =
+                    output(i)->layout().span().dist_byte() / batch_size;
+            // add batch offset
+            p_virtual_addr = reinterpret_cast<AX_VOID*>(
+                    reinterpret_cast<uint8_t*>(p_virtual_addr) +
+                    nr_bytes_per_batch * batch_idx);
+            phy_addr += nr_bytes_per_batch * batch_idx;
+
+            const auto& tensor_meta = *(io_info->pOutputs[i].pTensorMeta);
+            if (tensor_meta.eMemoryType == AX_NPU_MT_PHYSICAL) {
+                MGB_MC20_CHECK(AX_NPU_SDK_EX_ADV_Make_io_buffer(
+                        phy_addr, p_virtual_addr, nr_bytes_per_batch, phy_addr,
+                        p_virtual_addr, nr_bytes_per_batch, &npu_io.pOutputs[i]));
+            } else if (tensor_meta.eMemoryType == AX_NPU_MT_VIRTUAL) {
+                auto p_inner_virtual_addr = reinterpret_cast<AX_VOID*>(
+                        reinterpret_cast<uint8_t*>(inner_virtual_addr_start) + offset);
+                auto inner_phy_addr = inner_phy_addr_start + offset;
+                MGB_MC20_CHECK(AX_NPU_SDK_EX_ADV_Make_io_buffer(
+                        phy_addr, p_virtual_addr, nr_bytes_per_batch, inner_phy_addr,
+                        p_inner_virtual_addr, tensor_meta.nInnerSize,
+                        &npu_io.pOutputs[i]));
+
+                offset += tensor_meta.nInnerSize;
+            }
+        }
+
+        MGB_MC20_CHECK(AX_NPU_SDK_EX_Run_task_sync(m_model_handle, &npu_io));
+    }
+}
+
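+//! Comp node placement implemented below: the model outputs are produced by
+//! the cpu tail and handed back to the user, so their vars live on
+//! default_cpu; only the trailing workspace var stays on the MC20 comp node
+//! of the inputs.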
+void MC20RuntimeOpr::init_output_comp_node() {
+    //! set output to cpu compnode if has cpu tail
+    const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
+            AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
+
+    CompNode input_cn;
+    for (auto&& i : input()) {
+        if (!input_cn.valid()) {
+            input_cn = i->comp_node();
+        } else {
+            mgb_assert(
+                    input_cn.mem_node() == i->comp_node().mem_node(),
+                    "opr %s{%s} requires all inputs to be on the same memory "
+                    "node; expect=%s cur_var=%s cur_cn=%s",
+                    this->cname(), this->dyn_typeinfo()->name,
+                    input_cn.to_string().c_str(), i->cname(),
+                    i->comp_node().to_string().c_str());
+        }
+    }
+    for (size_t i = 0; i < io_info->nOutputSize; i++) {
+        //! compnode of the var should be default_cpu as the output will be
+        //! proxied to the user
+        output(i)->comp_node(CompNode::default_cpu());
+    }
+    //! the last output is the workspace, which should stay on the same comp
+    //! node as the inputs
+    output(io_info->nOutputSize)->comp_node(input_cn);
+}
+
+MC20RuntimeOpr::NodeProp* MC20RuntimeOpr::do_make_node_prop() const {
+    auto ret = Super::do_make_node_prop();
+    ret->add_flag(NodeProp::Flag::CROSS_COMP_NODE_MEMORY);
+    return ret;
+}
+
+void MC20RuntimeOpr::do_execute(ExecEnv& env) {
+    CompNode cn = output(0)->comp_node();
+    auto runner = [this, cn]() {
+        this->owner_graph()->event().signal_inplace<cg::event::BeforeKernel>(this, cn);
+        cn.activate();
+        execute_mc20();
+        this->owner_graph()->event().signal_inplace<cg::event::AfterKernel>(this, cn);
+    };
+    env.dispatch_on_comp_node(cn, runner);
+
+    // Send BeforeKernel/AfterKernel event on every different comp_node
+    ThinHashSet<CompNode> st = cg::get_opr_comp_node_set(this);
+    for (auto cn : st) {
+        auto send_event = [this, cn]() {
+            this->owner_graph()->event().signal_inplace<cg::event::BeforeKernel>(
+                    this, cn);
+            this->owner_graph()->event().signal_inplace<cg::event::AfterKernel>(
+                    this, cn);
+        };
+        env.dispatch_on_comp_node(cn, send_event);
+    }
+}
+
+void MC20RuntimeOpr::on_output_comp_node_stream_changed() {
+    mgb_throw(SystemError, "comp node of output should not change");
+}
+
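+//! Output shape rule used below (illustrative example): each output shape
+//! comes from the .neu metadata with its batch dim scaled by the graph batch,
+//! e.g. a model output of {1, 1000} with graph input batch 4 (and
+//! m_model_batch == 1) becomes {4, 1000}; the trailing workspace output is a
+//! flat buffer sized as the sum of nInnerSize over all cpu-tail outputs.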
+void MC20RuntimeOpr::get_output_var_shape(
+        const TensorShapeArray& inp_shape, TensorShapeArray& out_shape) const {
+    const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
+            AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
+    size_t nr_inputs = io_info->nInputSize;
+
+    for (size_t i = 0; i < nr_inputs; ++i) {
+        const auto& tensor_meta = *(io_info->pInputs[i].pTensorMeta);
+        auto model_shape = mc20_shape_to_mgb_shape(tensor_meta);
+        size_t inp_idx = i / m_model_batch;
+        // enable multi-batch
+        mgb_assert(
+                inp_shape[inp_idx][0] % model_shape[0] == 0 &&
+                        (inp_shape[inp_idx][0] / model_shape[0]) % m_model_batch == 0,
+                "input %zu batch is %zu, while model's input batch is %zu", i,
+                inp_shape[inp_idx][0], model_shape[0]);
+        model_shape[0] = inp_shape[inp_idx][0];
+        mgb_assert(
+                model_shape.eq_shape(inp_shape[inp_idx]),
+                "shape mismatch of input %zu, expected: %s got: %s", i,
+                model_shape.to_string().c_str(),
+                inp_shape[inp_idx].to_string().c_str());
+    }
+    size_t input_batch = (io_info->pInputs[0].pTensorMeta)->pShape[0];
+    //! \warning mc20 sdk implements multi-batch by breaking an n-batch input
+    //! up into n 1-batch inputs
+    mgb_assert(input_batch == 1, "input batch: %zu, net's input batch: 1", input_batch);
+    AX_U32 workspace_size = 0;
+    for (size_t i = 0; i < io_info->nOutputSize; ++i) {
+        const auto& tensor_meta = *(io_info->pOutputs[i].pTensorMeta);
+        out_shape[i] = mc20_shape_to_mgb_shape(tensor_meta);
+        // enable multi-batch
+        out_shape[i][0] =
+                out_shape[i][0] * inp_shape[0][0] / input_batch / m_model_batch;
+        if (tensor_meta.eMemoryType == AX_NPU_MT_VIRTUAL) {
+            workspace_size += tensor_meta.nInnerSize;
+        }
+    }
+    out_shape.back() = {workspace_size};
+}
+
+void MC20RuntimeOpr::add_input_layout_constraint() {
+    //! default contiguous
+    for (auto i : input()) {
+        i->add_layout_constraint_contiguous();
+    }
+}
+
+void MC20RuntimeOpr::init_output_dtype() {
+    DType dt_mc20, dt_input;
+    const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
+            AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
+    for (size_t i = 0; i < io_info->nInputSize; ++i) {
+        dt_mc20 = mc20_dtype_to_mgb_dtype(io_info->pInputs[i].eDType);
+        size_t inp_idx = i / m_model_batch;
+        dt_input = input(inp_idx)->dtype();
+        mgb_assert(
+                dt_mc20.valid() && dt_input.valid() &&
+                        dt_mc20.enumv() == dt_input.enumv(),
+                "dtype mismatch of input %zu: expected %s, "
+                "got %s",
+                i, dt_mc20.name(), dt_input.name());
+    }
+
+    for (size_t i = 0; i < io_info->nOutputSize; ++i) {
+        dt_mc20 = mc20_dtype_to_mgb_dtype(io_info->pOutputs[i].eDType);
+        mgb_assert(
+                dt_mc20.valid(),
+                "output dtype checking failed: invalid dtype returned.");
+        if (!output(i)->dtype().valid())
+            output(i)->dtype(dt_mc20);
+    }
+}
+
+SymbolVarArray MC20RuntimeOpr::make(
+        SharedBuffer buf, const SymbolVarArray& src, const OperatorNodeConfig& config) {
+    VarNodeArray var_node_array = cg::to_var_node_array(src);
+    auto mc20_runtime_opr = std::make_unique<MC20RuntimeOpr>(
+            std::move(buf), INVALID_MODEL_HANDLE, var_node_array, config);
+    auto ret = cg::to_symbol_var_array(src[0].node()
+                                               ->owner_graph()
+                                               ->insert_opr(std::move(mc20_runtime_opr))
+                                               ->output());
+    ret.pop_back();  // remove workspace
+    return ret;
+}
+
+SymbolVarArray MC20RuntimeOpr::make(
+        const void* buf, size_t size, const SymbolVarArray& src,
+        const OperatorNodeConfig& config) {
+    mgb_throw_if(
+            !CompNode::get_device_count(CompNode::DeviceType::MC20), SystemError,
+            "cannot create MC20RuntimeOpr when mc20 is not "
+            "available");
+    std::shared_ptr<uint8_t> shptr{new uint8_t[size], [](uint8_t* p) { delete[] p; }};
+    memcpy(shptr.get(), buf, size);
+    SharedBuffer buffer{std::move(shptr), size};
+    return make(std::move(buffer), src, config);
+}
+
+SymbolVarArray MC20RuntimeOpr::make(
+        SharedBuffer buf, AX_NPU_SDK_EX_HANDLE_T model_handle,
+        const SymbolVarArray& src, const OperatorNodeConfig& config) {
+    VarNodeArray var_node_array = cg::to_var_node_array(src);
+    auto mc20_runtime_opr = std::make_unique<MC20RuntimeOpr>(
+            std::move(buf), model_handle, var_node_array, config);
+    auto ret = cg::to_symbol_var_array(src[0].node()
+                                               ->owner_graph()
+                                               ->insert_opr(std::move(mc20_runtime_opr))
+                                               ->output());
+    ret.pop_back();  // remove workspace
+    return ret;
+}
+
+#endif  // MGB_MC20
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/opr/impl/mc20_runtime_op.oprdecl b/src/opr/impl/mc20_runtime_op.oprdecl
new file mode 100644
index 00000000..3cc4c366
--- /dev/null
+++ b/src/opr/impl/mc20_runtime_op.oprdecl
@@ -0,0 +1,17 @@
+decl_raw_opr(
+    'mc20_runtime',
+    desc='create an operator that loads and runs an mc20 model',
+    inputs=[
+        Doc('inputs', 'input vars', 'list of :class:`.SymbolVar`'),
+        Doc('data_bytes', 'serialized mc20 model'),
+    ],
+    body=[
+        'assert isinstance(data_bytes, bytes), '
+        '"data must be bytes; got {}".format(type(data_bytes))',
+        'output = _mgb._Opr.mc20_runtime(inputs, data_bytes, config)',
+        'cvt_result_kwargs["explode_single"] = False',
+    ],
+)
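+
+# Usage sketch (the file name and the input var `x` below are hypothetical):
+#   data_bytes = open('model.neu', 'rb').read()
+#   outputs = mc20_runtime([x], data_bytes)  # list of SymbolVar, one per model output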
+
+# vim: ft=python
+
diff --git a/src/opr/impl/mc20_runtime_op.sereg.h b/src/opr/impl/mc20_runtime_op.sereg.h
new file mode 100644
index 00000000..dbedb692
--- /dev/null
+++ b/src/opr/impl/mc20_runtime_op.sereg.h
@@ -0,0 +1,72 @@
+/**
+ * \file src/opr/impl/mc20_runtime_op.sereg.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+
+#include "megbrain/opr/mc20_runtime_op.h"
+#include "megbrain/serialization/sereg.h"
+
+#if MGB_MC20
+namespace mgb {
+using MC20RuntimeOpr = opr::MC20RuntimeOpr;
+namespace serialization {
+
+template <>
+struct OprLoadDumpImpl<MC20RuntimeOpr, 0> {
+    static void dump(OprDumpContext& ctx, const cg::OperatorNodeBase& opr_) {
+        auto&& opr = opr_.cast_final_safe<MC20RuntimeOpr>();
+        auto&& buf = opr.buffer();
+        auto&& name = opr.name();
+        ctx.dump_buf_with_len(buf.data(), buf.size());
+        ctx.dump_buf_with_len(name.c_str(), name.size());
+    }
+
+    static cg::OperatorNodeBase* load(
+            OprLoadContext& ctx, const cg::VarNodeArray& inputs,
+            const OperatorNodeConfig& config) {
+        inputs.at(0)->comp_node().activate();
+        auto buf = ctx.load_shared_buf_with_len();
+        auto name = ctx.load_shared_buf_with_len();
+        std::string c_name(reinterpret_cast<const char*>(name.data()), name.size());
+        OperatorNodeConfig& c_config = const_cast<OperatorNodeConfig&>(config);
+        c_config.name(c_name);
+        return opr::MC20RuntimeOpr::make(
+                       std::move(buf), cg::to_symbol_var_array(inputs), c_config)
+                .at(0)
+                .node()
+                ->owner_opr();
+    }
+};
+
+}  // namespace serialization
+
+namespace opr {
+cg::OperatorNodeBase* opr_shallow_copy_mc20_runtime_opr(
+        const serialization::OprShallowCopyContext& ctx,
+        const cg::OperatorNodeBase& opr_, const VarNodeArray& inputs,
+        const OperatorNodeConfig& config) {
+    MGB_MARK_USED_VAR(ctx);
+    auto&& opr = opr_.cast_final_safe<MC20RuntimeOpr>();
+    return MC20RuntimeOpr::make(
+                   opr.buffer(), opr.model_handle(), cg::to_symbol_var_array(inputs),
+                   config)
+            .at(0)
+            .node()
+            ->owner_opr();
+}
+
+MGB_SEREG_OPR(MC20RuntimeOpr, 0);
+MGB_REG_OPR_SHALLOW_COPY(MC20RuntimeOpr, opr_shallow_copy_mc20_runtime_opr);
+}  // namespace opr
+}  // namespace mgb
+
+#endif
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/opr/include/megbrain/opr/mc20_runtime_op.h b/src/opr/include/megbrain/opr/mc20_runtime_op.h
new file mode 100644
index 00000000..fba0f454
--- /dev/null
+++ b/src/opr/include/megbrain/opr/mc20_runtime_op.h
@@ -0,0 +1,83 @@
+/**
+ * \file src/opr/include/megbrain/opr/mc20_runtime_op.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+
+#pragma once
+
+#include
+#include "megbrain/comp_node_env.h"
+#include "megbrain/graph.h"
+#include "megbrain/graph/operator_node.h"
+#include "megbrain/serialization/file.h"
+#include "megdnn/thin/function.h"
+
+#if MGB_MC20
+#include "megbrain/mc20/mc20_memory_manager.h"
+
+namespace mgb {
+namespace opr {
+
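+//! Runtime operator that embeds a serialized MC20 (.neu) model in a megbrain
+//! graph. A minimal usage sketch (`model_buf` and the input var `x` are
+//! hypothetical; the trailing workspace var is stripped by make(), so one
+//! SymbolVar is returned per model output):
+//!
+//! \code
+//!     SymbolVarArray inps{x};  // x must live on an mc20 comp node
+//!     auto outs = MC20RuntimeOpr::make(model_buf.data(), model_buf.size(), inps);
+//!     SymbolVar first_out = outs[0];
+//! \endcode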
+MGB_DEFINE_OPR_CLASS(
+        MC20RuntimeOpr, cg::OutshapePureByInshapeOpr) // {
+public:
+    using SharedBuffer = mgb::serialization::SharedBuffer;
+
+    void do_execute(ExecEnv& env) override;
+    void get_output_var_shape(
+            const TensorShapeArray& inp_shape,
+            TensorShapeArray& out_shape) const override;
+    void add_input_layout_constraint() override;
+    void init_output_dtype() override;
+    void init_output_comp_node() override;
+    void on_output_comp_node_stream_changed() override;
+
+    /**
+     * \brief construct a MC20RuntimeOpr from a serialized model buffer
+     */
+    MC20RuntimeOpr(
+            SharedBuffer buf, AX_NPU_SDK_EX_HANDLE_T model_handle,
+            const VarNodeArray& inputs, const OperatorNodeConfig& config);
+    ~MC20RuntimeOpr();
+
+    const SharedBuffer& buffer() const { return m_buffer; }
+
+    AX_NPU_SDK_EX_HANDLE_T model_handle() const { return m_model_handle; }
+
+    static SymbolVarArray make(
+            SharedBuffer buf, const SymbolVarArray& src,
+            const OperatorNodeConfig& config = {});
+
+    static SymbolVarArray make(
+            const void* buf, size_t size, const SymbolVarArray& src,
+            const OperatorNodeConfig& config = {});
+
+    static SymbolVarArray make(
+            SharedBuffer buf, AX_NPU_SDK_EX_HANDLE_T model_handle,
+            const SymbolVarArray& src, const OperatorNodeConfig& config = {});
+
+private:
+    NodeProp* do_make_node_prop() const override;
+
+    void execute_mc20();
+    size_t m_model_batch;
+    SharedBuffer m_buffer;
+    constexpr static AX_NPU_SDK_EX_HANDLE_T INVALID_MODEL_HANDLE = nullptr;
+    AX_NPU_SDK_EX_HANDLE_T m_model_handle = INVALID_MODEL_HANDLE;
+    //! if true, the model handle is owned and released in the destructor
+    bool m_is_model_holder = false;
+};
+
+}  // namespace opr
+}  // namespace mgb
+
+#endif  // MGB_MC20
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}