feat(lite): replace warp when src is discrete input

GitOrigin-RevId: 2bf7980ac6
2 years ago · dc0ab9b64e
--- a/lite/include/lite/network.h
+++ b/lite/include/lite/network.h
@@ -117,6 +117,9 @@ struct LITE_API Options {
 *
 * @param auto_optimize_inference lite will detect the device information and
 * set the options heuristically
 *
 * @param discrete_input_name configure which input is composed of discrete
 * multiple tensors
 */
 struct LITE_API Config {
    bool has_compression = false;
@@ -126,6 +129,7 @@ struct LITE_API Config {
    std::string bare_model_cryption_name = {};
    Options options = {};
    bool auto_optimize_inference = false;
    std::string discrete_input_name = {};
 };

 /*!
@@ -289,9 +293,22 @@ public:
    std::shared_ptr<Tensor> get_io_tensor(
            std::string io_name, LiteTensorPhase phase = LiteTensorPhase::LITE_IO);

    /** @brief get the tensors of a network input that is composed of multiple
     * discrete tensors, each with layout (1, c, h, w)
     *
     * @param io_name the name of the discrete input
     * @param phase the tensor phase; defaults to LiteTensorPhase::LITE_INPUT
     * @return the list of tensors that together make up the input \p io_name
     */
    std::vector<std::shared_ptr<Tensor>> get_io_tensors(
            std::string io_name, LiteTensorPhase phase = LiteTensorPhase::LITE_INPUT);

    //! get the network input tensor by index
    std::shared_ptr<Tensor> get_input_tensor(size_t index);

    //! get the network input tensors which input consists of discrete multiple tensors
    //! by index
    std::vector<std::shared_ptr<Tensor>> get_input_tensors(size_t index);

    //! get the network output tensor by index
    std::shared_ptr<Tensor> get_output_tensor(size_t index);

--- a/lite/lite-c/include/lite-c/network_c.h
+++ b/lite/lite-c/include/lite-c/network_c.h
@@ -103,6 +103,9 @@ extern LITE_API const LiteOptions default_option;

 *\param auto_optimize_inference lite will detect the device information and
 * set the options heuristically
 *
 * \param discrete_input_name configure which input is composed of discrete
 * multiple tensors
 */
 typedef struct LiteConfig {
    int has_compression;
@@ -112,6 +115,7 @@ typedef struct LiteConfig {
    const char* bare_model_cryption_name;
    LiteOptions options;
    int auto_optimize_inference;
    const char* discrete_input_name;
 } LiteConfig;

 //! get default config
@@ -299,6 +303,19 @@ LITE_API int LITE_get_io_tensor(
        LiteTensor* tensor);

 /**
 * \brief get the n'th tensor in the network input tensors whose input
 * consists of discrete multiple tensors and name is io_name, layout (1, c, h, w)
 * \param[in] network The loaded model
 * \param[in] io_name The input name
 * \param[in] n_idx The index of tensor
 * \param[in] phase The tensor phase
 * \param[out] tensor The IO tensor get from the network
 */
 LITE_API int LITE_get_io_tensors(
        LiteNetwork network, const char* io_name, size_t n_idx, LiteTensorPhase phase,
        LiteTensor* tensor);

 /**
 * \brief get the input tensor name in the order in loaded model
 * \param[in] network The loaded model
 * \param[in] index The index of input tensor
--- a/lite/lite-c/src/network.cpp
+++ b/lite/lite-c/src/network.cpp
@@ -43,7 +43,8 @@ LiteConfig default_config_t = {
        .backend = LiteBackend::LITE_DEFAULT,
        .bare_model_cryption_name = nullptr,
        .options = default_option,
        .auto_optimize_inference = false};
        .auto_optimize_inference = false,
        .discrete_input_name = nullptr};
 LiteConfig* default_config() {
    return &default_config_t;
 }
@@ -135,6 +136,9 @@ lite::Config convert_to_lite_config(const LiteConfig c_config) {
    lite_config.options.enable_nchw64 = c_config.options.enable_nchw64;

    lite_config.auto_optimize_inference = c_config.auto_optimize_inference;
    if (c_config.discrete_input_name) {
        lite_config.discrete_input_name = c_config.discrete_input_name;
    }

    return lite_config;
 }
@@ -274,6 +278,20 @@ int LITE_get_io_tensor(
    LITE_CAPI_END();
 }

 //! C API: fetch the n_idx'th tensor of the discrete input named io_name.
 //!
 //! \param[in] network The loaded model
 //! \param[in] io_name The input name, must not be null
 //! \param[in] n_idx The index into the list of discrete tensors
 //! \param[in] phase The tensor phase
 //! \param[out] tensor Receives the tensor handle, must not be null
 //!
 //! The handle written to \p tensor is a raw pointer obtained from a
 //! shared_ptr returned by the network; this function does not take ownership.
 int LITE_get_io_tensors(
        LiteNetwork network, const char* io_name, size_t n_idx, LiteTensorPhase phase,
        LiteTensor* tensor) {
    LITE_CAPI_BEGIN();
    LITE_ASSERT(network, "The network pass to LITE api is null");
    //! io_name is converted to std::string and tensor is written through, so
    //! both must be valid before they are used
    LITE_ASSERT(
            io_name && tensor, "The io_name or tensor pass to LITE api is null");
    auto io_tensors =
            static_cast<lite::Network*>(network)->get_io_tensors(io_name, phase);
    LITE_ASSERT(
            n_idx < io_tensors.size(), "n_idx should be less than %zu",
            io_tensors.size());
    *tensor = io_tensors[n_idx].get();
    LITE_CAPI_END();
 }

 int LITE_get_input_name(const LiteNetwork network, size_t index, const char** name) {
    LITE_CAPI_BEGIN();
    LITE_ASSERT(network && name, "The network pass to LITE api is null");
--- a/lite/pylite/megenginelite/network.py
+++ b/lite/pylite/megenginelite/network.py
@@ -173,6 +173,8 @@ class LiteConfig(Structure):

        auto_optimize_inference: lite will detect the device information and set the options heuristically

        discrete_input_name: configure which input is composed of discrete multiple tensors

    Examples:
        .. code-block::

@@ -193,6 +195,7 @@ class LiteConfig(Structure):
        ("_bare_model_cryption_name", c_char_p),
        ("options", LiteOptions),
        ("auto_optimize_inference", c_int),
        ("discrete_input_name", c_char_p),
    ]

    def __init__(self, device_type=LiteDeviceType.LITE_CPU, option=None):
@@ -207,6 +210,7 @@ class LiteConfig(Structure):
        self.has_compression = 0
        self.backend = LiteBackend.LITE_DEFAULT
        self.auto_optimize_inference = 0
        self.discrete_input_name = c_char_p(b"")

    @property
    def bare_model_cryption_name(self):
@@ -229,6 +233,7 @@ class LiteConfig(Structure):
            "bare_model_cryption_name": self.bare_model_cryption_name,
            "options": self.options,
            "auto_optimize_inference": self.auto_optimize_inference,
            "discrete_input_name": self.discrete_input_name,
        }
        return data.__repr__()

@@ -536,6 +541,10 @@ class _NetworkAPI(_LiteCObjBase):
            [c_char_p, c_size_t, LiteConfig, POINTER(_LiteNetworkIO)],
        ),
        ("LITE_extra_configure", [_Cnetwork, LiteExtraConfig]),
        (
            "LITE_get_io_tensors",
            [_Cnetwork, c_char_p, c_size_t, c_int, POINTER(_Ctensor)],
        ),
    ]


@@ -736,6 +745,30 @@ class LiteNetwork(object):
        tensor.update()
        return tensor

    def get_io_tensors(self, name, n_idx, phase=LiteTensorPhase.LITE_INPUT):
        """
        get the n_idx'th tensor of the network input that consists of
        discrete multiple tensors and whose name is ``name``

        Args:
            name: the name of the input tensor, either ``str`` or ``bytes``
            n_idx: the index of the tensor inside the discrete input
            phase: the type of LiteTensor, this is useful to separate input tensor with the same name

        Returns:
            the single tensor at position ``n_idx`` of the input ``name``
        """
        # isinstance also accepts str subclasses, which would otherwise be
        # handed to c_char_p without being encoded to bytes first
        if isinstance(name, str):
            name = name.encode("utf-8")
        c_name = c_char_p(name)
        tensor = LiteTensor(physic_construct=False)
        self._api.LITE_get_io_tensors(
            self._network, c_name, n_idx, phase, byref(tensor._tensor)
        )
        tensor.update()
        return tensor

    def get_input_name(self, index):
        """
        get the input name by the index in the network
--- a/lite/pylite/test/test_network.py
+++ b/lite/pylite/test/test_network.py
@@ -500,3 +500,45 @@ class TestNetwork(TestShuffleNet):

        os.remove(fast_run_cache)
        os.remove(global_layout_transform_model)


 class TestDiscreteInputNet(unittest.TestCase):
    # Shared fixture for the discrete-input tests: the model and the three
    # per-slice arrays are resolved relative to $LITE_TEST_RESOURCE and the
    # arrays are loaded once, when the class body is executed.
    source_dir = os.getenv("LITE_TEST_RESOURCE")
    model_path = os.path.join(source_dir, "test_discrete_input.mge")
    data0_path = os.path.join(source_dir, "data0.npy")
    data1_path = os.path.join(source_dir, "data1.npy")
    data2_path = os.path.join(source_dir, "data2.npy")
    data0 = np.load(data0_path)
    data1 = np.load(data1_path)
    data2 = np.load(data2_path)

    def do_forward(self, network, times=3):
        # the discrete input is looked up through the input at index 1
        input_name = network.get_input_name(1)
        # fetch all three discrete tensors first, then fill them in order
        slices = [network.get_io_tensors(input_name, idx) for idx in range(3)]
        for tensor, array in zip(slices, (self.data0, self.data1, self.data2)):
            tensor.set_data_by_copy(array)
        # run the network `times` times, waiting for each run to finish
        for _ in range(times):
            network.forward()
            network.wait()


 class TestDiscreteInput(TestDiscreteInputNet):
    def test_discrete_input(self):
        # mark "data" as the input that is fed as a list of discrete tensors
        config = LiteConfig()
        config.discrete_input_name = b"data"

        # describe "data" as a host value input with a 3x3x224x224 layout
        data_io = LiteIO(
            "data",
            is_host=True,
            io_type=LiteIOType.LITE_IO_VALUE,
            layout=LiteLayout([3, 3, 224, 224]),
        )
        network_io = LiteNetworkIO()
        network_io.add_input(data_io)

        net = LiteNetwork(config, network_io)
        net.load(self.model_path)
        self.do_forward(net)
--- a/lite/src/mge/network_impl.cpp
+++ b/lite/src/mge/network_impl.cpp
@@ -13,6 +13,7 @@
 #include "megbrain/comp_node_env.h"
 #include "megbrain/graph.h"
 #include "megbrain/graph/cg.h"
 #include "megbrain/opr/imgproc.h"
 #include "megbrain/opr/io.h"
 #include "megbrain/opr/tensor_manip.h"
 #include "megbrain/tensor.h"
@@ -259,6 +260,88 @@ void NetworkImplDft::make_output_spec() {
    }
 }

 //! Graph pass used when the user configured a discrete input: every
 //! WarpPerspective whose first input is produced directly by a
 //! Host2DeviceCopy or a VolatileSharedDeviceTensor is replaced by the
 //! multi-source WarpPerspective fed from the discrete lite tensors. All other
 //! operators are passed through the rewriter unchanged. The output var list
 //! and the output var maps of the load result are updated afterwards.
 //!
 //! NOTE(review): the source operator is not compared against
 //! discrete_input_name, so any WarpPerspective reading straight from an input
 //! is rewritten -- confirm this is intended for models with several inputs.
 void NetworkImplDft::replace_src_discrete_input_opr_pass() {
    mgb::ThinHashMap<mgb::SymbolVar, mgb::SymbolVar> out_var_map;

    auto dest_with_extra_deps =
            get_dest_vars_with_extra_deps(m_load_result.output_var_list);
    gopt::SubGraph graph{dest_with_extra_deps};
    auto rewriter = graph.make_rewriter();

    auto on_opr = [&](mgb::cg::OperatorNodeBase* opr) {
        if (!opr->same_type<mgb::opr::WarpPerspective>()) {
            rewriter.auto_replace_outputs(opr);
            return;
        }

        auto src_opr = opr->input(0)->owner_opr();
        const bool is_h2d = src_opr->same_type<mgb::opr::Host2DeviceCopy>();
        const bool is_volatile =
                src_opr->same_type<mgb::opr::VolatileSharedDeviceTensor>();
        if (!is_h2d && !is_volatile) {
            //! this warp does not read a network input directly; it still has
            //! to go through the rewriter so that it picks up any of its
            //! inputs that were replaced earlier in the traversal
            rewriter.auto_replace_outputs(opr);
            return;
        }

        //! build one source var per discrete lite tensor, reusing the config of
        //! the operator that originally produced the warp source
        SymbolVarArray srcs;
        if (is_h2d) {
            for (auto&& inp : get_io_tensors(m_user_config->discrete_input_name)) {
                auto val = TensorHelper::implement(inp)
                                   ->cast_final_safe<TensorImplDft>()
                                   .m_host_tensor;
                LITE_ASSERT(val);
                srcs.push_back(mgb::opr::Host2DeviceCopy::make(
                        *m_load_result.graph, val, src_opr->config()));
            }
        } else {
            for (auto&& inp : get_io_tensors(m_user_config->discrete_input_name)) {
                auto val = TensorHelper::implement(inp)
                                   ->cast_final_safe<TensorImplDft>()
                                   .m_dev_tensor;
                LITE_ASSERT(val);
                srcs.push_back(mgb::opr::VolatileSharedDeviceTensor::make(
                        *m_load_result.graph, val, src_opr->config()));
            }
        }

        //! the warp has 3 inputs, or 4 when mat_idx is given
        auto& warp = opr->cast_final<mgb::opr::WarpPerspective>();
        SymbolVar new_out;
        if (opr->input().size() == 3) {
            new_out = mgb::opr::WarpPerspective::make(
                    srcs, warp.input(1), warp.input(2), warp.param(), warp.config());
        } else {
            LITE_ASSERT(opr->input().size() == 4);
            new_out = mgb::opr::WarpPerspective::make(
                    srcs, warp.input(1), warp.input(2), warp.input(3), warp.param(),
                    warp.config());
        }
        rewriter.replace_var(
                warp.output(0), new_out.node(),
                "replace WarpPerspective to WarpPerspective multi src version.");
    };
    graph.iter(on_opr);
    rewriter.apply_inplace();
    auto new_ovar = graph.endpoint_vars();
    //! keep only the entries that correspond to the original output var list
    new_ovar.resize(m_load_result.output_var_list.size());

    for (size_t i = 0; i < new_ovar.size(); ++i) {
        out_var_map[m_load_result.output_var_list[i]] = new_ovar[i];
    }
    for (auto&& i : m_load_result.output_var_map) {
        i.second = out_var_map.at(i.second);
    }
    for (auto&& i : m_load_result.output_var_map_id) {
        i.second = out_var_map.at(i.second);
    }
    //! the new outputs take over the names of the outputs they replace
    for (size_t i = 0; i < m_load_result.output_var_list.size(); i++) {
        new_ovar[i].rename(m_load_result.output_var_list[i].node()->name());
    }
    m_load_result.output_var_list = std::move(new_ovar);
 }

 void NetworkImplDft::replace_dev_input_pass() {
    mgb::CompNode::Locator locator;
    m_load_config.comp_node_mapper(locator);
@@ -528,6 +611,8 @@ void NetworkImplDft::configure_after_loaded() {

 //! Run the input replacement passes, build the output spec and compile the
 //! graph into m_execute_func.
 void NetworkImplDft::compile_graph() {
    replace_dev_input_pass();

    //! the discrete-input pass only runs when the user named such an input
    const bool has_discrete_input = !m_user_config->discrete_input_name.empty();
    if (has_discrete_input) {
        replace_src_discrete_input_opr_pass();
    }

    make_output_spec();
    m_execute_func = m_load_result.graph_compile(m_output_spec);
 }
@@ -691,6 +776,11 @@ void NetworkImplDft::update_input() {
            m_network_io->inputs.push_back(io_in);
        }
    }

    if (!m_user_config->discrete_input_name.empty()) {
        update_input_lite_tensors();
    }

    //! delete the IO that is not the network
    for (auto it = m_network_io->inputs.begin(); it != m_network_io->inputs.end();) {
        if (it->lite_tensor == nullptr) {
@@ -702,6 +792,79 @@ void NetworkImplDft::update_input() {
    }
 }

 void NetworkImplDft::update_input_lite_tensors() {
    auto device_type = m_user_config->device_type;
    auto device_id = m_compnode_locator.device;
    auto stream_id = m_compnode_locator.stream;

    for (auto&& in_tensor_iter : m_load_result.tensor_map) {
        if (in_tensor_iter.first != m_user_config->discrete_input_name) {
            continue;
        }
        bool found = false;
        for (auto&& config_in : m_network_io->inputs) {
            if (in_tensor_iter.first == config_in.name) {
                found = true;
                size_t bs = in_tensor_iter.second->shape(0);
                auto shape = in_tensor_iter.second->shape();
                shape.shape[0] = 1;
                if (config_in.config_layout.ndim) {
                    bs = config_in.config_layout.shapes[0];
                    shape.shape[1] = config_in.config_layout.shapes[1];
                    shape.shape[2] = config_in.config_layout.shapes[2];
                    shape.shape[3] = config_in.config_layout.shapes[3];
                }
                HostTensorND tensor(
                        in_tensor_iter.second->comp_node(), shape,
                        in_tensor_iter.second->dtype(),
                        in_tensor_iter.second->format());
                for (size_t i = 0; i < bs; ++i) {
                    if (config_in.is_host) {
                        config_in.lite_tensors.push_back(std::make_shared<Tensor>(
                                device_id, stream_id, device_type, true));
                        TensorHelper::implement(config_in.lite_tensors[i])
                                ->cast_final_safe<TensorImplDft>()
                                .m_host_tensor = std::make_shared<HostTensorND>(tensor);
                        config_in.lite_tensors[i]->update_from_implement();
                    } else {
                        config_in.lite_tensors.push_back(std::make_shared<Tensor>(
                                device_id, stream_id, device_type));
                        config_in.lite_tensors[i]->set_layout(
                                to_lite_layout(tensor.layout()));
                    }
                    TensorHelper::implement(config_in.lite_tensors[i])
                            ->cast_final_safe<TensorImplDft>()
                            .m_record_reset =
                            m_user_config->options.comp_node_seq_record_level > 0;
                }
            }
        }
        if (!found) {
            size_t bs = in_tensor_iter.second->shape(0);
            auto shape = in_tensor_iter.second->shape();
            shape.shape[0] = 1;
            HostTensorND tensor(
                    in_tensor_iter.second->comp_node(), shape,
                    in_tensor_iter.second->dtype(), in_tensor_iter.second->format());
            IOInner io_in;
            io_in.name = in_tensor_iter.first;
            for (size_t i = 0; i < bs; ++i) {
                io_in.lite_tensors.push_back(std::make_shared<Tensor>(
                        device_id, stream_id, device_type, true));
                TensorHelper::implement(io_in.lite_tensors[i])
                        ->cast_final_safe<TensorImplDft>()
                        .m_host_tensor = std::make_shared<HostTensorND>(tensor);
                TensorHelper::implement(io_in.lite_tensors[i])
                        ->cast_final_safe<TensorImplDft>()
                        .m_record_reset =
                        m_user_config->options.comp_node_seq_record_level > 0;
                io_in.lite_tensors[i]->update_from_implement();
            }
            m_network_io->inputs.push_back(io_in);
        }
    }
 }

 void NetworkImplDft::update_output() {
    auto device_type = m_user_config->device_type;
    auto device_id = m_compnode_locator.device;
@@ -855,10 +1018,29 @@ std::shared_ptr<Tensor> NetworkImplDft::get_io_tensor(
    return nullptr;
 }

 //! Return the list of lite tensors that make up the discrete input io_name.
 //! Throws when phase is not LITE_INPUT, when io_name is not the configured
 //! discrete_input_name, or when no input with that name exists.
 std::vector<std::shared_ptr<Tensor>> NetworkImplDft::get_io_tensors(
        std::string io_name, LiteTensorPhase phase) {
    const auto& discrete_name = m_user_config->discrete_input_name;
    if (phase == LiteTensorPhase::LITE_INPUT && io_name == discrete_name) {
        for (auto&& config_in : m_network_io->inputs) {
            if (io_name == config_in.name) {
                return config_in.lite_tensors;
            }
        }
    }
    //! report both the requested and the expected name so the caller can see
    //! which one is wrong
    LITE_THROW(mgb::ssprintf(
            "tensor name %s is not the discrete input tensor name %s, or the "
            "phase is not LITE_INPUT.",
            io_name.c_str(), discrete_name.c_str()));
    return {};
 }

 std::shared_ptr<Tensor> NetworkImplDft::get_input_tensor(size_t index) {
    return get_io_tensor(get_input_name(index));
 }

 //! Look up the input name at position index and return its discrete tensors.
 std::vector<std::shared_ptr<Tensor>> NetworkImplDft::get_input_tensors(size_t index) {
    auto tensors = get_io_tensors(get_input_name(index));
    return tensors;
 }

 std::shared_ptr<Tensor> NetworkImplDft::get_output_tensor(size_t index) {
    return get_io_tensor(get_output_name(index));
 }
--- a/lite/src/mge/network_impl.h
+++ b/lite/src/mge/network_impl.h
@@ -57,9 +57,19 @@ public:
            std::string io_name,
            LiteTensorPhase phase = LiteTensorPhase::LITE_IO) override;

    //! get the network input tensors which input consists of discrete multiple tensors,
    //! layout (1, c, h, w)
    std::vector<std::shared_ptr<Tensor>> get_io_tensors(
            std::string io_name,
            LiteTensorPhase phase = LiteTensorPhase::LITE_INPUT) override;

    //! get the input tensor by index in the load_result tensormap
    std::shared_ptr<Tensor> get_input_tensor(size_t index) override;

    //! get the network input tensors which input consists of discrete multiple tensors
    //! by index
    std::vector<std::shared_ptr<Tensor>> get_input_tensors(size_t index) override;

    //! get the output tensor by index in the load_result output_var_list
    std::shared_ptr<Tensor> get_output_tensor(size_t index) override;

@@ -190,6 +200,11 @@ private:
    //! VolatileSharedDeviceTensor Opr
    void replace_dev_input_pass();

    //! if the input to the network is a list of tensors, this pass will replace
    //! the opr that supports the input of a list of tensors with the corresponding
    //! version, current support WarpPerspective
    void replace_src_discrete_input_opr_pass();

    //! check whether the model is cross compnode
    void cross_compnode_model_detect();

@@ -199,6 +214,8 @@ private:

    void update_input();
    void update_output();
    //! initialization lite_tensors when input is composed of discrete multiple tensors
    void update_input_lite_tensors();

    //! when the model info have loaded, update the config according the model
    //! info, finaly use it in compute graph
--- a/lite/src/network.cpp
+++ b/lite/src/network.cpp
@@ -127,6 +127,15 @@ std::shared_ptr<Tensor> Network::get_io_tensor(
    LITE_ERROR_HANDLER_END
 }

 //! Forward to the implementation to fetch the discrete tensors of the input
 //! called name. Asserts that the model has been loaded and that the
 //! implementation pointer is non-null.
 std::vector<std::shared_ptr<Tensor>> Network::get_io_tensors(
        std::string name, LiteTensorPhase phase) {
    LITE_ERROR_HANDLER_BEGIN
    //! the message names this function rather than the single-tensor getter
    LITE_ASSERT(m_loaded, "get_io_tensors should be used after model loaded.");
    LITE_CHECK_NON_NULL_POINTER(m_impl);
    return m_impl->get_io_tensors(name, phase);
    LITE_ERROR_HANDLER_END
 }

 std::shared_ptr<Tensor> Network::get_input_tensor(size_t index) {
    LITE_ERROR_HANDLER_BEGIN
    LITE_ASSERT(m_loaded, "get_input_tensor should be used after model loaded.");
@@ -135,6 +144,14 @@ std::shared_ptr<Tensor> Network::get_input_tensor(size_t index) {
    LITE_ERROR_HANDLER_END
 }

 //! Forward to the implementation to fetch the discrete tensors of the input
 //! at position index. Asserts that the model has been loaded and that the
 //! implementation pointer is non-null.
 std::vector<std::shared_ptr<Tensor>> Network::get_input_tensors(size_t index) {
    LITE_ERROR_HANDLER_BEGIN
    //! the message names this function rather than the single-tensor getter
    LITE_ASSERT(m_loaded, "get_input_tensors should be used after model loaded.");
    LITE_CHECK_NON_NULL_POINTER(m_impl);
    return m_impl->get_input_tensors(index);
    LITE_ERROR_HANDLER_END
 }

 std::shared_ptr<Tensor> Network::get_output_tensor(size_t index) {
    LITE_ERROR_HANDLER_BEGIN
    LITE_ASSERT(m_loaded, "get_output_tensor should be used after model loaded.");
--- a/lite/src/network_impl_base.h
+++ b/lite/src/network_impl_base.h
@@ -42,6 +42,9 @@ public:
    bool have_sync = false;
    //! Real input and output data location
    std::shared_ptr<Tensor> lite_tensor = nullptr;
    //! If the input is consists of discrete multiple tensors, lite_tensors is real
    //! input data location
    std::vector<std::shared_ptr<Tensor>> lite_tensors;

    IOInner() = default;
    IOInner(const IO& io) {
@@ -86,9 +89,22 @@ public:
    virtual std::shared_ptr<Tensor> get_io_tensor(
            std::string io_name, LiteTensorPhase phase = LiteTensorPhase::LITE_IO) = 0;

    //! get the tensors of a network input that consists of discrete multiple
    //! tensors, each with layout (1, c, h, w)
    //!
    //! This default implementation ignores both arguments and returns an empty
    //! list; implementations that support discrete inputs override it.
    virtual std::vector<std::shared_ptr<Tensor>> get_io_tensors(
            std::string io_name, LiteTensorPhase phase = LiteTensorPhase::LITE_INPUT) {
        return {};
    }

    //! get the input tensor by index in the load_result tensormap
    virtual std::shared_ptr<Tensor> get_input_tensor(size_t index) = 0;

    //! get, by input index, the tensors of a network input that consists of
    //! discrete multiple tensors
    //!
    //! This default implementation ignores index and returns an empty list;
    //! implementations that support discrete inputs override it.
    virtual std::vector<std::shared_ptr<Tensor>> get_input_tensors(size_t index) {
        return {};
    }

    //! get the output tensor by index in the load_result output_var_list
    virtual std::shared_ptr<Tensor> get_output_tensor(size_t index) = 0;

--- a/lite/test/test_network.cpp
+++ b/lite/test/test_network.cpp
@@ -1387,6 +1387,96 @@ TEST(TestNetWork, DeviceAsyncExec) {
 }

 #endif

 //! Run the model once with a single batched host input and once with the
 //! same input configured as three discrete host tensors, then compare the
 //! first output of both runs.
 TEST(TestNetWork, Discrete_Input) {
    auto data = get_input_data("./data_b3.npy");
    auto data_0 = get_input_data("./data0.npy");
    auto data_1 = get_input_data("./data1.npy");
    auto data_2 = get_input_data("./data2.npy");
    std::string model_path = "./test_discrete_input.mge";

    Config config;
    config.device_type = LiteDeviceType::LITE_CUDA;

    //! reference run: one batched input tensor
    std::shared_ptr<Network> network0 = std::make_shared<Network>(config);
    network0->load_model(model_path);

    std::shared_ptr<Tensor> data_tensor = network0->get_io_tensor("data");
    data_tensor->share_memory_with(*data);

    network0->forward();
    network0->wait();
    std::shared_ptr<Tensor> output_tensor0 = network0->get_output_tensor(0);

    //! discrete run: "data" is declared as a discrete host input of batch 3
    config.discrete_input_name = "data";
    NetworkIO ios;
    bool is_host = true;
    Layout d_ly{{3, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT};
    ios.inputs.push_back({"data", is_host, LiteIOType::LITE_IO_VALUE, d_ly});

    std::shared_ptr<Network> network1 = std::make_shared<Network>(config, ios);
    network1->load_model(model_path);

    std::vector<std::shared_ptr<Tensor>> data_tensors =
            network1->get_io_tensors("data");
    //! fail the test cleanly instead of indexing past the end of the vector
    ASSERT_EQ(data_tensors.size(), 3u);
    data_tensors[0]->share_memory_with(*data_0);
    data_tensors[1]->share_memory_with(*data_1);
    data_tensors[2]->share_memory_with(*data_2);

    network1->forward();
    network1->wait();
    std::shared_ptr<Tensor> output_tensor1 = network1->get_output_tensor(0);

    compare_lite_tensor<float>(output_tensor0, output_tensor1);
 }

 //! Same comparison as the host variant, but the three discrete tensors share
 //! memory with CUDA tensors that were filled by copying the host data.
 TEST(TestNetWork, Discrete_Input_Device) {
    auto data = get_input_data("./data_b3.npy");
    auto data_0 = get_input_data("./data0.npy");
    auto data_1 = get_input_data("./data1.npy");
    auto data_2 = get_input_data("./data2.npy");
    std::string model_path = "./test_discrete_input.mge";

    Config config;
    config.device_type = LiteDeviceType::LITE_CUDA;

    //! reference run: one batched input tensor
    std::shared_ptr<Network> network0 = std::make_shared<Network>(config);
    network0->load_model(model_path);

    std::shared_ptr<Tensor> data_tensor = network0->get_io_tensor("data");
    data_tensor->share_memory_with(*data);

    network0->forward();
    network0->wait();
    std::shared_ptr<Tensor> output_tensor0 = network0->get_output_tensor(0);

    //! discrete run: "data" is declared as a discrete device input of batch 3
    config.discrete_input_name = "data";
    NetworkIO ios;
    bool is_host = false;
    Layout d_ly{{3, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT};
    ios.inputs.push_back({"data", is_host, LiteIOType::LITE_IO_VALUE, d_ly});

    std::shared_ptr<Network> network1 = std::make_shared<Network>(config, ios);
    network1->load_model(model_path);

    std::vector<std::shared_ptr<Tensor>> data_tensors =
            network1->get_io_tensors("data");
    //! fail the test cleanly instead of indexing past the end of the vector
    ASSERT_EQ(data_tensors.size(), 3u);
    //! NOTE(review): each device tensor is created with the full batched layout
    //! d_ly; presumably copy_from adopts the layout of its source -- confirm
    auto d0_cuda = Tensor(LiteDeviceType::LITE_CUDA, d_ly);
    auto d1_cuda = Tensor(LiteDeviceType::LITE_CUDA, d_ly);
    auto d2_cuda = Tensor(LiteDeviceType::LITE_CUDA, d_ly);
    d0_cuda.copy_from(*data_0);
    d1_cuda.copy_from(*data_1);
    d2_cuda.copy_from(*data_2);
    data_tensors[0]->share_memory_with(d0_cuda);
    data_tensors[1]->share_memory_with(d1_cuda);
    data_tensors[2]->share_memory_with(d2_cuda);

    network1->forward();
    network1->wait();
    std::shared_ptr<Tensor> output_tensor1 = network1->get_output_tensor(0);

    compare_lite_tensor<float>(output_tensor0, output_tensor1);
 }
 #endif

 #if MGB_ATLAS || MGB_CAMBRICON
--- a/lite/test/test_network_c.cpp
+++ b/lite/test/test_network_c.cpp
@@ -290,6 +290,48 @@ TEST(TestCapiNetWork, GetAllNameAhead) {
    ASSERT_TRUE(ios_mem.outputs->config_layout.shapes[1] == 1000);
 }

 //! C API smoke test for discrete inputs: configure "data" as a discrete host
 //! input of batch 3, point each of the three tensors at a host buffer loaded
 //! from disk, run the network and destroy it.
 TEST(TestCapiNetWork, Discrete_Input) {
    //! host buffers for the three discrete slices
    std::vector<std::shared_ptr<lite::Tensor>> datas;
    datas.push_back(lite::get_input_data("./data0.npy"));
    datas.push_back(lite::get_input_data("./data1.npy"));
    datas.push_back(lite::get_input_data("./data2.npy"));
    //! the byte size of the first buffer is used for all three slices
    size_t data_length_in_byte = datas[0]->get_tensor_total_size_in_byte();

    //! describe "data" as a host input with a 4-dim float layout 3x3x224x224
    LiteIO input_io = default_io;
    input_io.is_host = true;
    input_io.name = "data";
    LiteLayout d_ly;
    d_ly.ndim = 4;
    d_ly.data_type = LiteDataType::LITE_FLOAT;
    std::vector<size_t> input_shape = {3, 3, 224, 224};
    for (size_t i = 0; i < d_ly.ndim; i++) {
        d_ly.shapes[i] = input_shape[i];
    }
    input_io.config_layout = d_ly;

    LiteNetworkIO network_io = *default_network_io();
    network_io.inputs = &input_io;
    network_io.input_size = 1;

    //! start from a copy of the default config and name the discrete input
    LiteConfig c_config = *default_config();
    c_config.discrete_input_name = "data";
    LiteNetwork c_network;
    LITE_CAPI_CHECK(LITE_make_network(&c_network, c_config, network_io));
    std::string model_path = "./test_discrete_input.mge";
    LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, model_path.c_str()));

    //! fetch each discrete tensor by index and point it at its host buffer
    std::vector<LiteTensor> c_data_tensors(3, nullptr);
    for (size_t i = 0; i < 3; i++) {
        LITE_CAPI_CHECK(LITE_get_io_tensors(
                c_network, "data", i, LITE_INPUT, &c_data_tensors[i]));
        LITE_CAPI_CHECK(LITE_reset_tensor_memory(
                c_data_tensors[i], datas[i]->get_memory_ptr(), data_length_in_byte));
    }

    //! NOTE(review): ForwardNetwork is a macro defined elsewhere in this file;
    //! presumably it runs and waits on c_network -- confirm before renaming
    //! any local it may refer to
    ForwardNetwork;
    LITE_CAPI_CHECK(LITE_destroy_network(c_network));
 }

 #if LITE_BUILD_WITH_RKNPU

 static int GetTop(
--- a/src/core/include/megbrain/graph/helper.h
+++ b/src/core/include/megbrain/graph/helper.h
@@ -381,7 +381,7 @@ public:
 };

 //! shortcut for calling ExtraDependencyMerger
 SymbolVarArray get_dest_vars_with_extra_deps(
 MGE_WIN_DECLSPEC_FUC SymbolVarArray get_dest_vars_with_extra_deps(
        const SymbolVarArray& dest_vars, SpecialOprStat* sopr_stat = nullptr);

 }  // namespace cg
--- a/src/gopt/include/megbrain/gopt/framework.h
+++ b/src/gopt/include/megbrain/gopt/framework.h
@@ -44,13 +44,14 @@ public:
    //! rewrite vars in a graph
    class Rewriter;

    SubGraph(const SymbolVarArray& endpoint_vars);
    MGE_WIN_DECLSPEC_FUC SubGraph(const SymbolVarArray& endpoint_vars);

    //! get the associated ComputingGraph
    ComputingGraph* comp_graph() const { return m_comp_graph; }

    //! iterate in topology order
    void iter(const Callback& cb, std::shared_ptr<ExtraDep> = nullptr) const;
    MGE_WIN_DECLSPEC_FUC void iter(
            const Callback& cb, std::shared_ptr<ExtraDep> = nullptr) const;

    //! make a Rewriter bound to this graph
    inline Rewriter make_rewriter();
@@ -99,7 +100,7 @@ public:
     * \return new operator that uses new inputs; it would be
     *      opr if no input is changed
     */
    OperatorNodeBase* auto_replace_outputs(OperatorNodeBase* opr);
    MGE_WIN_DECLSPEC_FUC OperatorNodeBase* auto_replace_outputs(OperatorNodeBase* opr);

    //! get current var: if var has been replaced, return its
    //! new value; otherwise return var itself
@@ -119,11 +120,11 @@ public:
     *
     * \param msg see OptState::on_var_replaced
     */
    void replace_var(VarNode* src, VarNode* dst, const char* msg);
    MGE_WIN_DECLSPEC_FUC void replace_var(VarNode* src, VarNode* dst, const char* msg);

    //! apply this rewriter to the owner graph and modify owner
    //! SubGraph inplace
    void apply_inplace() const;
    MGE_WIN_DECLSPEC_FUC void apply_inplace() const;
 };
 SubGraph::Rewriter SubGraph::make_rewriter() {
    return {this};
--- a/src/opr/impl/imgproc.cpp
+++ b/src/opr/impl/imgproc.cpp
@@ -160,18 +160,6 @@ void WarpPerspectiveForward::outshape_by_symvar_do_get_output_shape(
                "out2d=%s",
                imgshp.to_string().c_str(), matshp.to_string().c_str(),
                oshp2d.to_string().c_str());
        if (input().size() - m_srcs_size == 2) {
            mgb_assert(
                    m_srcs_size == matshp[0], "batchsize mismatch: img=%zu mat=%zu",
                    m_srcs_size, matshp[0]);
        } else {
            mgb_assert(input().size() - m_srcs_size == 3);
            mat_idx_shp = shpinfo.shape_inp_shp.at(m_srcs_size + 1);
            mgb_assert(
                    mat_idx_shp[0] == matshp[0] && mat_idx_shp.ndim == 1,
                    "invalid mat_idx shape: mat=%zu mat_idx=%s", matshp[0],
                    mat_idx_shp.to_string().c_str());
        }
        size_t height_idx = 0;
        if (param().format == Param::Format::NCHW) {
            height_idx = 2;
--- a/src/opr/include/megbrain/opr/imgproc.h
+++ b/src/opr/include/megbrain/opr/imgproc.h
@@ -22,7 +22,7 @@ namespace opr {
 * Impl note: this operator might have 3 or 4 inputs depending on whether
 * \p mat_idx is given
 */
 MGB_DEFINE_OPR_CLASS(
 MGB_DEFINE_OPR_CLASS_WITH_EXPORT(
        WarpPerspectiveForward,
        intl::WorkspaceSizeInfer<intl::OutshapeBySymvarSCNOpr<
                mixin::MegDNNOprHolderImpl<megdnn::WarpPerspectiveForward>>>) // {