fix(lite): fix input invalid bug in lar for fitting mode

GitOrigin-RevId: 45d81c9a96
2 years ago · fca6c76a0e
--- a/.gitattributes
+++ b/.gitattributes
@@ -23,3 +23,10 @@ imperative/python/test/unit/module/MagicMindRuntimeOprTest.GraphShapeMutable.mlu
 lite/test/resource/lite/ax_data_input.npy filter=lfs diff=lfs merge=lfs -text
 lite/test/resource/lite/ax_data_output.npy filter=lfs diff=lfs merge=lfs -text
 lite/test/resource/lite/ax_model.mge filter=lfs diff=lfs merge=lfs -text
 lite/test/resource/lite/add_demo_input.json filter=lfs diff=lfs merge=lfs -text
 lite/test/resource/lite/add_demo.mge filter=lfs diff=lfs merge=lfs -text
 lite/test/resource/lite/resnet50_b10.mdl filter=lfs diff=lfs merge=lfs -text
 lite/test/resource/lite/resnet50_input.npy filter=lfs diff=lfs merge=lfs -text
 lite/test/resource/lite/resnet50.mge filter=lfs diff=lfs merge=lfs -text
 lite/test/resource/lite/resnet50_uint8.mge filter=lfs diff=lfs merge=lfs -text
 lite/test/resource/lite/cat.ppm filter=lfs diff=lfs merge=lfs -text
--- a/lite/load_and_run/examples/example.sh
+++ b/lite/load_and_run/examples/example.sh
@@ -112,7 +112,7 @@ function prepare_model_and_data(){

    #prepare mge model
    python3 script/resnet50_mge.py --dir model_source
    python3 script/resnet50_mge.py  --dir model_source -d int8
    python3 script/resnet50_mge.py  --dir model_source -d uint8
    python3 script/resnet50_mge.py  --dir model_source --inputs "#rand(0,255)"

    #make input_data
--- a/lite/load_and_run/examples/script/add_demo.py
+++ b/lite/load_and_run/examples/script/add_demo.py
@@ -43,7 +43,7 @@ if __name__ == "__main__":
    @jit.trace(symbolic=True, capture_as_const=True)
    def fun(data):
        return net(data)
    data = tensor([3,4,5])
    data = tensor([3.0,4.0,5.0])
    fun(data)
    if args.inputs == "":
        fun.dump(
--- a/lite/load_and_run/examples/script/mge_input_data.py
+++ b/lite/load_and_run/examples/script/mge_input_data.py
@@ -26,7 +26,7 @@ cv2.imwrite("input_data/cat.ppm",processed_img)
 #json 
 data_obj = {
    "shape": [1,3],
    "type": "int32",
    "type": "float32",
    "raw": [2,3,4]
 }
 with open("input_data/add_demo_input.json", "w") as f:
--- a/lite/load_and_run/examples/script/resnet50_mge.py
+++ b/lite/load_and_run/examples/script/resnet50_mge.py
@@ -348,4 +348,4 @@ if __name__ == "__main__":
            )

    else:
        raise TypeError("dtype should be float32")
        raise TypeError("dtype should be float32 or uint8")
--- a/lite/load_and_run/src/helpers/data_parser.h
+++ b/lite/load_and_run/src/helpers/data_parser.h
@@ -18,6 +18,8 @@ struct DataParser {
    };
    void feed(const std::string& path);

    ~DataParser() { inputs.clear(); };

    std::unordered_map<std::string, mgb::HostTensorND> inputs;

 private:
--- a/lite/load_and_run/src/helpers/json_loader.cpp
+++ b/lite/load_and_run/src/helpers/json_loader.cpp
@@ -321,10 +321,10 @@ std::unique_ptr<JsonLoader::Value> JsonLoader::load(const char* path) {
    const size_t size = ftell(fin.get());
    std::fseek(fin.get(), 0, SEEK_SET);

    std::unique_ptr<char> buf(static_cast<char*>(malloc(size)));
    std::vector<char> buf(size + 1);

    auto nr = std::fread(buf.get(), 1, size, fin.get());
    auto nr = std::fread(buf.data(), 1, size, fin.get());
    mgb_assert(nr == size);

    return load(buf.get(), size);
    return load(buf.data(), size);
 }
--- a/lite/load_and_run/src/models/model_mdl.cpp
+++ b/lite/load_and_run/src/models/model_mdl.cpp
@@ -179,4 +179,41 @@ std::vector<uint8_t> ModelMdl::get_model_data() {
            mgb::serialization::GraphDumper::make(std::move(out_file), m_format.val());
    dumper->dump(m_load_result.output_var_list, config);
    return out_data;
 }

 void ModelMdl::update_io() {
    //! update output varlist when input shape maybe change(some pass excution
    //! time depends on the shape of init input)
    mgb::thin_hash_table::ThinHashMap<mgb::cg::SymbolVar, mgb::cg::SymbolVar> varmap;
    auto&& network = m_load_result;
    std::unordered_map<void*, std::string> tensor_name_map;
    for (auto& input : network.tensor_map) {
        tensor_name_map.insert({input.second->raw_ptr(), input.first});
    }
    mgb::cg::DepOprIter dep([&](mgb::cg::OperatorNodeBase* opr) {
        if (auto h2d = opr->try_cast_final<mgb::opr::Host2DeviceCopy>()) {
            if (tensor_name_map.find(h2d->host_data()->raw_ptr()) !=
                tensor_name_map.end()) {
                //! make new h2d opr with new host tensor shape
                std::string name = tensor_name_map[h2d->host_data()->raw_ptr()];
                std::shared_ptr<mgb::HostTensorND> new_tensor =
                        std::make_shared<mgb::HostTensorND>();
                new_tensor->copy_from(*h2d->host_data());

                auto h2d_opr = mgb::opr::Host2DeviceCopy::make(
                        *h2d->owner_graph(), new_tensor, h2d->param(), h2d->config());
                //! rename new h2d with given name
                h2d_opr.node()->owner_opr()->name(name);
                varmap[h2d->output(0)] = h2d_opr;
            }
        }
    });
    //! get replace var map
    for (auto&& i : network.output_var_list)
        dep.add(i);
    //! replace new h2d and update related var shape
    if (!varmap.empty()) {
        auto output_vars = mgb::cg::replace_vars(network.output_var_list, varmap);
        network.output_var_list = output_vars;
    }
 }
--- a/lite/load_and_run/src/models/model_mdl.h
+++ b/lite/load_and_run/src/models/model_mdl.h
@@ -108,6 +108,8 @@ public:

    std::vector<uint8_t> get_model_data() override;

    void update_io();

 private:
    bool share_model_mem;
    std::string model_path;
--- a/lite/load_and_run/src/options/extern_c_opr_options.cpp
+++ b/lite/load_and_run/src/options/extern_c_opr_options.cpp
@@ -18,6 +18,11 @@ void COprLibOption::config_model_internel(
                    "lite model dont't support run with external c opr "
                    "parmeter");
        }
        if (m_c_opr_init_func != MGB_C_OPR_INIT_FUNC_STR) {
            LITE_THROW(
                    "lite model dont't support to set the c_opr_init_func to another "
                    "API");
        }
    }
 }
 template <>
--- a/lite/load_and_run/src/options/io_options.cpp
+++ b/lite/load_and_run/src/options/io_options.cpp
@@ -26,32 +26,89 @@ void InputOption::config_model_internel<ModelLite>(
        auto&& parser = model->get_input_parser();
        auto&& network = model->get_lite_network();

        //! datd type map from mgb data type to lite data type
        std::map<megdnn::DTypeEnum, LiteDataType> type_map = {
                {megdnn::DTypeEnum::Float32, LiteDataType::LITE_FLOAT},
                {megdnn::DTypeEnum::Int32, LiteDataType::LITE_INT},
                {megdnn::DTypeEnum::Int8, LiteDataType::LITE_INT8},
                {megdnn::DTypeEnum::Uint8, LiteDataType::LITE_UINT8}};

        for (auto& i : parser.inputs) {
            //! get tensor information from data parser
            auto tensor = i.second;
            auto data_type = tensor.dtype();
            auto tensor_shape = tensor.shape();
            mgb::dt_byte* src = tensor.raw_ptr();

            //! set lite layout
            lite::Layout layout;
            layout.ndim = tensor_shape.ndim;
            for (size_t idx = 0; idx < tensor_shape.ndim; idx++) {
                layout.shapes[idx] = tensor_shape[idx];
        //! data type map from lite data type to mgb data type
        std::map<LiteDataType, megdnn::DTypeEnum> type_map = {
                {LiteDataType::LITE_FLOAT, megdnn::DTypeEnum::Float32},
                {LiteDataType::LITE_INT, megdnn::DTypeEnum::Int32},
                {LiteDataType::LITE_INT8, megdnn::DTypeEnum::Int8},
                {LiteDataType::LITE_UINT8, megdnn::DTypeEnum::Uint8}};

        if (m_force_batch_size > 0) {
            LITE_WARN("force set batch size to %d", m_force_batch_size);
            auto all_inputs_name = network->get_all_input_name();
            for (auto& name : all_inputs_name) {
                std::shared_ptr<lite::Tensor> input_tensor =
                        network->get_io_tensor(name);
                //! set lite layout
                lite::Layout layout;
                mgb::TensorShape new_shape;
                new_shape.ndim = input_tensor->get_layout().ndim;
                layout.ndim = input_tensor->get_layout().ndim;
                for (size_t idx = 0; idx < new_shape.ndim; idx++) {
                    new_shape.shape[idx] = input_tensor->get_layout().shapes[idx];
                    layout.shapes[idx] = new_shape.shape[idx];
                }
                new_shape.shape[0] = m_force_batch_size;
                layout.shapes[0] = m_force_batch_size;

                //! generate a tensor copied from the original tensor
                mgb::HostTensorND hv;
                hv.comp_node(mgb::CompNode::default_cpu(), true)
                        .dtype(megdnn::DType::from_enum(
                                type_map[input_tensor->get_layout().data_type]))
                        .resize(new_shape);
                mgb::dt_byte* raw_ptr = hv.raw_ptr();
                //! single batch input size
                size_t batch_stride = hv.dtype().size() * hv.layout().total_nr_elems() /
                                      m_force_batch_size;
                size_t curr_batch_size = m_force_batch_size;
                //! copy data from origin input_tensor
                size_t init_batch = input_tensor->get_layout().shapes[0];
                while (curr_batch_size > init_batch) {
                    memcpy((char*)raw_ptr, (char*)(input_tensor->get_memory_ptr()),
                           batch_stride * init_batch);
                    curr_batch_size -= init_batch;
                    raw_ptr += batch_stride * init_batch;
                }
                memcpy((char*)raw_ptr, (char*)(input_tensor->get_memory_ptr()),
                       batch_stride * curr_batch_size);

                input_tensor->reset(hv.raw_ptr(), layout);
                parser.inputs[name] = std::move(hv);
            }
            layout.data_type = type_map[data_type.enumv()];
        } else {
            for (auto& i : parser.inputs) {
                //! get tensor information from data parser
                auto tensor = i.second;
                auto tensor_shape = tensor.shape();
                mgb::dt_byte* src = tensor.raw_ptr();
                std::shared_ptr<lite::Tensor> input_tensor =
                        network->get_io_tensor(i.first);
                //! set lite layout
                lite::Layout layout;
                layout.ndim = tensor_shape.ndim;
                for (size_t idx = 0; idx < tensor_shape.ndim; idx++) {
                    layout.shapes[idx] = tensor_shape[idx];
                }
                layout.data_type = input_tensor->get_layout().data_type;

            //! set network input tensor
            std::shared_ptr<lite::Tensor> input_tensor =
                    network->get_io_tensor(i.first);
            input_tensor->reset(src, layout);
                //! set data for only given shape
                if (tensor.storage().empty()) {
                    mgb::HostTensorND hv;
                    hv.comp_node(mgb::CompNode::default_cpu(), true)
                            .dtype(megdnn::DType::from_enum(type_map[layout.data_type]))
                            .resize(tensor.shape());
                    mgb::dt_byte* raw_ptr = hv.raw_ptr();
                    //! set every byte of the tensor to 1
                    memset((char*)raw_ptr, 1,
                           hv.layout().total_nr_elems() * hv.dtype().size());
                    parser.inputs[i.first] = std::move(hv);
                    input_tensor->reset(raw_ptr, layout);
                } else {
                    //! set network input tensor
                    input_tensor->reset(src, layout);
                }
            }
        }
    }
 }
@@ -67,22 +124,58 @@ void InputOption::config_model_internel<ModelMdl>(
    } else if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) {
        auto&& parser = model->get_input_parser();
        auto&& network = model->get_mdl_load_result();
        auto tensormap = network.tensor_map;
        for (auto& i : parser.inputs) {
            mgb_assert(
                    tensormap.find(i.first) != tensormap.end(),
                    "can't find tesnor named %s", i.first.c_str());
            auto& in = tensormap.find(i.first)->second;
            if (i.second.storage().empty()) {
        auto&& tensormap = network.tensor_map;

        if (m_force_batch_size > 0) {
            mgb_log_warn("force set batch size to %d", m_force_batch_size);
            for (auto& iter : tensormap) {
                auto& in = iter.second;
                mgb::HostTensorND hv;
                mgb::TensorShape new_shape = in->shape();
                new_shape[0] = m_force_batch_size;
                hv.comp_node(mgb::CompNode::default_cpu(), true)
                        .dtype(in->dtype())
                        .resize(i.second.shape());
                        .resize(new_shape);
                mgb::dt_byte* raw_ptr = hv.raw_ptr();
                memset((char*)raw_ptr, 1, hv.layout().total_nr_elems());

                //! copy given batch data into new tensor
                size_t batch_stride = in->dtype().size() *
                                      in->layout().total_nr_elems() / (in->shape()[0]);
                size_t curr_batch_size = m_force_batch_size;

                //! copy data from origin input_tensor
                size_t init_batch = in->shape()[0];
                while (curr_batch_size > init_batch) {
                    memcpy((char*)raw_ptr, (char*)(in->raw_ptr()),
                           batch_stride * init_batch);
                    curr_batch_size -= init_batch;
                    raw_ptr += batch_stride * init_batch;
                }
                memcpy((char*)raw_ptr, (char*)(in->raw_ptr()),
                       batch_stride * curr_batch_size);
                //! set input tensor
                in->copy_from(hv);
            } else {
                in->copy_from(i.second);
                parser.inputs[iter.first] = std::move(hv);
            }
        } else {
            for (auto& i : parser.inputs) {
                mgb_assert(
                        tensormap.find(i.first) != tensormap.end(),
                        "can't find tesnor named %s", i.first.c_str());
                auto& in = tensormap.find(i.first)->second;
                if (i.second.storage().empty()) {
                    mgb::HostTensorND hv;
                    hv.comp_node(mgb::CompNode::default_cpu(), true)
                            .dtype(in->dtype())
                            .resize(i.second.shape());
                    mgb::dt_byte* raw_ptr = hv.raw_ptr();
                    memset((char*)raw_ptr, 1,
                           hv.layout().total_nr_elems() * hv.dtype().size());
                    in->copy_from(hv);
                    parser.inputs[i.first] = std::move(hv);
                } else {
                    in->copy_from(i.second);
                }
            }
        }
    }
@@ -191,6 +284,7 @@ void IOdumpOption::config_model_internel<ModelMdl>(
 using namespace lar;

 void InputOption::update() {
    data_path.clear();
    m_option_name = "input";
    size_t start = 0;
    auto end = FLAGS_input.find(";", start);
@@ -201,6 +295,7 @@ void InputOption::update() {
        end = FLAGS_input.find(";", start);
    }
    data_path.emplace_back(FLAGS_input.substr(start));
    m_force_batch_size = FLAGS_batch_size;
 }

 std::shared_ptr<lar::OptionBase> lar::InputOption::create_option() {
@@ -283,7 +378,10 @@ void IOdumpOption::config_model(
 ////////////////////// Input gflags ////////////////////////
 DEFINE_string(
        input, "", "Set up inputs data for model --input [ file_path | data_string]");

 DEFINE_int32(
        batch_size, -1,
        "set the batch size of input(especially for global layout transform "
        "optimization working on)");
 ////////////////////// OprIOdump gflags ////////////////////////

 DEFINE_string(io_dump, "", "set the io dump file path in text format");
@@ -299,4 +397,5 @@ DEFINE_string(
 DEFINE_bool(copy_to_host, false, "copy device data to host");

 REGIST_OPTION_CREATOR(input, lar::InputOption::create_option);

 REGIST_OPTION_CREATOR(iodump, lar::IOdumpOption::create_option);
--- a/lite/load_and_run/src/options/io_options.h
+++ b/lite/load_and_run/src/options/io_options.h
@@ -13,7 +13,7 @@ DECLARE_bool(io_dump_stderr);
 DECLARE_string(bin_io_dump);
 DECLARE_string(bin_out_dump);
 DECLARE_bool(copy_to_host);

 DECLARE_int32(batch_size);
 namespace lar {

 /*!
@@ -22,7 +22,7 @@ namespace lar {
 class InputOption final : public OptionBase {
 public:
    //! static function for registe options
    static bool is_valid() { return !FLAGS_input.empty(); };
    static bool is_valid() { return !FLAGS_input.empty() || FLAGS_batch_size > 0; };
    static std::shared_ptr<OptionBase> create_option();

    void config_model(
@@ -40,6 +40,7 @@ private:

    std::string m_option_name;
    std::vector<std::string> data_path;  // data string or data file path
    int32_t m_force_batch_size;
 };

 class IOdumpOption : public OptionBase {
--- a/lite/load_and_run/src/options/layout_trans_options.cpp
+++ b/lite/load_and_run/src/options/layout_trans_options.cpp
@@ -11,7 +11,7 @@ void GoptLayoutOption::config_model_internel<ModelLite>(
        RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) {
    if (runtime_param.stage == RunStage::AFTER_NETWORK_CREATED) {
        if (m_layout_transform) {
            LITE_LOG("using global layout transform optimization\n");
            LITE_LOG("using global layout transform optimization");
            if (m_layout_transform_target ==
                mgb::gopt::GraphTuningOptions::Target::CPU) {
                model->get_config().device_type = LiteDeviceType::LITE_CPU;
@@ -43,67 +43,25 @@ void GoptLayoutOption::config_model_internel<ModelMdl>(
        RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) {
    if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) {
        if (m_layout_transform) {
            mgb_log_debug("update input shape for global layout transform\n");
            auto&& load_result = model->get_mdl_load_result();
            if (m_force_batch_size > 0) {
                for (auto&& i : load_result.tensor_map) {
                    auto& in = i.second;
                    mgb::TensorShape new_shape = in->shape();
                    new_shape[0] = m_force_batch_size;
                    mgb::HostTensorND new_tensor;
                    new_tensor.comp_node(mgb::CompNode::default_cpu(), true)
                            .dtype(in->dtype())
                            .resize(new_shape);
                    mgb::dt_byte* raw_ptr = new_tensor.raw_ptr();
                    memset((char*)raw_ptr, 1, new_tensor.layout().total_nr_elems());
                    in->copy_from(new_tensor);
                }
            }
            for (auto&& item : load_result.output_var_list) {
                if (item.shape()[0] > 1) {
                    mgb_log_warn(
                            " model may be dumped with multi batch and will cost lots "
                            "of time to profile during global layout transform!!!\n");
                }
            }
            //! update output varlist when input shape maybe change(some pass excution
            //! time depends on the shape of init input)
            mgb::thin_hash_table::ThinHashMap<mgb::cg::SymbolVar, mgb::cg::SymbolVar>
                    varmap;
            mgb::cg::DepOprIter dep([&](mgb::cg::OperatorNodeBase* opr) {
                if (auto h2d = opr->try_cast_final<mgb::opr::Host2DeviceCopy>()) {
                    auto param = h2d->param();
                    mgb::TensorShape new_shape = h2d->host_data()->shape();
                    std::shared_ptr<mgb::HostTensorND> new_tensor =
                            std::make_shared<mgb::HostTensorND>(
                                    h2d->host_data()->comp_node(), new_shape,
                                    h2d->host_data()->dtype());
                    new_tensor->only_reset_raw_storage(h2d->host_data()->storage());
                    auto h2d_opr = mgb::opr::Host2DeviceCopy::make(
                            *h2d->owner_graph(), new_tensor, param, h2d->config());
                    varmap[h2d->output(0)] = h2d_opr;
                }
            });

            for (auto&& i : load_result.output_var_list)
                dep.add(i);

            if (!varmap.empty()) {
                auto output_vars =
                        mgb::cg::replace_vars(load_result.output_var_list, varmap);
                for (size_t i = 0; i < load_result.output_var_list.size(); ++i) {
                    output_vars[i].rename(
                            load_result.output_var_list[i].node()->name());
                            "of time to profile during global layout transform!!!");
                }
                load_result.output_var_list = output_vars;
            }
        }
    } else if (runtime_param.stage == RunStage::GLOBAL_OPTIMIZATION) {
        if (m_layout_transform) {
            mgb_log("using global layout transform optimization\n");
            mgb_log("using global layout transform optimization");
            auto&& load_result = model->get_mdl_load_result();
            load_result.output_var_list = mgb::gopt::layout_transform(
            auto output_vars = mgb::gopt::layout_transform(
                    load_result.output_var_list, m_layout_transform_target);
            for (size_t i = 0; i < load_result.output_var_list.size(); ++i) {
                output_vars[i].rename(load_result.output_var_list[i].node()->name());
            }
            load_result.output_var_list = output_vars;

            if (!m_layout_transform_dump_file.empty()) {
                auto out_file = mgb::serialization::OutputFile::make_fs(
@@ -176,8 +134,6 @@ void GoptLayoutOption::update() {
    }
    m_layout_transform_dump_file = FLAGS_layout_transform_dump;

    m_force_batch_size = FLAGS_layout_transform_batch_size;

    m_option = {
            {"layout_transform", lar::String::make("")},
    };
@@ -204,14 +160,6 @@ bool GoptLayoutOption::is_valid() {
        }
    }
    ret = ret || !FLAGS_layout_transform_dump.empty();
    if (FLAGS_layout_transform_batch_size > 0) {
        mgb_assert(
                FLAGS_layout_transform_batch_size > 0 &&
                        !FLAGS_layout_transform.empty(),
                "\"layout-transform-batch-size\" should be set with "
                "\"layout-transform\"");
        ret = ret || FLAGS_layout_transform_batch_size > 0;
    }
    return ret || m_valid;
 }

@@ -264,8 +212,5 @@ DEFINE_string(
        "The computing graph after global layout transform will be dumped to the given "
        "file path.");

 DEFINE_int32(
        layout_transform_batch_size, -1,
        "the batch size of input for global layout transform optimization working on");
 REGIST_OPTION_CREATOR(gopt_layout, lar::GoptLayoutOption::create_option);
 REGIST_OPTION_VALIDATER(gopt_layout, lar::GoptLayoutOption::set_valid);
--- a/lite/load_and_run/src/options/layout_trans_options.h
+++ b/lite/load_and_run/src/options/layout_trans_options.h
@@ -5,7 +5,6 @@
 #include "models/model.h"
 #include "option_base.h"
 DECLARE_string(layout_transform);
 DECLARE_int32(layout_transform_batch_size);
 DECLARE_string(layout_transform_dump);

 namespace lar {
@@ -41,6 +40,5 @@ private:
    mgb::gopt::GraphTuningOptions::Target m_layout_transform_target;
    static bool m_valid;
    OptionValMap m_option;
    int32_t m_force_batch_size;
 };
 }  // namespace lar
--- a/lite/load_and_run/src/options/plugin_options.cpp
+++ b/lite/load_and_run/src/options/plugin_options.cpp
@@ -199,7 +199,7 @@ void DebugOption::format_and_print(

    std::stringstream ss;
    ss << table;
    LITE_LOG("%s\n\n", ss.str().c_str());
    LITE_LOG("\n%s\n", ss.str().c_str());
 }

 template <>
@@ -243,7 +243,7 @@ void DebugOption::format_and_print(

    std::stringstream ss;
    ss << table;
    mgb_log("%s\n\n", ss.str().c_str());
    mgb_log("\n%s\n", ss.str().c_str());
 }

 template <>
--- a/lite/load_and_run/src/options/strategy_options.cpp
+++ b/lite/load_and_run/src/options/strategy_options.cpp
@@ -32,13 +32,19 @@ void StrategyOption::config_model(
        runtime_param.run_iter = run_iter;
        runtime_param.threads = threads;
        runtime_param.testcase_num = 1;
    } else if (runtime_param.stage == RunStage::UPDATE_IO) {
        if (model->type() == ModelType::MEGDL_MODEL) {
            auto model_ptr = std::static_pointer_cast<ModelMdl>(model);
            //! update input and output related varnode
            model_ptr->update_io();
        }
    } else if (runtime_param.stage == RunStage::BEFORE_OUTSPEC_SET) {
        if (model->type() == ModelType::MEGDL_MODEL) {
            auto model_ptr = std::static_pointer_cast<ModelMdl>(model);
            auto num = model_ptr->get_testcase_num();
            if (num != 0)
                runtime_param.testcase_num = num;

            //! make output specification
            model_ptr->make_output_spec();
        }
    }
--- a/lite/load_and_run/src/strategys/strategy_fitting.cpp
+++ b/lite/load_and_run/src/strategys/strategy_fitting.cpp
@@ -205,9 +205,9 @@ void OptionsTimeProfiler::profile_with_given_options(
        //! after load configure
        auto config_model_before_runing = [&]() {
            for (auto stage :
                 {RunStage::AFTER_MODEL_LOAD, RunStage::GLOBAL_OPTIMIZATION,
                  RunStage::BEFORE_OUTSPEC_SET, RunStage::AFTER_OUTSPEC_SET,
                  RunStage::MODEL_RUNNING}) {
                 {RunStage::AFTER_MODEL_LOAD, RunStage::UPDATE_IO,
                  RunStage::GLOBAL_OPTIMIZATION, RunStage::BEFORE_OUTSPEC_SET,
                  RunStage::AFTER_OUTSPEC_SET, RunStage::MODEL_RUNNING}) {
                runtime_param.stage = stage;
                stage_config_model();
            }
@@ -453,9 +453,9 @@ void FittingStrategy::dump_best_options_with_model() {

    //! get model binary data after optimized
    for (auto stage :
         {RunStage::AFTER_MODEL_LOAD, RunStage::GLOBAL_OPTIMIZATION,
          RunStage::BEFORE_OUTSPEC_SET, RunStage::AFTER_OUTSPEC_SET,
          RunStage::MODEL_RUNNING}) {
         {RunStage::AFTER_MODEL_LOAD, RunStage::UPDATE_IO,
          RunStage::GLOBAL_OPTIMIZATION, RunStage::BEFORE_OUTSPEC_SET,
          RunStage::AFTER_OUTSPEC_SET, RunStage::MODEL_RUNNING}) {
        runtime_param.stage = stage;
        stage_config_model();
    }
@@ -502,9 +502,9 @@ void FittingStrategy::AutoCleanFile::dump_model() {
    model->load_model();
    //! get model binary data after optimized
    for (auto stage :
         {RunStage::AFTER_MODEL_LOAD, RunStage::GLOBAL_OPTIMIZATION,
          RunStage::BEFORE_OUTSPEC_SET, RunStage::AFTER_OUTSPEC_SET,
          RunStage::MODEL_RUNNING}) {
         {RunStage::AFTER_MODEL_LOAD, RunStage::UPDATE_IO,
          RunStage::GLOBAL_OPTIMIZATION, RunStage::BEFORE_OUTSPEC_SET,
          RunStage::AFTER_OUTSPEC_SET, RunStage::MODEL_RUNNING}) {
        runtime_param.stage = stage;
        stage_config_model();
    }
--- a/lite/load_and_run/src/strategys/strategy_normal.cpp
+++ b/lite/load_and_run/src/strategys/strategy_normal.cpp
@@ -53,8 +53,9 @@ void NormalStrategy::run_subline() {
    //! after load configure
    auto config_after_load = [&]() {
        for (auto stage :
             {RunStage::AFTER_MODEL_LOAD, RunStage::GLOBAL_OPTIMIZATION,
              RunStage::BEFORE_OUTSPEC_SET, RunStage::AFTER_OUTSPEC_SET}) {
             {RunStage::AFTER_MODEL_LOAD, RunStage::UPDATE_IO,
              RunStage::GLOBAL_OPTIMIZATION, RunStage::BEFORE_OUTSPEC_SET,
              RunStage::AFTER_OUTSPEC_SET}) {
            m_runtime_param.stage = stage;
            stage_config_model();
        }
--- a/lite/test/test_io_options.cpp
+++ b/lite/test/test_io_options.cpp
@@ -0,0 +1,77 @@
 #include <gtest/gtest.h>
 #include <string.h>
 #include <memory>
 #include "test_options.h"

 using namespace lar;
 DECLARE_bool(lite);
 DECLARE_string(input);
 DECLARE_int32(batch_size);
 DECLARE_int32(iter);
 //! Scoped wrappers for the gflags used by the tests in this file. The string and
 //! int32 wrappers take the value that their destructor writes back to the flag
 //! as second argument.
 namespace {
 STRING_OPTION_WRAP(input, "");
 INT32_OPTION_WRAP(batch_size, -1);
 BOOL_OPTION_WRAP(lite);
 INT32_OPTION_WRAP(iter, 10);
 }  // anonymous namespace

 TEST(TestLarIO, INPUT) {
    //! FLAGS_iter is 1 for the whole test body
    DEFINE_INT32_WRAP(iter, 1);
    {
        //! input read from a .npy file
        const std::string model_path{"./resnet50.mge"};
        TEST_STRING_OPTION(input, "data:./resnet50_input.npy");
    }
    {
        //! input read from a .json file
        const std::string model_path{"./add_demo.mge"};
        TEST_STRING_OPTION(input, "data:add_demo_input.json");
    }
    {
        //! input read from a .ppm file
        const std::string model_path{"./resnet50_uint8.mge"};
        TEST_STRING_OPTION(input, "data:./cat.ppm");
    }
    {
        //! input written inline in square brackets
        const std::string model_path{"./add_demo.mge"};
        TEST_STRING_OPTION(input, "data:[2.0,3.0,4.0]");
    }
    {
        //! input written inline in curly braces
        //! NOTE(review): presumably a shape-only input - confirm with DataParser
        const std::string model_path{"./shufflenet.mge"};
        TEST_STRING_OPTION(input, "data:{2,3,224,224}");
    }
    {
        //! no --input at all, only the batch_size flag is set
        const std::string model_path{"./resnet50_b10.mdl"};
        TEST_INT32_OPTION(batch_size, 1);
        TEST_INT32_OPTION(batch_size, 5);
        TEST_INT32_OPTION(batch_size, 11);
    }
 }

 TEST(TestLarIO, INPUT_LITE) {
    //! FLAGS_iter is 1 for the whole test body
    DEFINE_INT32_WRAP(iter, 1);
    //! wrap the lite flag for the whole test body
    DEFINE_BOOL_WRAP(lite);
    {
        //! input read from a .npy file
        const std::string model_path{"./resnet50.mge"};
        TEST_STRING_OPTION(input, "data:./resnet50_input.npy");
    }
    {
        //! input read from a .json file
        const std::string model_path{"./add_demo.mge"};
        TEST_STRING_OPTION(input, "data:add_demo_input.json");
    }
    {
        //! input read from a .ppm file
        const std::string model_path{"./resnet50_uint8.mge"};
        TEST_STRING_OPTION(input, "data:./cat.ppm");
    }
    {
        //! input written inline in square brackets
        const std::string model_path{"./add_demo.mge"};
        TEST_STRING_OPTION(input, "data:[2.0,3.0,4.0]");
    }
    {
        //! input written inline in curly braces
        //! NOTE(review): presumably a shape-only input - confirm with DataParser
        const std::string model_path{"./shufflenet.mge"};
        TEST_STRING_OPTION(input, "data:{2,3,224,224}");
    }
    {
        //! no --input at all, only the batch_size flag is set
        const std::string model_path{"./resnet50_b10.mdl"};
        TEST_INT32_OPTION(batch_size, 1);
        TEST_INT32_OPTION(batch_size, 5);
        TEST_INT32_OPTION(batch_size, 11);
    }
 }
--- a/lite/test/test_lar_options.cpp
+++ b/lite/test/test_lar_options.cpp
@@ -24,7 +24,7 @@ BOOL_OPTION_WRAP(cuda);
 }  // anonymous namespace

 TEST(TestLarOption, OPTIMIZE_FOR_INFERENCE) {
    DEFINE_WRAP(cpu);
    DEFINE_BOOL_WRAP(cpu);
    std::string model_path = "./shufflenet.mge";

    TEST_BOOL_OPTION(optimize_for_inference);
@@ -33,7 +33,7 @@ TEST(TestLarOption, OPTIMIZE_FOR_INFERENCE) {
 #if LITE_WITH_OPENCL
 TEST(TestLarOption, OPTIMIZE_FOR_INFERENCE_OPENCL) {
    REQUIRE_OPENCL();
    DEFINE_WRAP(opencl);
    DEFINE_BOOL_WRAP(opencl);
    std::string model_path = "./shufflenet.mge";

    TEST_BOOL_OPTION(optimize_for_inference);
@@ -43,7 +43,7 @@ TEST(TestLarOption, OPTIMIZE_FOR_INFERENCE_OPENCL) {
 #if LITE_WITH_CUDA
 TEST(TestLarOption, OPTIMIZE_FOR_INFERENCE_CUDA) {
    REQUIRE_CUDA();
    DEFINE_WRAP(cuda);
    DEFINE_BOOL_WRAP(cuda);
    std::string model_path = "./shufflenet.mge";

    TEST_BOOL_OPTION(optimize_for_inference);
--- a/lite/test/test_layout_options.cpp
+++ b/lite/test/test_layout_options.cpp
@@ -20,6 +20,7 @@ DECLARE_bool(enable_nchw64);
 DECLARE_bool(enable_nhwcd4);
 DECLARE_bool(enable_nchw44_dot);
 DECLARE_bool(fast_run);
 DECLARE_int32(iter);
 namespace {
 BOOL_OPTION_WRAP(enable_nchw4);
 BOOL_OPTION_WRAP(enable_chwn4);
@@ -30,6 +31,7 @@ BOOL_OPTION_WRAP(enable_nchw64);
 BOOL_OPTION_WRAP(enable_nhwcd4);
 BOOL_OPTION_WRAP(enable_nchw44_dot);
 BOOL_OPTION_WRAP(fast_run);
 INT32_OPTION_WRAP(iter, 10);

 BOOL_OPTION_WRAP(lite);
 BOOL_OPTION_WRAP(cpu);
@@ -39,7 +41,8 @@ BOOL_OPTION_WRAP(cuda);
 }  // anonymous namespace

 TEST(TestLarLayout, X86_CPU) {
    DEFINE_WRAP(cpu);
    DEFINE_INT32_WRAP(iter, 1);
    DEFINE_BOOL_WRAP(cpu);
    std::string model_path = "./shufflenet.mge";

    TEST_BOOL_OPTION(enable_nchw4);
@@ -52,8 +55,9 @@ TEST(TestLarLayout, X86_CPU) {
 }

 TEST(TestLarLayout, X86_CPU_LITE) {
    DEFINE_WRAP(cpu);
    DEFINE_WRAP(lite);
    DEFINE_INT32_WRAP(iter, 1);
    DEFINE_BOOL_WRAP(cpu);
    DEFINE_BOOL_WRAP(lite);
    std::string model_path = "./shufflenet.mge";

    TEST_BOOL_OPTION(enable_nchw4);
@@ -65,18 +69,20 @@ TEST(TestLarLayout, X86_CPU_LITE) {
 }

 TEST(TestLarLayoutFastRun, CPU_LITE) {
    DEFINE_WRAP(cpu);
    DEFINE_WRAP(lite);
    DEFINE_INT32_WRAP(iter, 1);
    DEFINE_BOOL_WRAP(cpu);
    DEFINE_BOOL_WRAP(lite);
    std::string model_path = "./shufflenet.mge";
    {
        DEFINE_WRAP(enable_nchw44);
        DEFINE_WRAP(fast_run);
        DEFINE_BOOL_WRAP(enable_nchw44);
        DEFINE_BOOL_WRAP(fast_run);
        run_NormalStrategy(model_path);
    }
 }
 #if LITE_WITH_CUDA
 TEST(TestLarLayout, CUDA) {
    DEFINE_WRAP(cuda);
    DEFINE_INT32_WRAP(iter, 1);
    DEFINE_BOOL_WRAP(cuda);
    std::string model_path = "./shufflenet.mge";
    TEST_BOOL_OPTION(enable_nchw4);
    TEST_BOOL_OPTION(enable_chwn4);
@@ -87,8 +93,9 @@ TEST(TestLarLayout, CUDA) {
 }

 TEST(TestLarLayout, CUDA_LITE) {
    DEFINE_WRAP(cuda);
    DEFINE_WRAP(lite);
    DEFINE_INT32_WRAP(iter, 1);
    DEFINE_BOOL_WRAP(cuda);
    DEFINE_BOOL_WRAP(lite);
    std::string model_path = "./shufflenet.mge";

    TEST_BOOL_OPTION(enable_nchw4);
--- a/lite/test/test_options.h
+++ b/lite/test/test_options.h
@@ -23,11 +23,35 @@ void run_NormalStrategy(std::string model_path);
        ~BoolOptionWrap_##option() { FLAGS_##option = false; } \
    };

 #define DEFINE_WRAP(option) BoolOptionWrap_##option flags_##option;
 //! Declare struct StringOptionWrap_<option>: its constructor assigns the given
 //! C string to FLAGS_<option> and its destructor assigns default_val back, so
 //! the flag only carries the test value while an instance is alive.
 #define STRING_OPTION_WRAP(option, default_val)                              \
    struct StringOptionWrap_##option {                                       \
        StringOptionWrap_##option(const char* val) { FLAGS_##option = val; } \
        ~StringOptionWrap_##option() { FLAGS_##option = default_val; }       \
    };

 //! Declare struct Int32OptionWrap_<option>: its constructor assigns the given
 //! int32_t to FLAGS_<option> and its destructor assigns default_val back, so
 //! the flag only carries the test value while an instance is alive.
 #define INT32_OPTION_WRAP(option, default_val)                          \
    struct Int32OptionWrap_##option {                                   \
        Int32OptionWrap_##option(int32_t val) { FLAGS_##option = val; } \
        ~Int32OptionWrap_##option() { FLAGS_##option = default_val; }   \
    };
 //! Instantiate a wrapper object named flags_<option> in the current scope. The
 //! wrapper type must have been declared with the matching *_OPTION_WRAP macro;
 //! the string and int32 variants forward `value` to the wrapper constructor.
 #define DEFINE_BOOL_WRAP(option) BoolOptionWrap_##option flags_##option;
 #define DEFINE_STRING_WRAP(option, value) \
    StringOptionWrap_##option flags_##option(value);
 #define DEFINE_INT32_WRAP(option, value) Int32OptionWrap_##option flags_##option(value);

 #define TEST_BOOL_OPTION(option)        \
    {                                   \
        DEFINE_WRAP(option);            \
        DEFINE_BOOL_WRAP(option);       \
        run_NormalStrategy(model_path); \
    }
 //! Open a scope, set the string flag `option` to `value` through its wrapper and
 //! call run_NormalStrategy(model_path). A variable named model_path must be
 //! visible at the expansion site.
 #define TEST_STRING_OPTION(option, value)  \
    {                                      \
        DEFINE_STRING_WRAP(option, value); \
        run_NormalStrategy(model_path);    \
    }
 //! Open a scope, set the int32 flag `option` to `value` through its wrapper and
 //! call run_NormalStrategy(model_path). A variable named model_path must be
 //! visible at the expansion site.
 #define TEST_INT32_OPTION(option, value)  \
    {                                     \
        DEFINE_INT32_WRAP(option, value); \
        run_NormalStrategy(model_path);   \
    }
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
--- a/src/gopt/impl/framework.cpp
+++ b/src/gopt/impl/framework.cpp
@@ -64,7 +64,8 @@ OperatorNodeBase* SubGraph::Rewriter::auto_replace_outputs(OperatorNodeBase* opr
            bool v0 = out0[i]->contain_flag(VarNode::Flag::VOLATILE_CONTENT),
                 v1 = out1[i]->contain_flag(VarNode::Flag::VOLATILE_CONTENT);
            mgb_assert(v0 == v1, "%s", err_msg().c_str());

            //! rename new var
            out1[i]->name(out0[i]->cname());
            auto&& ins = m_varmap.insert({out0[i], {true, nullptr}});
            mgb_assert(
                    ins.second || ins.first->second.first,