/**
 * \file src/opr/test/tensor_manip.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/blas.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/misc.h"
#include "megbrain/opr/tensor_gen.h"
#include "megbrain/opr/utility.h"
#include "megbrain/test/autocheck.h"
#include "megbrain/test/helper.h"
#include "megbrain/test/megdnn_helper.h"
#include "megbrain/utils/arith_helper.h"

using namespace mgb;
using namespace opr;

TEST(TestTensorManip, GetVarShape) {
    HostTensorGenerator<> gen;
    auto host_x = gen({3, 1}), host_y = gen({1, 2});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Host2DeviceCopy::make(*graph, host_y),
         z0 = opr::GetVarShape::make({x, y, x.make_scalar(5)}),
         z1 = opr::GetVarShape::make({x, y}, 1);
    // ensure scalar is removed
    ASSERT_EQ(2u, z0.node()->owner_opr()->input().size());
    constexpr auto tdt = cg::OperatorNodeBase::NodeProp::DepType::SHAPE;
    auto&& dt = z0.node()->owner_opr()->node_prop().dep_map();
    ASSERT_EQ(2u, dt.size());
    ASSERT_EQ(tdt, dt.at(x.node()));
    ASSERT_EQ(tdt, dt.at(y.node()));
    auto as_shp = [](const HostTensorND& hv) {
        mgb_assert(hv.dtype() == dtype::Int32());
        mgb_assert(hv.shape().ndim == 1);
        TensorShape ret;
        ret.ndim = hv.shape()[0];
        auto p = hv.ptr<int>();
        for (size_t i = 0; i < ret.ndim; ++i)
            ret[i] = p[i];
        return ret;
    };
    HostTensorND host_z0, host_z1;
    auto func = graph->compile(
            {make_callback_copy(z0, host_z0), make_callback_copy(z1, host_z1)});
    func->execute();
    ASSERT_EQ(TensorShape({3, 2}), as_shp(host_z0));
    ASSERT_EQ(TensorShape({2}), as_shp(host_z1));
    *host_x = *gen({5, 1, 6});
    *host_y = *gen({1, 8, 1});
    func->execute();
    ASSERT_EQ(TensorShape({5, 8, 6}), as_shp(host_z0));
    ASSERT_EQ(TensorShape({8}), as_shp(host_z1));
}
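
// GetVarShape of Reshape(x, tshp) should be bypassed to tshp itself instead
// of creating a new shape operator.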
TEST(TestTensorManip, GetVarShapeBypass) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, gen({3, 2})),
         t = opr::Host2DeviceCopy::make(*graph, gen({2, 3})),
         tshp = opr::GetVarShape::make(t),
         y = opr::GetVarShape::make(opr::Reshape::make(x, tshp));
    ASSERT_EQ(tshp, y);
}

TEST(TestTensorManip, GetVarShapeNegativeAxis) {
    HostTensorGenerator<> gen;
    auto host_x = gen({1, 3}), host_y = gen({2, 1});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Host2DeviceCopy::make(*graph, host_y),
         z0 = opr::GetVarShape::make({x, y}, -1),
         z1 = opr::GetVarShape::make({x, y}, -2);
    // check shape dep types
    ASSERT_EQ(2u, z0.node()->owner_opr()->input().size());
    constexpr auto tdt = cg::OperatorNodeBase::NodeProp::DepType::SHAPE;
    auto&& dt = z0.node()->owner_opr()->node_prop().dep_map();
    ASSERT_EQ(2u, dt.size());
    ASSERT_EQ(tdt, dt.at(x.node()));
    ASSERT_EQ(tdt, dt.at(y.node()));
    auto as_shp = [](const HostTensorND& hv) {
        mgb_assert(hv.dtype() == dtype::Int32());
        mgb_assert(hv.shape().ndim == 1);
        TensorShape ret;
        ret.ndim = hv.shape()[0];
        auto p = hv.ptr<int>();
        for (size_t i = 0; i < ret.ndim; ++i)
            ret[i] = p[i];
        return ret;
    };
    HostTensorND host_z0, host_z1;
    auto func = graph->compile(
            {make_callback_copy(z0, host_z0), make_callback_copy(z1, host_z1)});
    func->execute();
    ASSERT_EQ(TensorShape({3}), as_shp(host_z0));
    ASSERT_EQ(TensorShape({2}), as_shp(host_z1));
    *host_x = *gen({5, 1, 6});
    *host_y = *gen({1, 8, 1});
    func->execute();
    ASSERT_EQ(TensorShape({6}), as_shp(host_z0));
    ASSERT_EQ(TensorShape({8}), as_shp(host_z1));
}

TEST(TestTensorManip, Reshape) {
    constexpr size_t N = 123, C = 456;
    HostTensorGenerator<> gen;
    auto host_opr0 = gen({N * C}), host_opr1 = gen({N, C});
    auto graph = ComputingGraph::make();
    SymbolVar opr0 = opr::Host2DeviceCopy::make(*graph, host_opr0, {"opr0"}),
              opr1 = opr::Host2DeviceCopy::make(*graph, host_opr1, {"opr1"}),
              opr0_reshp = opr::Reshape::make(opr0, opr::GetVarShape::make(opr1)),
              sum = opr::add(opr0_reshp, opr1);
    {
        // check dep type
        auto op = opr0_reshp.node()->owner_opr();
        auto&& dep_map = opr0_reshp.node()->owner_opr()->node_prop().dep_map();
        using DT = cg::OperatorNodeBase::NodeProp::DepType;
        ASSERT_EQ(2u, dep_map.size());
        ASSERT_EQ(DT::DEV_VALUE | DT::VALUE_ALLOW_EMPTY, dep_map.at(op->input(0)));
        ASSERT_EQ(DT::HOST_VALUE, dep_map.at(op->input(1)));
    }
    HostTensorND host_sum;
    auto func = graph->compile({make_callback_copy(sum, host_sum)});
    func->execute();
    ASSERT_TRUE(cg::is_static_var_storage(opr0_reshp.node()));
    ASSERT_FALSE(host_sum.layout().eq_layout(host_opr0->layout()));
    ASSERT_TRUE(host_sum.layout().eq_layout(host_opr1->layout()));
    ASSERT_EQ(dev_ptr(opr0), dev_ptr(opr0_reshp));
    auto o0 = host_opr0->ptr<float>(), o1 = host_opr1->ptr<float>(),
         s = host_sum.ptr<float>();
    for (size_t i = 0, it = host_opr0->layout().total_nr_elems(); i < it; i++) {
        MGB_ASSERT_FLOAT_EQ(o0[i] + o1[i], s[i])
                << ssprintf("failed opr0(%.5f)+opr1(%.5f) at %zd", o0[i], o1[i], i);
    }
}

TEST(TestTensorManip, ReshapeNoncontigValueInfer) {
    HostTensorGenerator<> gen;
    auto host_x = gen({2, 1});
    auto graph = ComputingGraph::make();
    auto x = opr::ImmutableTensor::make(*graph, *host_x), y = x.broadcast({2, 2}),
         z = opr::Reshape::make(y, {1, 0}, 1);
    auto&& mgr = graph->static_infer_manager();
    ASSERT_EQ(cg::static_infer::InferType::CONST, mgr.get_infer_type(z.node()).value);
    auto zv = mgr.infer_value(z.node());
    auto xp = host_x->ptr<float>(), zp = zv.ptr<float>();
    for (int i = 0; i < 2; ++i) {
        for (int j = 0; j < 2; ++j) {
            ASSERT_EQ(xp[i], zp[i * 2 + j]);
        }
    }
    ASSERT_THROW(opr::Reshape::make(y, {3, 0}, 1), TensorReshapeError);
    ASSERT_THROW(opr::Reshape::make(y, {3, 2}), TensorReshapeError);
}

TEST(TestTensorManip, ReshapeSameShapeBypass) {
    HostTensorGenerator<> gen;
    auto host_x = gen({2, 3});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x), x1 = x.reshape({6}),
         x2 = x1.reshape({6}), x3 = x.reshape(opr::GetVarShape::make(x));
    ASSERT_EQ(x1.node(), x2.node());
    ASSERT_EQ(x.node(), x3.node());
    ASSERT_NE(x.node(), x1.node());
}
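
// Reshape of a contiguous var forwards device memory without a copy, so the
// following elemwise sum computes in place on the reshaped buffer.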
TEST(TestTensorManip, ReshapeAndInplace) {
    constexpr size_t C = 456;
    HostTensorGenerator<> gen;
    auto host_opr0 = gen({C}), host_opr1 = gen({C / 2, 2});
    auto graph = ComputingGraph::make();
    SymbolVar opr0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_opr0),
              opr1 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_opr1),
              reshape = opr::Reshape::make(opr0, TensorShape{C / 2, 2}),
              sum = reshape + opr1;
    opr1.node()->add_flag(cg::VarNode::Flag::NO_MEM_RECLAIM);
    HostTensorND host_sum(CompNode::load("xpu0"));
    auto func = graph->compile({make_callback_copy(sum, host_sum)});
    func->execute();
    ASSERT_EQ(dev_ptr(reshape), dev_ptr(sum));
    // assert contiguous layout
    ASSERT_EQ(host_opr1->layout(), host_sum.layout());
    auto o0 = host_opr0->ptr<float>(), o1 = host_opr1->ptr<float>(),
         s = host_sum.sync().ptr<float>();
    for (size_t i = 0, it = host_opr0->layout().total_nr_elems(); i < it; ++i) {
        MGB_ASSERT_FLOAT_EQ(o0[i] + o1[i], s[i])
                << ssprintf("failed opr0(%.5f)+opr1(%.5f) at %zd", o0[i], o1[i], i);
    }
}
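
// Reshape to a dynamically computed target shape: the reshaped var loses its
// static shape, but the gradient w.r.t. x still has x's static shape.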
TEST(TestTensorManip, DynamicReshape) {
    HostTensorGenerator<> gen;
    auto host_x = gen({3, 4}),
         host_tshp = std::make_shared<HostTensorND>(
                 host_x->comp_node(), dtype::Int32());
    host_tshp->resize({1}).ptr<int>()[0] = 12;
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x).rename("x"),
         x_rshp_shp = opr::MarkDynamicVar::make(
                 opr::Host2DeviceCopy::make(*graph, host_tshp).rename("x_rshp_shp")),
         x_rshp = opr::Reshape::make(x, x_rshp_shp).rename("x_rshp"),
         x_flat = x_rshp.flatten(),
         gx = cg::grad(opr::Dot::make(x_flat, x_flat).rename("loss"), x).rename("gx");
    ASSERT_FALSE(cg::is_static_var_shape(x_rshp.node()));
    ASSERT_TRUE(cg::is_static_var_shape(gx.node()));
    ASSERT_EQ(host_x->shape(), gx.node()->shape());
    HostTensorND host_rshp, host_gx;
    auto func = graph->compile(
            {make_callback_copy(x_rshp, host_rshp), make_callback_copy(gx, host_gx)});
    auto check = [&](const TensorShape& ishp, const TensorShape& tshp) {
        host_x->copy_from(*gen(ishp));
        {
            DeviceTensorND tmp;
            cg::copy_shape_to_tensor_value(tmp, tshp);
            host_tshp->copy_from(tmp);
        }
        func->execute();
        ASSERT_EQ(tshp, host_rshp.shape());
        ASSERT_EQ(host_x->shape(), host_gx.shape());
        for (size_t i = 0, it = host_x->shape().total_nr_elems(); i < it; ++i)
            MGB_ASSERT_FLOAT_EQ(host_x->ptr<float>()[i] * 2, host_gx.ptr<float>()[i]);
    };
    check({3, 4}, {12});
    check({5, 3}, {15});
    check({3, 4, 35}, {21, 20});
}

TEST(TestTensorManip, ReshapeWithUnspec) {
    HostTensorGenerator<> gen;
    auto host_x = gen({4, 8});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Reshape::make(x, {1, 8}, 0);
    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    for (size_t ishp : {1, 5, 6}) {
        host_x->copy_from(*gen({ishp * 8}));
        func->execute();
        TensorShape expect_shape({ishp, 8});
        ASSERT_EQ(expect_shape, host_y.shape());
        MGB_ASSERT_TENSOR_EQ(
                host_x->sub(SubTensorSpec::make_from_layout(
                        host_x->layout().reshape(expect_shape))),
                host_y);
    }
}
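
// Even when the input var itself is dynamic, the output shape is statically
// inferred as long as the target shape can be statically inferred.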
TEST(TestTensorManip, ReshapeInferShapeForDynamicInput) {
    constexpr size_t N0 = 2, C0 = 3;
    HostTensorGenerator<> gen;
    auto host_x = gen({N0, C0}), host_tshp = gen({1});
    auto graph = ComputingGraph::make();
    host_tshp->ptr<float>()[0] = N0 * C0;
    SymbolVar x = opr::Host2DeviceCopy::make(*graph, host_x),
              xd = opr::MarkDynamicVar::make(x),
              tshp = opr::Host2DeviceCopy::make(*graph, host_tshp),
              y0 = opr::Reshape::make(xd, tshp) + 1,
              y1 = opr::Reshape::make(xd, opr::GetVarShape::make(x)) + 2;
    ASSERT_EQ(y0.shape(), TensorShape({N0 * C0}));
    ASSERT_EQ(y1.shape(), TensorShape({N0, C0}));
    HostTensorND host_y0, host_y1;
    auto func = graph->compile(
            {make_callback_copy(y0, host_y0), make_callback_copy(y1, host_y1)});
    auto run = [&](const TensorShape& ishp) {
        auto tot = ishp.total_nr_elems();
        host_x->copy_from(*gen(ishp));
        host_tshp->ptr<float>()[0] = tot;
        func->execute();
        ASSERT_EQ(host_y0.shape(), TensorShape({tot}));
        ASSERT_EQ(host_y1.shape(), ishp);
        for (size_t i = 0; i < tot; ++i) {
            ASSERT_EQ(host_x->ptr<float>()[i] + 1, host_y0.ptr<float>()[i]);
            ASSERT_EQ(host_x->ptr<float>()[i] + 2, host_y1.ptr<float>()[i]);
        }
    };
    run({3, 2});
    run({23, 12, 5});
}
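
// Reshape an empty var (CondTake with an always-false predicate) to a shape
// with a zero-sized axis; concat along that axis then degenerates to v.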
TEST(TestTensorManip, ReshapeEmptyShape) {
    HostTensorGenerator<> gen;
    constexpr size_t x_length = 233;
    auto host_x = gen({x_length}), host_v = gen({2, 3, 3, 3});
    for (size_t i = 0; i < x_length; ++i) {
        host_x->ptr<float>()[i] = 1.f;
    }
    constexpr auto INVALID_AXIS = opr::Reshape::Param::INVALID_AXIS;
    for (auto unspec_axis : {INVALID_AXIS, 0, 1, 3}) {
        auto graph = ComputingGraph::make();
        graph->options().graph_opt_level = 0;
        TensorShape tshape{2, 3, 3, 3};
        auto zero_axis = unspec_axis;
        if (unspec_axis == INVALID_AXIS) {
            tshape[zero_axis = 2] = 0;
        }
        using CondTakeMode = opr::CondTake::Param::Mode;
        auto x = opr::Host2DeviceCopy::make(*graph, host_x),
             x_empty = opr::CondTake::make(x, x, {CondTakeMode::EQ, 0.f})[0],
             v = opr::Host2DeviceCopy::make(*graph, host_v),
             x_reshape = opr::Reshape::make(x_empty, tshape, {unspec_axis}),
             y = opr::Concat::make({x_reshape, v}, zero_axis);
        HostTensorND host_empty, host_y;
        auto func = graph->compile(
                {make_callback_copy(x_reshape, host_empty),
                 make_callback_copy(y, host_y)});
        func->execute().wait();
        ASSERT_TRUE(host_empty.layout().is_empty());
        MGB_ASSERT_TENSOR_EQ(*host_v, host_y);
    }
}

TEST(TestTensorManip, ReshapeWithNegativeUnspec) {
    HostTensorGenerator<> gen;
    auto host_x = gen({4, 8});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Reshape::make(x, {1, 8}, -2);
    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    for (size_t ishp : {1, 5, 6}) {
        host_x->copy_from(*gen({ishp * 8}));
        func->execute();
        TensorShape expect_shape({ishp, 8});
        ASSERT_EQ(expect_shape, host_y.shape());
        MGB_ASSERT_TENSOR_EQ(
                host_x->sub(SubTensorSpec::make_from_layout(
                        host_x->layout().reshape(expect_shape))),
                host_y);
    }
}

TEST(TestTensorManip, Broadcast) {
    constexpr size_t N = 20, C = 30;
    HostTensorGenerator<> gen;
    auto host_opr0 = gen({1, 1}), host_opr1 = gen({N, C});
    auto graph = ComputingGraph::make();
    SymbolVar opr0 = opr::Host2DeviceCopy::make(*graph, host_opr0, {"opr0"}),
              opr1 = opr::Host2DeviceCopy::make(*graph, host_opr1, {"opr1"}),
              sum = opr::add(opr::Broadcast::make(opr0, host_opr1->shape()), opr1);
    HostTensorND host_sum(CompNode::load("xpu0"));
    auto func =
            graph->compile({{sum, [&](DeviceTensorND& s) { host_sum.copy_from(s); }}});
    func->execute();
    ASSERT_TRUE(host_sum.layout().eq_layout(host_opr1->layout()));
    auto o0 = host_opr0->ptr<float>(), o1 = host_opr1->ptr<float>(),
         s = host_sum.sync().ptr<float>();
    // opr0 is a broadcast scalar, so iterate over all elements of opr1
    for (size_t i = 0, it = host_opr1->layout().total_nr_elems(); i < it; i++) {
        MGB_ASSERT_FLOAT_EQ(o0[0] + o1[i], s[i])
                << ssprintf("failed opr0(%.5f)+opr1(%.5f) at %zd", o0[0], o1[i], i);
    }
}

TEST(TestTensorManip, BroadcastEmptyShape) {
    HostTensorGenerator<> gen;
    for (auto&& arg :
         {std::make_pair(TensorShape{1}, TensorShape{0}),
          {{1, 2, 3}, {0, 2, 3}},
          {{2, 3}, {1, 0, 2, 3}},
          {{1, 0, 2, 3}, {4, 0, 2, 3}},
          {{0, 1, 2, 3}, {3, 0, 4, 2, 3}}}) {
        auto host_x = gen(arg.first);
        auto graph = ComputingGraph::make();
        graph->options().graph_opt_level = 0;
        auto x = opr::Host2DeviceCopy::make(*graph, host_x),
             y = opr::Broadcast::make(x, arg.second);
        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(y, host_y)});
        func->execute();
        ASSERT_TRUE(host_y.shape().eq_shape(arg.second));
    }
}

TEST(TestTensorManip, Dimshuffle) {
    HostTensorGenerator<> gen;
    constexpr size_t S0 = 8, S1 = 3;
    auto host_x = gen({S0, S1}), host_prod = gen({S1, 1, S0, 1});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x).rename("x"),
         prod = opr::Host2DeviceCopy::make(*graph, host_prod).rename("prod"),
         x_ds = opr::Dimshuffle::make(x, {1, -1, 0, -1}).rename("x_ds"),
         y = (x_ds * prod).reshape({S0 * S1}).rename("y"),
         loss = opr::Dot::make(y, y).rename("loss"),
         gx = cg::grad(loss, x).rename("gx");
    ASSERT_TRUE(cg::is_static_var_shape(gx.node()));
    ASSERT_EQ(host_x->shape(), gx.node()->shape());
    HostTensorND host_gx;
    auto func = graph->compile({make_callback_copy(gx, host_gx)});
    func->execute();
    for (size_t i = 0; i < S0; i++)
        for (size_t j = 0; j < S1; j++) {
            float x = host_x->ptr<float>({i, j})[0],
                  prod = host_prod->ptr<float>({j, 0, i, 0})[0],
                  gx = host_gx.ptr<float>({i, j})[0];
            MGB_ASSERT_FLOAT_EQ(2 * prod * prod * x, gx) << ssprintf(
                    "failed at (%zd, %zd): x=%g prod=%g gx=%g", i, j, x, prod, gx);
        }
}

TEST(TestTensorManip, DimshuffleEmptyShape) {
    HostTensorGenerator<> gen;
    for (auto&& arg :
         {std::make_pair(TensorShape{3, 0}, std::vector<int>{1, -1, 0, -1}),
          {{3, 1, 0, 4}, {-1, 3, -1, 0, 2}},
          {{2, 0, 3, 0}, {1, 0, 2, 3}}}) {
        auto host_x = gen(arg.first);
        auto graph = ComputingGraph::make();
        graph->options().graph_opt_level = 0;
        auto x = opr::Host2DeviceCopy::make(*graph, host_x),
             y = opr::Dimshuffle::make(x, arg.second);
        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(y, host_y)});
        func->execute();
        auto&& y_shape = host_y.shape();
        for (size_t idx = 0; idx < arg.second.size(); ++idx) {
            auto elem = arg.second[idx];
            if (elem == -1) {
                ASSERT_EQ(y_shape[idx], 1u);
            } else {
                ASSERT_EQ(arg.first[elem], y_shape[idx]);
            }
        }
    }
}
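
// Reshape -> Dimshuffle -> Reshape pipeline, verified against a hand-written
// reference and numerical gradients, in both static and dynamic shape modes.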
TEST(TestTensorManip, DimshuffleCombined) {
    using Checker = AutoOprChecker<1, 1>;
    constexpr int RED0 = 2, RED1 = 3;
    for (bool dyn : {false, true}) {
        auto make_graph =
                [dyn](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
            auto x = inputs[0];
            if (dyn)
                x = opr::MarkDynamicVar::make(x);
            auto cv = [&](int v) {
                auto rst = x.make_scalar(v);
                if (dyn)
                    rst = opr::MarkDynamicVar::make(rst);
                return rst;
            };
            auto xshp = opr::GetVarShape::make(x);
            auto sub = [&](int idx) {
                return opr::IndexAt::make(xshp, {{0, cv(idx)}});
            };
            auto tshp0 = opr::Concat::make(
                         {sub(0), sub(1) / (RED0 * RED1), cv(RED0), cv(RED1), sub(2),
                          sub(3)},
                         0),
                 tshp1 = opr::Concat::make(
                         {sub(0), sub(1) / (RED0 * RED1), sub(2) * RED0,
                          sub(3) * RED1},
                         0);
            auto y0 = opr::Reshape::make(x, tshp0),
                 y1 = opr::Dimshuffle::make(y0, {0, 1, 3, 2, 4, 5}),
                 y2 = opr::Reshape::make(y1, tshp1);
            return {y2.node()};
        };
        auto fwd = [](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
            auto&& iv = *inp.at(0);
            auto&& ov = dest.at(0);
            auto ishp = iv.shape();
            auto oshp = ishp;
            oshp.shape[1] /= RED0 * RED1;
            oshp.shape[2] *= RED0;
            oshp.shape[3] *= RED1;
            ov.comp_node(iv.comp_node()).resize(oshp);
            size_t tmpshp[6] = {oshp.shape[0], oshp.shape[1], RED1,
                                RED0,          ishp.shape[2], ishp.shape[3]},
                   tmpidx[6];
            for (size_t oidx = 0, oidxt = oshp.total_nr_elems(); oidx < oidxt;
                 ++oidx) {
                for (int i = 5, x = oidx; i >= 0; --i) {
                    tmpidx[i] = x % tmpshp[i];
                    x /= tmpshp[i];
                    mgb_assert(i || !x);
                }
                std::swap(tmpshp[2], tmpshp[3]);
                std::swap(tmpidx[2], tmpidx[3]);
                size_t iidx = 0;
                for (int i = 5, d = 1; i >= 0; --i) {
                    iidx += d * tmpidx[i];
                    d *= tmpshp[i];
                }
                std::swap(tmpshp[2], tmpshp[3]);
                ov.ptr<float>()[oidx] = iv.ptr<float>()[iidx];
            }
        };
        Checker::RunOptions opt;
        opt.numdiff_eps = 1;  // large eps because all linear
        constexpr size_t R = RED0 * RED1;
        Checker(make_graph, fwd)
                .run({{{1, R, 1, 1}}}, opt)
                .run({{{5, R * 2, 3, 2}}}, opt)
                .run({{{2, R * 3, 4, 3}}}, opt);
    }
}

TEST(TestTensorManip, Subtensor) {
    using Checker = AutoOprChecker<1, 1>;
    SymbolVar sub0, sub1, sub2, sub3, sub4;
    auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        using AIdx = opr::Subtensor::AxisIndexer;
        auto x = inputs[0];
        x = x.rename("x");
        auto cv = [&](int v, bool dyn = false) {
            auto rst = x.make_scalar(v);
            if (dyn)
                rst = opr::MarkDynamicVar::make(rst);
            return rst;
        };
        // sub0 = (0.9*x)[10:shp0:2]
        sub0 = opr::Subtensor::make(
                       x * 0.9f,
                       {AIdx::make_interval(
                               0, cv(10, true), opr::GetVarShape::make(x, 0), cv(2))})
                       .rename("sub0");
        // sub1 = x[:-10:2]
        sub1 = opr::Subtensor::make(
                       opr::MarkDynamicVar::make(x),
                       {AIdx::make_interval(0, None, cv(-10), cv(2))})
                       .rename("sub1");
        // sub2_raw = x[5:-5:2, 3]
        auto sub2_raw = opr::Subtensor::make(
                opr::IndexAt::make(x, {{1, cv(3)}}),
                {AIdx::make_interval(0, cv(5), cv(-5), cv(2))});
        {
            auto opr = sub2_raw.node()->owner_opr();
            auto&& inp = opr->input();
            auto&& dmap = opr->node_prop().dep_map();
            for (size_t i = 1; i < inp.size(); ++i) {
                mgb_assert(
                        dmap.at(inp[i]) &
                        cg::OperatorNodeBase::NodeProp::DepType::HOST_VALUE);
            }
        }
        sub2 = opr::AxisAddRemove::make(
                       sub2_raw, {opr::AxisAddRemove::AxisDesc::make_add(1)})
                       .rename("sub2");
        // sub3 = x[4:-6:2, -1:]
        sub3 = opr::Subtensor::make(
                x, {AIdx::make_interval(0, cv(4), cv(-6), cv(2)),
                    AIdx::make_interval(1, cv(-1), None, None)});
        // sub4 = (x + 0.1)[-3:7:-2, 1::-3] (negative stride)
        sub4 = opr::Subtensor::make(
                x + .1f, {AIdx::make_interval(0, cv(-3), cv(7), cv(-2)),
                          AIdx::make_interval(1, cv(1), None, cv(-3, true))});
        return {(sub0 + sub1 + sub2 + sub3 + sub4).rename("y")};
    };
    auto fwd = [](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        auto iptr = inp[0]->ptr<float>();
        auto ishp = inp[0]->shape();
        auto oshp = ishp;
        auto s0 = ishp.shape[0], s1 = ishp.total_nr_elems() / s0,
             s2 = s1 / ishp.shape[1];
        auto os0 = (s0 - 10 + 1) / 2;
        oshp.shape[0] = os0;
        dest[0].comp_node(inp[0]->comp_node());
        dest[0].resize(oshp);
        auto optr = dest[0].ptr<float>();
        for (size_t i = 0; i < os0; ++i)
            for (size_t j = 0; j < s1; ++j) {
                optr[i * s1 + j] =
                        iptr[(i * 2 + 10) * s1 + j] * .9f + iptr[(i * 2) * s1 + j] +
                        iptr[(i * 2 + 5) * s1 + j % s2 + s2 * 3] +
                        iptr[(i * 2 + 4) * s1 + j % s2 + s2 * (ishp.shape[1] - 1)] +
                        iptr[(ishp.shape[0] - 3 - i * 2) * s1 + j % s2 + s2 * 1] + 0.1;
            }
    };
    Checker::RunOptions opt;
    opt.numdiff_eps = 1;  // large eps because all linear
    Checker checker(make_graph, fwd);
    checker.run({{{11, 5}}}, opt).run({{{20, 6}}}, opt).run({{{56, 6, 4}}}, opt);
    ASSERT_FALSE(cg::is_static_var_shape(sub0.node()));
    ASSERT_FALSE(cg::is_static_var_shape(sub1.node()));
    ASSERT_TRUE(cg::is_static_var_storage(sub2.node()));
    ASSERT_TRUE(cg::is_static_var_storage(sub3.node()));
    ASSERT_FALSE(cg::is_static_var_storage(sub4.node()));
}

TEST(TestTensorManip, SubtensorNegativeAxis) {
    using Checker = AutoOprChecker<1, 1>;
    auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        using AIdx = opr::Subtensor::AxisIndexer;
        auto x = inputs[0];
        return {opr::Subtensor::make(x, {AIdx::make_index(-1, x.make_scalar(2))})};
    };
    auto fwd = [](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        auto iptr = inp[0]->ptr<float>();
        auto ishp = inp[0]->shape();
        auto oshp = ishp;
        --oshp.ndim;
        auto stride = oshp.shape[oshp.ndim];
        if (!oshp.ndim)
            oshp = {1};
        auto optr = dest[0].resize(oshp).ptr<float>();
        for (size_t i = 0, it = oshp.total_nr_elems(); i < it; ++i) {
            optr[i] = iptr[i * stride + 2];
        }
    };
    Checker checker(make_graph, fwd);
    checker.run({TensorShape{5}})
            .run({TensorShape{2, 3}})
            .run({TensorShape{2, 3, 4}})
            .run({TensorShape{2, 3, 4, 5}});
}

TEST(TestTensorManip, SubtensorWithEmptyIndexDesc) {
    using Checker = AutoOprChecker<1, 1>;
    auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        auto x = inputs[0];
        return {opr::Subtensor::make(x, {})};
    };
    auto fwd = [](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        auto iptr = inp[0]->ptr<float>();
        auto oshp = inp[0]->shape();
        auto optr = dest[0].resize(oshp).ptr<float>();
        for (size_t i = 0, it = oshp.total_nr_elems(); i < it; ++i) {
            optr[i] = iptr[i];
        }
    };
    Checker checker(make_graph, fwd);
    checker.run({TensorShape{5}})
            .run({TensorShape{2, 3}})
            .run({TensorShape{2, 3, 4}})
            .run({TensorShape{2, 3, 4, 5}});
}

TEST(TestTensorManip, SubtensorShapeInferForDynAxisIdx) {
    HostTensorGenerator<> gen;
    auto host_x = gen({5, 6, 3});
    auto host_idx = gen({1});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         idx = opr::MarkDynamicVar::make(
                 opr::Host2DeviceCopy::make(*graph, host_idx));
    auto cv = [&](int v) { return x.make_scalar(v); };
    using Ad = opr::Subtensor::AxisIndexer;
    // y = x[2, 1:-2:2]
    auto y = opr::Subtensor::make(
            x, {Ad::make_interval(1, cv(1), cv(-2), cv(2)), Ad::make_index(0, idx)});
    ASSERT_TRUE(cg::is_static_var_shape(y.node()));
    ASSERT_EQ(y.node()->shape(), TensorShape({2, 3}));
    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    host_idx->ptr<float>()[0] = 2;
    func->execute();
    HostTensorND expt{host_x->comp_node(), host_x->dtype()};
    expt.resize({2, 3});
    for (size_t i = 0; i < 2; ++i)
        for (size_t j = 0; j < 3; ++j) {
            expt.ptr<float>()[i * 3 + j] = host_x->ptr<float>({2, i * 2 + 1, j})[0];
        }
    MGB_ASSERT_TENSOR_EQ(expt, host_y);
}
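
// Subtensor should forward input storage (the result points 3 floats past the
// start of x) for const, static, and dynamic indices and dynamic input alike.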
TEST(TestTensorManip, SubtensorDynCaseMemFwd) {
    auto run = [](int dyn_type) {
        // dyn_type: 0->const idx, 1->static idx, 2->dynamic idx, 3->dynamic inp
        ASSERT_FALSE(HasFailure()) << "already failed before " << dyn_type;
        HostTensorGenerator<> gen;
        auto host_x = gen({2, 3});
        auto graph = ComputingGraph::make();
        auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x);
        SymbolVar idx;
        if (dyn_type == 0 || dyn_type == 3) {
            idx = x.make_scalar(1);
            if (dyn_type == 3) {
                // force dynamic storage by reading on another comp node
                auto xrd = opr::Copy::make(x, host_x->comp_node().change_stream(1));
                graph->options().extra_vardeps[x.node()].push_back(xrd.node());
            }
        } else {
            auto host_idx = std::make_shared<HostTensorND>(
                    host_x->comp_node(), dtype::Int32{});
            host_idx->resize({1}).ptr<int>()[0] = 1;
            idx = opr::Host2DeviceCopy::make(*graph, host_idx);
            if (dyn_type == 2) {
                idx = opr::MarkDynamicVar::make(idx);
            }
        }
        auto y = opr::Subtensor::make(
                x, {opr::Subtensor::AxisIndexer::make_interval(0, idx, None, None)});
        if (dyn_type != 2) {
            ASSERT_EQ(TensorShape({1, 3}), y.shape());
        }
        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(y, host_y)});
        func->execute();
        auto xsub = host_x->sub(SubTensorSpec::make_from_offset_elem(
                TensorLayout({1, 3}, dtype::Float32{}), 3));
        MGB_ASSERT_TENSOR_EQ(xsub, host_y);
        ASSERT_EQ(dyn_type == 0, cg::is_static_var_storage(y.node()));
        ASSERT_EQ(dyn_type != 2, cg::is_static_var_shape(y.node()));
        ASSERT_EQ(
                static_cast<const uint8_t*>(prev_dev_ptr(x)) + 3 * sizeof(float),
                prev_dev_ptr(y));
    };
    run(0);
    run(1);
    run(2);
    run(3);
}

TEST(TestTensorManip, SubtensorWithNoValInferInp) {
    HostTensorGenerator<> gen;
    auto host_x = gen({5, 1}), host_idx = gen({1});
    auto graph = ComputingGraph::make();
    using Ad = opr::Subtensor::AxisIndexer;
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         idx = opr::Host2DeviceCopy::make_no_value_infer(*graph, host_idx),
         y = opr::Subtensor::make(x, {Ad::make_index(0, idx)});
    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    host_idx->ptr<float>()[0] = 2;
    func->execute();
    HostTensorND expt{host_x->comp_node(), host_x->dtype()};
    expt.resize({1}).ptr<float>()[0] = host_x->ptr<float>()[2];
    MGB_ASSERT_TENSOR_EQ(expt, host_y);
}
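
// Operator deduplication: an identical index descriptor on the same axis must
// reuse the existing node, so the 16 make() calls yield only 8 distinct vars.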
TEST(TestTensorManip, SubtensorDedup) {
    HostTensorGenerator<> gen;
    auto host_x = gen({5, 5, 5, 5});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto cv = [&](int v) { return x.make_scalar(v); };
    using S = opr::Subtensor;
    using D = S::AxisIndexer;
    std::unordered_set<VarNode*> nodes;
    for (int i : {0, 1, 1, 0}) {
        nodes.insert(S::make(x, {D::make_index(i, cv(2))}).node());
        nodes.insert(S::make(x, {D::make_interval(i, cv(2), None, None)}).node());
        nodes.insert(S::make(x, {D::make_interval(i, None, cv(2), None)}).node());
        nodes.insert(S::make(x, {D::make_interval(i, None, None, cv(2))}).node());
    }
    ASSERT_EQ(8u, nodes.size());
}
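
// The index is consumed via static value inference on the host and never
// executed on device; a host-copy index tracks updates between executions,
// while an ImmutableTensor index stays frozen at its build-time value.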
TEST(TestTensorManip, SubtensorIdxChange) {
    auto run = [](bool dyn) {
        HostTensorGenerator<> gen;
        auto host_x = gen({10});
        auto host_idx =
                std::make_shared<HostTensorND>(host_x->comp_node(), dtype::Int32());
        host_idx->resize({1}).ptr<int>()[0] = 1;
        bool idx_exec = false, idx_infered = false;
        auto cb_set_idx_exec = [&](DeviceTensorND& dv) {
            if (dv.comp_node() == CompNode::default_cpu()) {
                idx_infered = true;
            } else {
                idx_exec = true;
            }
        };
        auto graph = ComputingGraph::make();
        auto x = opr::Host2DeviceCopy::make(*graph, host_x);
        SymbolVar idx_;
        if (dyn) {
            idx_ = opr::Host2DeviceCopy::make(*graph, host_idx);
        } else {
            idx_ = opr::ImmutableTensor::make(*graph, *host_idx);
        }
        auto idx = opr::CallbackInjector::make(idx_, {false, true, cb_set_idx_exec}),
             y = opr::Subtensor::make(
                     x, {opr::Subtensor::AxisIndexer::make_interval(
                                0, idx, idx + 1, None)});
        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(y, host_y)});
        ASSERT_TRUE(cg::is_static_var_shape(y.node()));
        ASSERT_TRUE(cg::is_static_var_value(y.node()));
        ASSERT_EQ(!dyn, cg::is_static_var_storage(y.node()));
        ASSERT_EQ(TensorShape({1}), y.node()->shape());
        auto px = host_x->ptr<float>();
        func->execute();
        ASSERT_EQ(px[1], host_y.ptr<float>()[0]);
        host_idx->ptr<int>()[0] = 5;
        func->execute();
        if (dyn) {
            ASSERT_EQ(px[5], host_y.ptr<float>()[0]);
        } else {
            ASSERT_EQ(px[1], host_y.ptr<float>()[0]);
        }
        ASSERT_TRUE(idx_infered);
        ASSERT_FALSE(idx_exec);
    };
    run(true);
    run(false);
}

TEST(TestTensorManip, SubtensorEmptyIO) {
    using AIdx = opr::Subtensor::AxisIndexer;
    using IndexDesc = std::vector<AIdx>;
    using IndexDescCreater = thin_function<IndexDesc(SymbolVar)>;
    HostTensorGenerator<> gen;
    auto run = [&](const TensorShape& inp_shp, const TensorShape& out_shp,
                   const IndexDescCreater& c) {
        auto host_x = gen(inp_shp);
        auto graph = ComputingGraph::make();
        auto x = opr::Host2DeviceCopy::make(*graph, host_x);
        auto y = opr::Subtensor::make(x, c(x));
        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(y, host_y)});
        func->execute();
        ASSERT_EQ(host_y.shape(), out_shp);
        ASSERT_TRUE(host_y.empty());
    };
    // x.shape = {0}, x[:0]
    run({0}, {0}, [&](SymbolVar x) -> IndexDesc {
        return {AIdx::make_interval(0, None, x.make_scalar(0), None)};
    });
    // x.shape = {100, 0}, x[0:-10:2]
    run({100, 0}, {45, 0}, [&](SymbolVar x) -> IndexDesc {
        return {AIdx::make_interval(
                0, x.make_scalar(0), x.make_scalar(-10), x.make_scalar(2))};
    });
    // x.shape = {100, 0}, x[10:-10:2, 0:0]
    run({100, 0}, {40, 0}, [&](SymbolVar x) -> IndexDesc {
        return {AIdx::make_interval(
                        0, x.make_scalar(10), x.make_scalar(-10), x.make_scalar(2)),
                AIdx::make_interval(1, x.make_scalar(0), x.make_scalar(0), None)};
    });
    // x.shape = {10, 0, 10}, x[5, 10:-10:2]
    run({10, 0, 10}, {0, 10}, [&](SymbolVar x) -> IndexDesc {
        return {AIdx::make_index(0, x.make_scalar(5)),
                AIdx::make_interval(
                        1, x.make_scalar(10), x.make_scalar(-10), x.make_scalar(2))};
    });
    // x.shape = {10}, x[100:]
    run({10}, {0}, [&](SymbolVar x) -> IndexDesc {
        return {AIdx::make_interval(0, x.make_scalar(100), None, None)};
    });
}

TEST(TestTensorManip, SetSubtensorEmptyIO) {
    using AIdx = opr::SetSubtensor::AxisIndexer;
    using IndexDesc = std::vector<AIdx>;
    using IndexDescCreater = thin_function<IndexDesc(SymbolVar)>;
    HostTensorGenerator<> gen;
    auto run = [&](const TensorShape& inp_shp, const TensorShape& val_shp,
                   const IndexDescCreater& c) {
        auto host_x = gen(inp_shp), host_v = gen(val_shp);
        auto graph = ComputingGraph::make();
        auto x = opr::Host2DeviceCopy::make(*graph, host_x),
             v = opr::Host2DeviceCopy::make(*graph, host_v);
        auto y = opr::SetSubtensor::make(x, v, c(x));
        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(y, host_y)});
        func->execute();
        ASSERT_EQ(host_y.shape(), inp_shp);
    };
    // x.shape = {0}, v.shape = {0}, x[:0] = v
    run({0}, {0}, [&](SymbolVar x) -> IndexDesc {
        return {AIdx::make_interval(0, None, x.make_scalar(0), None)};
    });
    // x.shape = {100, 0}, v.shape = {45, 0}, x[0:-10:2] = v
    run({100, 0}, {45, 0}, [&](SymbolVar x) -> IndexDesc {
        return {AIdx::make_interval(
                0, x.make_scalar(0), x.make_scalar(-10), x.make_scalar(2))};
    });
    // x.shape = {100, 0}, v.shape = {40, 0}, x[10:-10:2, 0:0] = v
    run({100, 0}, {40, 0}, [&](SymbolVar x) -> IndexDesc {
        return {AIdx::make_interval(
                        0, x.make_scalar(10), x.make_scalar(-10), x.make_scalar(2)),
                AIdx::make_interval(1, x.make_scalar(0), x.make_scalar(0), None)};
    });
    // x.shape = {10, 0, 10}, v.shape = {0, 10}, x[5, 10:-10:2] = v
    run({10, 0, 10}, {0, 10}, [&](SymbolVar x) -> IndexDesc {
        return {AIdx::make_index(0, x.make_scalar(5)),
                AIdx::make_interval(
                        1, x.make_scalar(10), x.make_scalar(-10), x.make_scalar(2))};
    });
    // x.shape = {10}, v.shape = {0}, x[100:] = v
    run({10}, {0}, [&](SymbolVar x) -> IndexDesc {
        return {AIdx::make_interval(0, x.make_scalar(100), None, None)};
    });
}
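
// Helper for the SubtensorFwdOnly tests below: slice x[idx0:idx1] under all
// static/dynamic combinations of input and index, checking that the result
// points into x's own buffer whenever memory forwarding is possible.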
namespace {
void test_subtensor_fwdonly(bool dyn_inp, bool dyn_idx) {
    constexpr size_t SIZE = 25;
    auto mkhost = [](size_t size, DType dtype) {
        auto rst = std::make_shared<HostTensorND>(CompNode::load("xpu0"), dtype);
        rst->resize({size});
        return rst;
    };
    auto host_x = mkhost(SIZE, dtype::Float32()),
         host_idx0 = mkhost(1, dtype::Int32()), host_idx1 = mkhost(1, dtype::Int32());
    for (size_t i = 0; i < SIZE; ++i) {
        host_x->ptr<float>()[i] = i;
    }
    host_idx0->ptr<int>()[0] = 2;
    host_idx1->ptr<int>()[0] = 6;
    auto graph = ComputingGraph::make();
    using AIdx = opr::Subtensor::AxisIndexer;
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         idx0 = opr::Host2DeviceCopy::make(*graph, host_idx0),
         idx1 = opr::Host2DeviceCopy::make(*graph, host_idx1);
    float *x_ptr = nullptr, *x_ptr_end = nullptr, *xsub_ptr = nullptr;
    if (dyn_inp)
        x = opr::MarkDynamicVar::make(x);
    x = opr::CallbackInjector::make(x, [&](DeviceTensorND& v) {
        x_ptr = v.ptr<float>();
        x_ptr_end = v.ptr<float>() + v.layout().total_nr_elems();
    });
    if (dyn_idx)
        idx0 = opr::MarkDynamicVar::make(idx0);
    auto xsub = opr::Subtensor::make(x, {AIdx::make_interval(0, idx0, idx1, None)});
    xsub = opr::CallbackInjector::make(
            xsub, [&](DeviceTensorND& v) { xsub_ptr = v.ptr<float>(); });
    ASSERT_EQ(!dyn_inp && !dyn_idx, cg::is_static_var_shape(xsub.node()));
    HostTensorND host_sub;
    auto func = graph->compile({make_callback_copy(xsub, host_sub)});
    bool failed = false;
    auto run_and_check = [&](size_t begin, size_t end) {
        ASSERT_FALSE(failed);
        failed = true;
        host_idx0->ptr<int>()[0] = begin;
        host_idx1->ptr<int>()[0] = end;
        func->execute();
        if (!(!dyn_inp && dyn_idx)) {
            ASSERT_GE(xsub_ptr, x_ptr);
            ASSERT_LE(xsub_ptr, x_ptr_end);
        }
        ASSERT_EQ(TensorShape({end - begin}), host_sub.shape());
        for (size_t i = 0; i < end - begin; ++i)
            ASSERT_EQ(host_x->ptr<float>()[i + begin], host_sub.ptr<float>()[i])
                    << ssprintf("failed [%zu, %zu): i=%zu", begin, end, i);
        failed = false;
    };
    run_and_check(0, 1);
    run_and_check(2, 3);
    run_and_check(0, 5);
    run_and_check(1, 6);
    run_and_check(3, 21);
    run_and_check(0, SIZE);
    run_and_check(1, SIZE);
    run_and_check(0, SIZE - 1);
}
}  // anonymous namespace

TEST(TestTensorManip, SubtensorFwdOnly00) {
    test_subtensor_fwdonly(false, false);
}

TEST(TestTensorManip, SubtensorFwdOnly01) {
    test_subtensor_fwdonly(false, true);
}

TEST(TestTensorManip, SubtensorFwdOnly10) {
    test_subtensor_fwdonly(true, false);
}

TEST(TestTensorManip, SubtensorFwdOnly11) {
    test_subtensor_fwdonly(true, true);
}
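
// SetSubtensor whose value overlaps its destination (y = xsub[:-10] :=
// xsub[10:]): static value inference and actual execution must both read the
// pre-update contents.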
TEST(TestTensorManip, OverlapSetSubtensor) {
    constexpr size_t SIZE = 2048, SIZE_SUB = (SIZE - 4) / 2;
    auto host_x =
            std::make_shared<HostTensorND>(CompNode::load("xpu0"), dtype::Float32());
    host_x->resize({SIZE});
    for (size_t i = 0; i < SIZE; ++i)
        host_x->ptr<float>()[i] = i;
    auto graph = ComputingGraph::make();
    graph->options().allocate_static_mem_after_graph_compile = true;
    auto x = opr::Host2DeviceCopy::make(*graph, host_x).rename("x");
    auto cv = [&](int v, bool dyn = false) {
        auto rst = x.make_scalar(v);
        if (dyn)
            rst = opr::MarkDynamicVar::make(rst);
        return rst;
    };
    using AIdx = opr::Subtensor::AxisIndexer;
    auto xsub = opr::Subtensor::make(x, {AIdx::make_interval(0, cv(2), cv(-2), cv(2))})
                        .rename("xsub"),
         // y = xsub[:-10] := xsub[10:]
         y = opr::SetSubtensor::make(
                     xsub,
                     opr::Subtensor::make(
                             xsub, {AIdx::make_interval(0, cv(10), None, None)})
                             .rename("xsub[10:]"),
                     {AIdx::make_interval(0, None, cv(-10), None)})
                     .rename("y");
    HostTensorND expected(host_x->comp_node(), dtype::Float32());
    expected.resize({SIZE_SUB});
    for (size_t i = 0; i < SIZE_SUB; ++i) {
        auto i0 = i;
        if (i0 < SIZE_SUB - 10)
            i0 += 10;
        expected.ptr<float>()[i] = i0 * 2 + 2;
    }
    ASSERT_TRUE(cg::is_static_var_value(y.node()));
    HostTensorND infer_result;
    infer_result.copy_from(graph->static_infer_manager().infer_value(y.node()));
    MGB_ASSERT_TENSOR_EQ(expected, infer_result);
    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    func->to_json()->writeto_fpath(output_file("OverlapSetSubtensor.json"));
    func->execute();
    MGB_ASSERT_TENSOR_EQ(expected, host_y);
}

TEST(TestTensorManip, OverlapSetSubtensor2) {
    constexpr size_t SIZE_X = 20, SIZE_Y = 23;
    auto run = [](bool should_overlap) {
        auto host_x = std::make_shared<HostTensorND>(
                CompNode::load("xpu0"), dtype::Float32());
        host_x->resize({SIZE_X, SIZE_Y});
        for (size_t i = 0; i < SIZE_X * SIZE_Y; ++i)
            host_x->ptr<float>()[i] = i;
        auto graph = ComputingGraph::make();
        auto x = opr::Host2DeviceCopy::make(*graph, host_x).rename("x");
        auto cv = [&](int v) { return x.make_scalar(v); };
        auto make_sub_desc = [&](int begin, int end) -> opr::Subtensor::IndexDesc {
            using AIdx = opr::Subtensor::AxisIndexer;
            return {AIdx::make_interval(0, cv(begin), cv(end), None)};
        };
        auto slice = [&](SymbolVar inp, int begin, int end) {
            return opr::Subtensor::make(inp, make_sub_desc(begin, end));
        };
        // y = x.copy()
        // y[2:7] = y[4:9].copy()
        // y[1:6] += y[3:8].copy()
        auto xsub = slice(x, 4, 9).rename("xsub"),
             y0 = opr::SetSubtensor::make(x, xsub, make_sub_desc(2, 7)).rename("y0"),
             y0sub = slice(y0, 3, 8).rename("y0sub"),
             ypar = should_overlap ? y0 : y0 + 1,
             y = opr::IncrSubtensor::make(ypar, y0sub, make_sub_desc(1, 6))
                         .rename("y1");
        HostTensorND expect;
        expect.copy_from(*host_x);
        auto ptr = expect.ptr<float>();
        memmove(ptr + 2 * SIZE_Y, ptr + 4 * SIZE_Y, 5 * SIZE_Y * sizeof(float));
        for (size_t i = 1; i < 6; ++i) {
            for (size_t j = 0; j < SIZE_Y; ++j) {
                ptr[i * SIZE_Y + j] += ptr[(i + 2) * SIZE_Y + j];
            }
        }
        if (!should_overlap) {
            for (size_t i = 0; i < SIZE_X * SIZE_Y; ++i) {
                ++ptr[i];
            }
        }
        ASSERT_TRUE(cg::is_static_var_value(y.node()));
        HostTensorND infer_result;
        infer_result.copy_from(graph->static_infer_manager().infer_value(y.node()));
        MGB_ASSERT_TENSOR_EQ(expect, infer_result);
        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(y, host_y)});
        func->execute();
        MGB_ASSERT_TENSOR_EQ(expect, host_y);
        if (!should_overlap) {
            ASSERT_EQ(prev_dev_ptr(ypar), prev_dev_ptr(y));
        }
    };
    run(false);
    run(true);
}

TEST(TestTensorManip, SetSubtensor) {
    using Checker = AutoOprChecker<3, 1>;
    auto make_graph = [](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        using AIdx = opr::Subtensor::AxisIndexer;
        auto x = inputs[0], v0 = inputs[1], v1 = inputs[2];
        x = x.rename("x");
        v0 = v0.rename("v0");
        v1 = v1.rename("v1");
        auto cv = [&](int v, bool dyn = false) {
            auto rst = x.make_scalar(v);
            if (dyn)
                rst = opr::MarkDynamicVar::make(rst);
            return rst;
        };
        auto
                // x0 = x[10::2] := v0
                x0 = opr::SetSubtensor::make(
                             x, v0, {AIdx::make_interval(0, cv(10), None, cv(2))})
                             .rename("x0"),
                // x1 = x[:-10:2] := v0[:, 3] := v1
                x1 = opr::SetSubtensor::make(
                             opr::MarkDynamicVar::make(x),
                             opr::SetSubtensor::make(
                                     v0, v1, {AIdx::make_index(1, cv(3))}),
                             {AIdx::make_interval(0, None, cv(-10), cv(2))})
                             .rename("x_sub1"),
                // x2 = (x[:5] := x[4:9])[3:-7:2, -1] := v1
                x2_t = opr::Subtensor::make(
                               x, {AIdx::make_interval(0, cv(4), cv(9), None)})
                               .rename("x2_t"),
                x2 = opr::SetSubtensor::make(
                             opr::SetSubtensor::make(
                                     x, x2_t,
                                     {AIdx::make_interval(0, None, cv(5), None)}),
                             v1,
                             {AIdx::make_interval(0, cv(3), cv(-7), cv(2)),
                              AIdx::make_index(1, cv(-1))})
                             .rename("x2"),
                y = (x0 + x1 + x2).rename("y");
        mgb_assert(cg::is_static_var_storage(x0.node()));
        mgb_assert(!cg::is_static_var_shape(x1.node()));
        mgb_assert(cg::is_static_var_storage(x2.node()));
        return {y};
    };
    auto fwd = [](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        auto px = inp[0]->ptr<float>(), pv0 = inp[1]->ptr<float>(),
             pv1 = inp[2]->ptr<float>();
        auto ishp = inp[0]->shape();
        dest[0].comp_node(inp[0]->comp_node());
        dest[0].resize(ishp);
        auto optr = dest[0].ptr<float>();
        auto s0 = ishp.shape[0], s1 = ishp.total_nr_elems() / s0,
             s2 = s1 / ishp.shape[1];
        for (size_t i = 0; i < s0; ++i) {
            for (size_t j = 0; j < s1; ++j) {
                float x0, x1, x2;
                x0 = x1 = x2 = px[i * s1 + j];
                if (i >= 10 && (i - 10) % 2 == 0)
                    x0 = pv0[((i - 10) / 2) * s1 + j];
                if (i < s0 - 10 && i % 2 == 0) {
                    auto row = i / 2;
                    if (j / s2 == 3)
                        x1 = pv1[row * s2 + j % s2];
                    else
                        x1 = pv0[row * s1 + j];
                }
                if (i >= 3 && i < s0 - 7 && (i - 3) % 2 == 0 &&
                    j / s2 == ishp.shape[1] - 1)
                    x2 = pv1[((i - 3) / 2) * s2 + j % s2];
                else if (i < 5)
                    x2 = px[(i + 4) * s1 + j];
                optr[i * s1 + j] = x0 + x1 + x2;
            }
        }
    };
    auto mkshp = [](const TensorShape& shp0) -> Checker::ShapeInpArray {
        mgb_assert(shp0.shape[0] > 10 && shp0.ndim >= 2 && shp0.shape[1] >= 4);
        auto shp1 = shp0;
        shp1.shape[0] = (shp0.shape[0] - 10) / 2;
        auto shp2 = shp1;
        for (size_t i = 2; i < shp2.ndim; ++i)
            shp2.shape[i - 1] = shp2.shape[i];
        --shp2.ndim;
        return {shp0, shp1, shp2};
    };
    Checker::RunOptions opt;
    opt.numdiff_eps = 1;
    Checker(make_graph, fwd)
            .run(mkshp({16, 4, 2}), opt)
            .run(mkshp({14, 10}), opt)
            .run(mkshp({18, 5, 2, 3}), opt);
}
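
// An out-of-bounds index (13 into a length-12 vector) must be rejected at
// operator construction time through static shape/value inference.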
TEST(TestTensorManip, SetSubtensorCheckByShapeInfer) {
    HostTensorGenerator<> gen;
    HostTensorGenerator<dtype::Int32> gen_int;
    auto host_x = gen({12}), host_sub = gen({1}), host_idx = gen_int({1});
    host_idx->ptr<int>()[0] = 13;
    auto graph = ComputingGraph::make();
    using Ad = opr::Subtensor::AxisIndexer;
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         sub = opr::Host2DeviceCopy::make(*graph, host_sub);
    auto idx1 = Ad::make_index(0, opr::ImmutableTensor::make(*graph, *host_idx)),
         idx2 = Ad::make_index(0, opr::Host2DeviceCopy::make(*graph, host_idx));
    MGB_MARK_USED_VAR(x);
    MGB_MARK_USED_VAR(sub);
    MGB_MARK_USED_VAR(idx1);
    MGB_MARK_USED_VAR(idx2);
    ASSERT_THROW(opr::SetSubtensor::make(x, sub, {idx1}), MegBrainError);
    ASSERT_THROW(opr::SetSubtensor::make(x, sub, {idx2}), MegBrainError);
}

TEST(TestTensorManip, SetSubtensorShapeInfer) {
    HostTensorGenerator<> gen;
    HostTensorGenerator<dtype::Int32> gen_int;
    auto host_x = gen({12}), host_sub = gen({1}), host_idx = gen_int({1});
    host_idx->ptr<int>()[0] = 13;
    auto graph = ComputingGraph::make();
    auto&& mgr = graph->static_infer_manager();
    using Ad = opr::Subtensor::AxisIndexer;
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         sub = opr::Host2DeviceCopy::make(*graph, host_sub),
         index = opr::Host2DeviceCopy::make_no_value_infer(*graph, host_idx);
    auto rt_static_idx = Ad::make_index(0, index * 2);
    auto y = opr::SetSubtensor::make(x, sub, {rt_static_idx});
    ASSERT_TRUE(mgr.infer_shape_fallible(y.node()));
}
TEST(TestTensorManip, SetSubtensorDynIdx) {
    HostTensorGenerator<> gen;
    auto host_x = gen({12}), host_sub = gen({1}), host_idx = gen({1});
    host_idx->ptr<float>()[0] = 3;
    auto dev_idx = std::make_shared<DeviceTensorND>();
    dev_idx->copy_from(*host_idx);
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         sub = opr::Host2DeviceCopy::make(*graph, host_sub),
         idx = opr::SharedDeviceTensor::make(*graph, dev_idx),
         y = opr::SetSubtensor::make(
                 x, sub, {opr::SetSubtensor::AxisIndexer::make_index(0, idx)});
    ASSERT_TRUE(cg::is_static_var_storage(y.node()));
    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    func->execute();
    host_x->ptr<float>()[3] = host_sub->ptr<float>()[0];
    MGB_ASSERT_TENSOR_EQ(*host_x, host_y);
}
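
// An empty IndexDesc selects the whole tensor, so SetSubtensor degenerates
// to overwriting x with y entirely.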
TEST(TestTensorManip, SetSubtensorWithEmptyIndexDesc) {
    HostTensorGenerator<> gen;
    auto host_x = gen({12}), host_y = gen({12});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Host2DeviceCopy::make(*graph, host_y),
         z = opr::SetSubtensor::make(x, y, {});
    ASSERT_TRUE(cg::is_static_var_storage(z.node()));
    HostTensorND host_z;
    auto func = graph->compile({make_callback_copy(z, host_z)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(*host_y, host_z);
}
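
// IncrSubtensor adds inp[1] onto the slice x[2:-2:2]; the reference
// implementation below mirrors that indexing arithmetic.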
TEST(TestTensorManip, IncrSubtensor) {
    using Checker = AutoOprChecker<2, 1>;
    auto make_graph = [](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        using AIdx = opr::Subtensor::AxisIndexer;
        auto x = inputs[0];
        return {opr::IncrSubtensor::make(
                x, inputs[1],
                {AIdx::make_interval(
                        0, x.make_scalar(2), x.make_scalar(-2), x.make_scalar(2))})};
    };
    auto fwd = [](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        auto nr = inp[0]->shape(0);
        auto pv = inp[1]->ptr<float>(), pd = dest[0].copy_from(*inp[0]).ptr<float>();
        // x[2:-2:2] covers indices 2, 4, ..., i.e. (nr - 3) / 2 elements
        for (size_t i = 0; i < (nr - 3) / 2; ++i) {
            pd[i * 2 + 2] += pv[i];
        }
    };
    Checker{make_graph, fwd}
            .run({TensorShape{5}, {1}})
            .run({TensorShape{8}, {2}})
            .run({TensorShape{23}, {10}});
}
TEST(TestTensorManip, Concat) {
    auto cns = load_multiple_xpus(4);
    using Checker = AutoOprChecker<3, 1>;
    auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        auto sub0 = inputs[0], sub1 = opr::Copy::make(inputs[1], cns[1]),
             sub2 = opr::Copy::make(inputs[2], cns[2]),
             ret = opr::Concat::make({sub0, sub1, sub2}, 1, cns[3]);
        return {opr::Copy::make(ret, cns[0])};
    };
    auto fwd = [](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        size_t n = inp[0]->shape(0), c0 = inp[0]->shape(1), c1 = inp[1]->shape(1),
               c2 = inp[2]->shape(1), c = c0 + c1 + c2;
        auto i0 = inp[0]->ptr<float>(), i1 = inp[1]->ptr<float>(),
             i2 = inp[2]->ptr<float>(), o = dest[0].resize({n, c}).ptr<float>();
        for (size_t i = 0; i < n; ++i) {
            for (size_t j = 0; j < c; ++j) {
                float cur;
                if (j < c0) {
                    cur = i0[i * c0 + j];
                } else if (j < c0 + c1) {
                    cur = i1[i * c1 + j - c0];
                } else {
                    cur = i2[i * c2 + j - c0 - c1];
                }
                o[i * c + j] = cur;
            }
        }
    };
    Checker checker{make_graph, fwd, cns[0]};
    checker.run({TensorShape{2, 3}, {2, 4}, {2, 5}})
            .run({TensorShape{2, 8}, {2, 3}, {2, 9}})
            .run({TensorShape{5, 10}, {5, 3}, {5, 4}});
}
TEST(TestTensorManip, ConcatWithNegativeAxis) {
    auto cns = load_multiple_xpus(4);
    using Checker = AutoOprChecker<3, 1>;
    auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        auto sub0 = inputs[0], sub1 = opr::Copy::make(inputs[1], cns[1]),
             sub2 = opr::Copy::make(inputs[2], cns[2]),
             ret = opr::Concat::make({sub0, sub1, sub2}, -1, cns[3]);
        return {opr::Copy::make(ret, cns[0])};
    };
    auto fwd = [](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        size_t n = inp[0]->shape(0), c0 = inp[0]->shape(1), c1 = inp[1]->shape(1),
               c2 = inp[2]->shape(1), c = c0 + c1 + c2;
        auto i0 = inp[0]->ptr<float>(), i1 = inp[1]->ptr<float>(),
             i2 = inp[2]->ptr<float>(), o = dest[0].resize({n, c}).ptr<float>();
        for (size_t i = 0; i < n; ++i) {
            for (size_t j = 0; j < c; ++j) {
                float cur;
                if (j < c0) {
                    cur = i0[i * c0 + j];
                } else if (j < c0 + c1) {
                    cur = i1[i * c1 + j - c0];
                } else {
                    cur = i2[i * c2 + j - c0 - c1];
                }
                o[i * c + j] = cur;
            }
        }
    };
    Checker checker{make_graph, fwd, cns[0]};
    checker.run({TensorShape{2, 3}, {2, 4}, {2, 5}})
            .run({TensorShape{2, 8}, {2, 3}, {2, 9}})
            .run({TensorShape{5, 10}, {5, 3}, {5, 4}});
}
TEST(TestTensorManip, ConcatEmpty) {
    HostTensorGenerator<> gen;
    auto host_x = gen({2, 3, 5}), host_y = gen({2, 0, 5});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Host2DeviceCopy::make(*graph, host_y),
         z = opr::Concat::make({x, y}, 1);
    HostTensorND host_z;
    auto func = graph->compile({make_callback_copy(z, host_z)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(*host_x, host_z);
    host_x->resize({2, 0, 5});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(*host_y, host_z);
}
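
// Concat of two empty tensors: only the shape can be deduced, and the
// result is expected to be empty as well.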
TEST(TestTensorManip, ConcatEmpty2) {
    HostTensorGenerator<> gen;
    auto host_x = gen({2, 0, 5}), host_y = gen({2, 0, 6});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Host2DeviceCopy::make(*graph, host_y),
         z = opr::Concat::make({x, y}, 2);
    HostTensorND host_z;
    auto func = graph->compile({make_callback_copy(z, host_z)});
    func->execute();
    ASSERT_EQ(TensorShape({2, 0, 11}), host_z.shape());
}
#if MGB_OPENCL
#include "megcore_opencl.h"

#define REQUIRE_OPENCL()                                                 \
    do {                                                                 \
        if (!CompNode::get_device_count(CompNode::DeviceType::OPENCL)) { \
            return;                                                      \
        }                                                                \
    } while (0)

TEST(TestTensorManip, ConcatCD4) {
    REQUIRE_OPENCL();
    auto cn = CompNode::load("openclx");
    HostTensorGenerator<> gen;
    auto host_x = gen({1, 4, 2, 2}, cn), host_y = gen({1, 4, 2, 2}, cn);
    auto graph0 = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph0, host_x);
    auto y = opr::Host2DeviceCopy::make(*graph0, host_y);
    x = opr::RelayoutFormat::make(x, {opr::RelayoutFormat::Param::Mode::NCHW_NHWCD4I});
    y = opr::RelayoutFormat::make(y, {opr::RelayoutFormat::Param::Mode::NCHW_NHWCD4I});
    auto z = opr::Concat::make({x, y}, 2);
    HostTensorND host_z0;
    auto func = graph0->compile({make_callback_copy(z, host_z0)});
    func->execute();
    ASSERT_EQ(TensorShape({1, 2, 2, 2, 4}), host_z0.shape());
    auto graph1 = ComputingGraph::make();
    x = opr::Host2DeviceCopy::make(*graph1, host_x);
    y = opr::Host2DeviceCopy::make(*graph1, host_y);
    z = opr::RelayoutFormat::make(
            opr::Concat::make({x, y}, 1),
            {opr::RelayoutFormat::Param::Mode::NCHW_NHWCD4I});
    HostTensorND host_z1;
    func = graph1->compile({make_callback_copy(z, host_z1)});
    func->execute();
    MGB_ASSERT_TENSOR_EQ(host_z0, host_z1);
}
#endif
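
// AxisAddRemove with both static and dynamic input shapes, including the
// empty-tensor case; add(0) prepends a unit axis and remove(1) drops one.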
TEST(TestTensorManip, AxisAddRemove) {
    HostTensorGenerator<> gen;
    for (bool dyn_shape : {false, true}) {
        auto host_x = gen({2, 1, 5});
        using AD = opr::AxisAddRemove::AxisDesc;
        auto graph = ComputingGraph::make();
        auto x = opr::Host2DeviceCopy::make(*graph, host_x);
        if (dyn_shape) {
            x = opr::MarkDynamicVar::make(x);
        }
        auto y = opr::AxisAddRemove::make(x, {AD::make_add(0)}),
             z = opr::AxisAddRemove::make(x, {AD::make_remove(1)});
        HostTensorND host_y, host_z;
        auto func = graph->compile(
                {make_callback_copy(y, host_y), make_callback_copy(z, host_z)});
        func->execute();
        ASSERT_EQ(TensorShape({1, 2, 1, 5}), host_y.shape());
        ASSERT_EQ(TensorShape({2, 5}), host_z.shape());
        MGB_ASSERT_TENSOR_EQ(*host_x, host_y.resize(host_x->shape()));
        MGB_ASSERT_TENSOR_EQ(*host_x, host_z.resize(host_x->shape()));
        // test empty tensor
        host_x->resize({2, 1, 0});
        func->execute();
        ASSERT_EQ(TensorShape({1, 2, 1, 0}), host_y.shape());
        ASSERT_EQ(TensorShape({2, 0}), host_z.shape());
    }
}
TEST(TestTensorManip, Split) {
    auto cns = load_multiple_xpus(3);
    constexpr size_t C1 = 20, C2 = 30;
    constexpr size_t N = 2, C = C1 + C2;
    HostTensorGenerator<> gen;
    auto host_opr0 = gen({N, C}, cns[0]);
    auto graph = ComputingGraph::make();
    SymbolVar opr0 = opr::Host2DeviceCopy::make(*graph, host_opr0, {"opr0"});
    auto spl = opr::Split::make(
            opr0, Split::Options::make_partition(opr0, 1, {C1, C2}),
            OperatorNodeConfig("split").comp_node_arr({cns[1], cns[2]}));
    auto cost0 = opr::Dot::make(spl[0].flatten(), spl[0].flatten()),
         cost1_ = opr::Dot::make(spl[1].flatten(), spl[1].flatten()),
         cost1 = opr::Copy::make(cost1_, OperatorNodeConfig().follow_comp_node(cost0)),
         cost = opr::Copy::make(
                 cost0 + cost1, OperatorNodeConfig().follow_comp_node(opr0)),
         grad = cg::grad(cost, opr0);
    HostTensorND host_spl0, host_spl1, host_grad;
    auto func = graph->compile(
            {{spl[0], [&](DeviceTensorND& s) { host_spl0.copy_from(s); }},
             {spl[1], [&](DeviceTensorND& s) { host_spl1.copy_from(s); }},
             {grad, [&](DeviceTensorND& s) { host_grad.copy_from(s); }}});
    func->execute();
    auto o0 = host_spl0.sync().ptr<float>(), o1 = host_spl1.sync().ptr<float>(),
         c = host_opr0->ptr<float>(), g = host_grad.sync().ptr<float>();
    for (size_t i = 0, it = host_opr0->layout().total_nr_elems(); i < it; i++) {
        auto ch = i % C;
        auto n = i / C;
        if (ch < C1) {
            MGB_ASSERT_FLOAT_EQ(o0[n * C1 + ch], c[i]) << ssprintf("failed at %zd", i);
        } else {
            MGB_ASSERT_FLOAT_EQ(o1[n * C2 + ch - C1], c[i])
                    << ssprintf("failed at %zd", i);
        }
        MGB_ASSERT_FLOAT_EQ(c[i] * 2, g[i]) << ssprintf("grad failed at %zd", i);
    }
}
TEST(TestTensorManip, SplitWithNegativeAxis) {
    auto cns = load_multiple_xpus(3);
    constexpr size_t C1 = 20, C2 = 30;
    constexpr size_t N = 2, C = C1 + C2;
    HostTensorGenerator<> gen;
    auto host_opr0 = gen({N, C}, cns[0]);
    auto graph = ComputingGraph::make();
    SymbolVar opr0 = opr::Host2DeviceCopy::make(*graph, host_opr0, {"opr0"});
    auto spl = opr::Split::make(
            opr0, Split::Options::make_partition(opr0, -1, {C1, C2}),
            OperatorNodeConfig("split").comp_node_arr({cns[1], cns[2]}));
    auto cost0 = opr::Dot::make(spl[0].flatten(), spl[0].flatten()),
         cost1_ = opr::Dot::make(spl[1].flatten(), spl[1].flatten()),
         cost1 = opr::Copy::make(cost1_, OperatorNodeConfig().follow_comp_node(cost0)),
         cost = opr::Copy::make(
                 cost0 + cost1, OperatorNodeConfig().follow_comp_node(opr0)),
         grad = cg::grad(cost, opr0);
    HostTensorND host_spl0, host_spl1, host_grad;
    auto func = graph->compile(
            {{spl[0], [&](DeviceTensorND& s) { host_spl0.copy_from(s); }},
             {spl[1], [&](DeviceTensorND& s) { host_spl1.copy_from(s); }},
             {grad, [&](DeviceTensorND& s) { host_grad.copy_from(s); }}});
    func->execute();
    auto o0 = host_spl0.sync().ptr<float>(), o1 = host_spl1.sync().ptr<float>(),
         c = host_opr0->ptr<float>(), g = host_grad.sync().ptr<float>();
    for (size_t i = 0, it = host_opr0->layout().total_nr_elems(); i < it; i++) {
        auto ch = i % C;
        auto n = i / C;
        if (ch < C1) {
            MGB_ASSERT_FLOAT_EQ(o0[n * C1 + ch], c[i]) << ssprintf("failed at %zd", i);
        } else {
            MGB_ASSERT_FLOAT_EQ(o1[n * C2 + ch - C1], c[i])
                    << ssprintf("failed at %zd", i);
        }
        MGB_ASSERT_FLOAT_EQ(c[i] * 2, g[i]) << ssprintf("grad failed at %zd", i);
    }
}
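
// The second partition size goes through MarkDynamicVar, so the output
// shapes can only be determined at runtime.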
TEST(TestTensorManip, SplitToDynOutShape) {
    using Checker = AutoOprChecker<1, 2>;
    auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        auto x = inputs[0];
        auto y = opr::Split::make(
                x, opr::Split::Options::make_partition(
                           0, {x.make_scalar(3),
                               opr::MarkDynamicVar::make(
                                       opr::GetVarShape::make(x, 0) -
                                       x.make_scalar(3))}));
        return {y[0], y[1]};
    };
    auto fwd = [](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        auto sub = [&](size_t begin, Maybe<ptrdiff_t> end) {
            auto&& iv = inp[0];
            return iv->sub(Slice(begin, end, None).apply(iv->layout(), 0));
        };
        dest[0].copy_from(sub(0, 3));
        dest[1].copy_from(sub(3, None));
    };
    Checker{make_graph, fwd}
            .run({TensorShape{5}})
            .run({TensorShape{8}})
            .run({TensorShape{9, 3}});
}
TEST(TestTensorManip, SplitToDynOutStorage) {
    using Checker = AutoOprChecker<1, 2>;
    auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        auto x = inputs[0];
        auto y = opr::Split::make(
                x, opr::Split::Options::make_partition(
                           0, {x.make_scalar(3),
                               opr::GetVarShape::make(x, 0) - x.make_scalar(3)}));
        auto y0 = opr::Copy::make(y[0], x.node()->comp_node().change_stream(1));
        y0 = opr::Copy::make(y0, x.node()->comp_node());
        return {y0, y[1]};
    };
    auto fwd = [](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        auto sub = [&](size_t begin, Maybe<ptrdiff_t> end) {
            auto&& iv = inp[0];
            return iv->sub(Slice(begin, end, None).apply(iv->layout(), 0));
        };
        dest[0].copy_from(sub(0, 3));
        dest[1].copy_from(sub(3, None));
    };
    Checker{make_graph, fwd}
            .run({TensorShape{5}})
            .run({TensorShape{8}})
            .run({TensorShape{9, 3}});
}
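
// Shared driver for the DynamicSplit tests below: splits a runtime-sized
// subtensor in half via a size callback, optionally placing the outputs on
// multiple comp nodes and/or forcing the input to be dynamic, then checks
// memory forwarding, output values and gradients over several slice ranges.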
namespace {
void do_test_dynamic_split(bool multiple_cn, bool force_dynamic) {
    auto cns = load_multiple_xpus(3);
    constexpr size_t N = 2, C = 51;
    HostTensorGenerator<> gen;
    auto host_x = gen({N, C}, cns[0]), host_sub_begin = gen({1}, cns[0]),
         host_sub_end = gen({1}, cns[0]);
    host_sub_begin->ptr<float>()[0] = 0;
    host_sub_end->ptr<float>()[0] = 2;
    auto graph = ComputingGraph::make();
    SymbolVar x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x, {"x"}),
              sub_begin = opr::Host2DeviceCopy::make_no_fwd(
                      *graph, host_sub_begin, {"sub_begin"}),
              sub_end = opr::Host2DeviceCopy::make_no_fwd(
                      *graph, host_sub_end, {"sub_end"}),
              xsub = opr::Subtensor::make(
                             x, {opr::Subtensor::AxisIndexer::make_interval(
                                        1, sub_begin, sub_end, None)})
                             .rename("xsub");
    OperatorNodeConfig split_config("split");
    if (multiple_cn) {
        split_config.comp_node_arr({cns[1], cns[2]});
    }
    if (force_dynamic)
        xsub = opr::MarkDynamicVar::make(xsub);
    auto spl = opr::Split::make(
            xsub,
            Split::Options::make_callback(
                    1, 2,
                    [](size_t s) {
                        return std::vector<size_t>{s / 2, s - s / 2};
                    }),
            split_config);
    if (multiple_cn) {
        spl[0] = opr::Sleep::make(spl[0], 0.1);
        spl[1] = opr::Sleep::make(spl[1], 0.2);
    }
    auto cost0 = opr::Dot::make(spl[0].flatten(), spl[0].flatten()),
         cost1_ = opr::Dot::make(spl[1].flatten(), spl[1].flatten()),
         cost1 = opr::Copy::make(cost1_, OperatorNodeConfig().follow_comp_node(cost0)),
         cost = opr::Copy::make(
                        cost0 + cost1, OperatorNodeConfig().follow_comp_node(x)) *
                0.5f,
         grad = cg::grad(cost, x);
    HostTensorND host_spl0, host_spl1, host_grad;
    auto func = graph->compile(
            {make_callback_copy(spl[0], host_spl0),
             make_callback_copy(spl[1], host_spl1),
             make_callback_copy(grad, host_grad)});
    if (force_dynamic)
        ASSERT_TRUE(!cg::is_static_var_shape(spl[0].node()));
    else {
        auto cb = [](cg::OperatorNodeBase* op) {
            for (auto i : op->output()) {
                mgb_assert(
                        cg::is_static_var_shape(i), "dynamic var: %s",
                        cg::dump_var_info({i}).c_str());
            }
            return true;
        };
        func->iter_opr_seq(cb);
    }
    bool failed = false, fwd_checked = false;
    auto run_and_check = [&](size_t begin, size_t end) {
        ASSERT_FALSE(failed);
        failed = true;
        host_sub_begin->ptr<float>()[0] = begin;
        host_sub_end->ptr<float>()[0] = end;
        func->execute();
        auto mid = begin + (end - begin) / 2;
        auto inp = host_x->ptr<float>(), grad = host_grad.ptr<float>();
        ASSERT_EQ(host_spl0.shape(), TensorShape({N, mid - begin}));
        ASSERT_EQ(host_spl1.shape(), TensorShape({N, end - mid}));
        if (!force_dynamic && !multiple_cn && !begin && mid - begin == 1) {
            // check mem fwd for spl[0]
            // do not check for spl[1] since flatten() causes copy
            ASSERT_EQ(
                    prev_dev_ptr(spl[0]),
                    static_cast<const dt_float32*>(prev_dev_ptr(x)));
            fwd_checked = true;
        }
        for (size_t i = 0, it = host_x->layout().total_nr_elems(); i < it; ++i) {
            auto ch = i % C;
            auto n = i / C;
            float expect_grad;
            if (ch >= begin && ch < mid) {
                MGB_ASSERT_FLOAT_EQ(inp[i], *host_spl0.ptr<float>({n, ch - begin}))
                        << ssprintf(
                                   "failed at (%zu, %zu), sub=x[:, %zu:%zu]", i, ch,
                                   begin, end);
                expect_grad = inp[i];
            } else if (ch >= mid && ch < end) {
                MGB_ASSERT_FLOAT_EQ(inp[i], *host_spl1.ptr<float>({n, ch - mid}))
                        << ssprintf(
                                   "failed at (%zu, %zu), sub=x[:, %zu:%zu]", i, ch,
                                   begin, end);
                expect_grad = inp[i];
            } else {
                expect_grad = 0;
            }
            MGB_ASSERT_FLOAT_EQ(expect_grad, grad[i]) << ssprintf(
                    "grad failed at (%zu, %zu), sub=x[:, %zu:%zu]", n, ch, begin, end);
        }
        failed = false;
    };
    run_and_check(0, 3);
    run_and_check(2, 8);
    run_and_check(5, 12);
    run_and_check(1, C - 1);
    run_and_check(0, C);
    run_and_check(C - 2, C);
    run_and_check(0, 2);
    if (!multiple_cn && !force_dynamic) {
        ASSERT_TRUE(fwd_checked);
    }
}
} // namespace
TEST(TestTensorManip, DynamicSplit00) {
    do_test_dynamic_split(false, false);
}

TEST(TestTensorManip, DynamicSplit01) {
    do_test_dynamic_split(false, true);
}

TEST(TestTensorManip, DynamicSplit10) {
    do_test_dynamic_split(true, false);
}

TEST(TestTensorManip, DynamicSplit11) {
    do_test_dynamic_split(true, true);
}
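
// Reshape of a dynamic-storage var yields a statically shaped (but
// dynamically stored) var; Split should still work on it.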
TEST(TestTensorManip, SplitFromDynStorage) {
    HostTensorGenerator<> gen;
    auto host_x = gen({4});
    auto graph = cg::ComputingGraph::make();
    auto x = opr::MarkDynamicVar::make(opr::Host2DeviceCopy::make(*graph, host_x))
                     .reshape({4});
    ASSERT_TRUE(cg::is_static_var_shape(x.node()));
    auto y = opr::Split::make(x, opr::Split::Options::make_partition(x, 0, {1, 3}));
    HostTensorND y0, y1;
    auto func = graph->compile(
            {make_callback_copy(y[0], y0), make_callback_copy(y[1], y1)});
    func->execute();
    ASSERT_FALSE(cg::is_static_var_storage(x.node()));
    HostTensorND expt{host_x->comp_node(), host_x->dtype()};
    expt.resize({1}).ptr<float>()[0] = host_x->ptr<float>()[0];
    MGB_ASSERT_TENSOR_EQ(expt, y0);
    expt.resize({3});
    for (int i = 0; i < 3; ++i)
        expt.ptr<float>()[i] = host_x->ptr<float>()[i + 1];
    MGB_ASSERT_TENSOR_EQ(expt, y1);
}
TEST(TestTensorManip, SplitPreAllocatedMultiCN) {
    auto cns = load_multiple_xpus(3);
    HostTensorGenerator<> gen;
    auto host_x = gen({3}, cns[0]);
    auto dev_x = std::make_shared<DeviceTensorND>();
    dev_x->copy_from(*host_x).sync();
    auto graph = cg::ComputingGraph::make();
    auto x = opr::SharedDeviceTensor::make(*graph, dev_x);
    auto ys = opr::Split::make(
            x, opr::Split::Options::make_average(0, 3),
            OperatorNodeConfig{}.comp_node_arr({cns.begin(), cns.end()}));
    ASSERT_EQ(3u, ys.size());
    HostTensorND y0, y1, y2;
    auto func = graph->compile(
            {make_callback_copy(ys[0], y0),
             make_callback_copy(opr::Copy::make(ys[1], {cns[0]}), y1),
             make_callback_copy(ys[2], y2)});
    func->execute();
    ASSERT_TRUE(cg::is_static_var_storage(ys[0].node()));
    ASSERT_FALSE(cg::is_static_var_storage(ys[1].node()));
    ASSERT_EQ(x.node()->prev_dev_ptr(), ys[0].node()->prev_dev_ptr());
    ASSERT_EQ(host_x->ptr<float>()[0], y0.ptr<float>()[0]);
    ASSERT_EQ(host_x->ptr<float>()[1], y1.ptr<float>()[0]);
    ASSERT_EQ(host_x->ptr<float>()[2], y2.ptr<float>()[0]);
}
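
// Splitting a subtensor that starts at offset 1 of the original buffer:
// the outputs presumably cannot use fully static storage, but the last
// chunk should still be memory-forwarded from the (offset) input buffer.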
TEST(TestTensorManip, SplitMemfwdMultipleTimesWithOffset) {
    auto cns = load_multiple_xpus(2);
    HostTensorGenerator<> gen;
    auto host_x = gen({4}, cns[0]);
    auto graph = cg::ComputingGraph::make();
    auto x0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x),
         x = opr::Subtensor::make(
                 x0, {opr::Subtensor::AxisIndexer::make_interval(
                             0, x0.make_scalar(1), None, None)});
    auto ys = opr::Split::make(x, opr::Split::Options::make_average(0, 3));
    ASSERT_EQ(3u, ys.size());
    HostTensorND y0, y1, y2;
    auto func = graph->compile(
            {make_callback_copy(ys[0], y0),
             make_callback_copy(opr::Copy::make(ys[1], {cns[1]}), y1),
             make_callback_copy(ys[2], y2)});
    func->execute();
    ASSERT_FALSE(cg::is_static_var_storage(ys[0].node()));
    ASSERT_TRUE(cg::is_static_var_shape(ys[0].node()));
    ASSERT_FALSE(cg::is_static_var_storage(ys[1].node()));
    ASSERT_EQ(host_x->ptr<float>()[1], y0.ptr<float>()[0]);
    ASSERT_EQ(host_x->ptr<float>()[2], y1.ptr<float>()[0]);
    ASSERT_EQ(host_x->ptr<float>()[3], y2.ptr<float>()[0]);
    ASSERT_EQ(static_cast<const float*>(prev_dev_ptr(x0)) + 3, prev_dev_ptr(ys[2]));
}
TEST(TestTensorManip, SplitValueInfer) {
    auto cns = load_multiple_xpus(3);
    HostTensorGenerator<> gen;
    auto host_x = gen({3});
    auto graph = cg::ComputingGraph::make();
    auto x = opr::ImmutableTensor::make(*graph, *host_x);
    auto ys = opr::Split::make(
            x, opr::Split::Options::make_average(0, 3),
            OperatorNodeConfig{}.comp_node_arr({cns.begin(), cns.end()}));
    for (size_t i = 0; i < 3; ++i) {
        // split itself does not replace imm vars; use +0 to trigger optimizer
        auto var = (ys[i] + 0).node();
        ASSERT_TRUE(var->owner_opr()->same_type<opr::ImmutableTensor>());
        ASSERT_EQ(cns[i], var->comp_node());
        HostTensorND hv;
        hv.copy_from(var->owner_graph()->static_infer_manager().infer_value(var));
        ASSERT_EQ(TensorShape{1}, hv.shape());
        ASSERT_EQ(host_x->ptr<float>()[i], hv.ptr<float>()[0]);
    }
}
TEST(TestTensorManip, SplitZeroGrad) {
    HostTensorGenerator<> gen;
    auto host_x = gen({3, 2});
    auto graph = cg::ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto ys = opr::Split::make(x, opr::Split::Options::make_average(0, 3));
    auto loss = opr::reduce_sum(ys[2] * ys[2], x.make_scalar(1)),
         gx = cg::grad(loss, x);
    HostTensorND host_gx;
    auto func = graph->compile({make_callback_copy(gx, host_gx)});
    func->execute();
    auto px = host_x->ptr<float>(), pgx = host_gx.ptr<float>();
    for (int i = 0; i < 2; ++i) {
        MGB_ASSERT_FLOAT_EQ(0.f, pgx[i]);
        MGB_ASSERT_FLOAT_EQ(0.f, pgx[2 + i]);
        MGB_ASSERT_FLOAT_EQ(px[4 + i] * 2, pgx[4 + i]);
    }
}
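
// fill_retain_dtype on a dynamic var: the output shape follows the input
// at runtime and every element equals the fill value.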
TEST(TestTensorManip, DynamicFill) {
    HostTensorGenerator<> gen;
    auto host_x = gen({1});
    auto graph = cg::ComputingGraph::make();
    auto x = opr::MarkDynamicVar::make(opr::Host2DeviceCopy::make(*graph, host_x)),
         y = x.fill_retain_dtype(23);
    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    bool failed = false;
    auto check = [&](const TensorShape& ishp) {
        ASSERT_FALSE(failed);
        failed = true;
        host_x->resize(ishp);
        func->execute();
        ASSERT_EQ(ishp, host_y.shape());
        auto ptr = host_y.ptr<float>();
        for (size_t i = 0, it = host_y.shape().total_nr_elems(); i < it; ++i)
            ASSERT_EQ(23, ptr[i]);
        failed = false;
    };
    check({4, 2});
    check({2, 4});
    check({23});
}
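
// Builds PH x PW sum-pooling out of Subtensor/SetSubtensor: each of the
// PH*PW phase-shifted strided views of x is written into one slice of a
// temporary, which is then reduced over axis 0. Exercised both with fully
// static vars and with a mix of dynamically marked ones.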
TEST(TestTensorManip, Pooling2DBySetSub) {
    constexpr int PH = 4, PW = 3;
    using Checker = AutoOprChecker<1, 1>;
    bool run_dyn = false;
    auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        auto x = inputs.at(0);
        if (run_dyn)
            x = opr::MarkDynamicVar::make(x);
        x.rename("x");
        auto cv = [&](int v, bool dyn = false) {
            auto rst = x.make_scalar(v);
            if (dyn)
                rst = opr::MarkDynamicVar::make(rst);
            return rst;
        };
        auto oh = (opr::GetVarShape::make(x, 0) / PH).rename("oh"),
             ow = (opr::GetVarShape::make(x, 1) / PW).rename("ow"),
             y_tmp_shape = opr::Concat::make({cv(PH * PW), oh, ow}, 0),
             y_tmp = opr::Alloc::make(y_tmp_shape, dtype::Float32());
        if (!run_dyn)
            mgb_assert(cg::is_static_var_storage(y_tmp.node()));
        using Ad = opr::Subtensor::AxisIndexer;
        for (size_t i = 0, num = 0; i < (size_t)PH; ++i) {
            for (size_t j = 0; j < (size_t)PW; ++j) {
                bool dyn = run_dyn && num % 2;
                auto xsub = opr::Subtensor::make(
                                    x, {Ad::make_interval(0, cv(i, dyn), None, cv(PH)),
                                        Ad::make_interval(1, cv(j), None, cv(PW))})
                                    .rename(ssprintf("sub(%zu, %zu)", i, j));
                y_tmp = opr::SetSubtensor::make(
                                y_tmp, xsub, {Ad::make_index(0, cv(num, dyn))})
                                .rename(ssprintf("y(%zu, %zu)", i, j));
                if (!run_dyn) {
                    mgb_assert(cg::is_static_var_storage(xsub.node()));
                    mgb_assert(cg::is_static_var_storage(y_tmp.node()));
                } else if (dyn)
                    y_tmp = opr::MarkDynamicVar::make(y_tmp);
                ++num;
            }
        }
        auto y = opr::Reduce::make(y_tmp, {opr::Reduce::Mode::SUM, 0});
        y = opr::AxisAddRemove::make(y, {opr::AxisAddRemove::AxisDesc::make_remove(0)});
        if (!run_dyn)
            mgb_assert(cg::is_static_var_storage(y.node()));
        return {y};
    };
    auto fwd = [](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        auto&& ishp = inp.at(0)->shape();
        auto oshp = ishp;
        mgb_assert(oshp.shape[0] % PH == 0);
        mgb_assert(oshp.shape[1] % PW == 0);
        oshp.shape[0] /= PH;
        oshp.shape[1] /= PW;
        auto optr = dest.at(0).comp_node(inp[0]->comp_node()).resize(oshp).ptr<float>();
        auto&& iv = *inp.at(0);
        for (size_t i = 0; i < oshp.shape[0]; ++i)
            for (size_t j = 0; j < oshp.shape[1]; ++j) {
                auto ii = i * PH, ij = j * PW;
                float sum = 0;
                for (size_t di = 0; di < PH; ++di)
                    for (size_t dj = 0; dj < PW; ++dj) {
                        sum += *iv.ptr<float>({ii + di, ij + dj});
                    }
                *(optr++) = sum;
            }
    };
    auto run = [&](bool dyn) {
        run_dyn = dyn;
        Checker(make_graph, fwd)
                .run({TensorShape{PH * 1, PW * 2}})
                .run({TensorShape{PH * 4, PW * 3}})
                .run({TensorShape{PH * 2, PW * 2}});
    };
    run(false);
    run(true);
}
TEST(TestTensorManip, Flatten) {
    HostTensorGenerator<> gen;
    auto host_x = gen({20});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x).rename("x"), y = x.flatten();
    y = y + x.reshape(y.symshape());
    ASSERT_EQ(TensorShape{20}, y.node()->shape());
    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    for (auto&& ishp : {TensorShape{2, 5}, TensorShape{6, 8, 1}, TensorShape{3}}) {
        *host_x = *gen(ishp);
        func->execute();
        auto expected = host_x->sub(SubTensorSpec::make_from_layout(
                {{ishp.total_nr_elems()}, host_x->dtype()}));
        auto ptr = expected.ptr<float>();
        for (size_t i = 0; i < expected.shape()[0]; ++i)
            ptr[i] *= 2;
        MGB_ASSERT_TENSOR_EQ(expected, host_y);
    }
}
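
// fill_retain_dtype results should be deduplicated by value: equal fill
// constants (whether given as int or float) must map to the same var node.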
TEST(TestTensorManip, FillWithDtypeDedup) {
    HostTensorGenerator<> gen;
    auto host_x = gen({20});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    SymbolVar vals[] = {
            x.fill_retain_dtype(0),   x.fill_retain_dtype(1),
            x.fill_retain_dtype(0),   x.fill_retain_dtype(1),
            x.fill_retain_dtype(0.f), x.fill_retain_dtype(1.f),
            x.fill_retain_dtype(0.f), x.fill_retain_dtype(1.f),
    };
    for (int i : {0, 1})
        for (int j = 2; j < 8; j += 2)
            ASSERT_EQ(vals[i].node(), vals[i + j].node()) << i << ' ' << i + j;
    ASSERT_NE(vals[0].node(), vals[1].node());
}
TEST(TestTensorManip, StrongContig) {
    HostTensorGenerator<> gen;
    auto host_x = gen({5, 1});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::Dimshuffle::make(x, {1, 0});
    auto cb = [](DeviceTensorND& dv) {
        TensorLayout expect{{1, 5}, dv.dtype()};
        ASSERT_EQ(expect, dv.layout());
    };
    auto func = graph->compile({{y, cb}});
    func->execute();
}
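
// Helpers shared by the TestParamPack cases below: they rebuild reference
// offsets with what appears to be the same alignment rule used by
// megdnn::ParamPackConcat::gen_offsets, i.e. each packed tensor starts at
// the next aligned element offset.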
namespace {
void test_param_pack_concat(const TensorShapeArray& shapes, DType type) {
    auto cn = CompNode::load("xpu0");
    auto graph = ComputingGraph::make();
    auto align = cn.get_mem_addr_alignment() / type.size();
    size_t size = 0;
    std::vector<size_t> begins;
    for (auto&& shape : shapes) {
        // each tensor starts at the next aligned element offset
        size = get_aligned_power2(size, align);
        begins.push_back(size);
        size += shape.total_nr_elems();
    }
    SmallVector<SymbolVar> srcs;
    for (size_t i = 0; i < shapes.size(); i++) {
        auto data = std::make_shared<HostTensorND>();
        data->comp_node(cn).dtype(dtype::Int32()).resize(shapes[i]);
        auto ptr = data->ptr<dt_int32>();
        for (size_t j = 0; j < shapes[i].total_nr_elems(); j++) {
            ptr[j] = j;
        }
        auto nd = opr::Host2DeviceCopy::make(*graph, data);
        srcs.push_back(nd);
    }
    auto host_offsets_gen = megdnn::ParamPackConcat::gen_offsets(
            shapes, cn.get_mem_addr_alignment(), 4);
    ASSERT_EQ(host_offsets_gen.back(), size);
    auto host_offsets = std::make_shared<HostTensorND>();
    host_offsets->comp_node(cn).dtype(dtype::Int32{}).resize({srcs.size() * 2});
    // two int32 offsets per source tensor, i.e. 8 bytes each
    memcpy(host_offsets->raw_ptr(), host_offsets_gen.data(), srcs.size() * 8);
    auto offsets = opr::Host2DeviceCopy::make(*graph, host_offsets);
    auto z = opr::ParamPackConcat::make(srcs, offsets, host_offsets_gen);
    HostTensorND host_z;
    auto func = graph->compile({make_callback_copy(z, host_z)});
    func->execute();
    HostTensorND expected;
    expected.comp_node(cn).dtype(dtype::Int32()).resize({size});
    {
        auto ptr = expected.ptr<dt_int32>();
        memset(ptr, 0, sizeof(int32_t) * size);
        for (size_t i = 0; i < begins.size(); i++) {
            auto begin = begins[i];
            auto shape = shapes[i];
            for (size_t j = 0; j < shape.total_nr_elems(); j++) {
                ptr[begin + j] = j;
            }
        }
    }
    MGB_ASSERT_TENSOR_EQ(expected, host_z);
}
template <size_t nr_out>
void test_param_pack_split(const TensorShapeArray& shapes) {
    auto cn = CompNode::load("xpu0");
    auto align = std::max<size_t>(cn.get_mem_addr_alignment() / 4, 1);
    size_t concat_size = 0;
    mgb_assert(shapes.size() == nr_out);
    for (auto&& i : shapes) {
        concat_size = get_aligned_power2(concat_size, align) + i.total_nr_elems();
    }
    using Checker = AutoOprChecker<1, nr_out>;
    auto make_graph = [&](const typename Checker::SymInpArray& inputs) ->
            typename Checker::SymOutArray {
        auto offsets_val = megdnn::ParamPackConcat::gen_offsets(
                shapes, cn.get_mem_addr_alignment(), 4);
        HostTensorND offsets;
        std::copy_n(
                offsets_val.data(), offsets_val.size(),
                offsets.dtype(dtype::Int32{})
                        .comp_node(cn)
                        .resize({offsets_val.size()})
                        .ptr<dt_int32>());
        auto out = opr::ParamPackSplit::make(inputs[0], offsets_val, shapes);
        mgb_assert(out.size() == nr_out);
        typename Checker::SymOutArray ret;
        for (size_t i = 0; i < nr_out; ++i) {
            ret[i] = out[i];
        }
        return ret;
    };
    auto fwd = [&](typename Checker::NumOutArray& dest,
                   typename Checker::NumInpArray inp) {
        size_t offset = 0;
        auto ptr = inp[0]->template ptr<float>();
        for (size_t i = 0; i < nr_out; ++i) {
            dest[i].resize(shapes[i]);
            offset = get_aligned_power2(offset, align);
            auto nr_elem = shapes[i].total_nr_elems();
            memcpy(dest[i].template ptr<float>(), ptr + offset, nr_elem * 4);
            offset += nr_elem;
        }
    };
    Checker{make_graph, fwd}
            .run({TensorShape{concat_size}})
            .run({TensorShape{concat_size}})
            .run({TensorShape{concat_size}});
}
} // anonymous namespace
TEST(TestParamPack, Concat) {
    TensorShapeArray array = {{129}, {21}};
    test_param_pack_concat(array, dtype::Int32());
    array = {{23}, {32}, {75}, {45}};
    test_param_pack_concat(array, dtype::Int32());
    array = {{129}, {512}, {513}, {27}};
    test_param_pack_concat(array, dtype::Int32());
}

TEST(TestParamPack, Split) {
    test_param_pack_split<2>({{2, 3}, {4, 5, 6}});
    test_param_pack_split<3>({{2, 9}, {123}, {5, 3}});
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}