|
@@ -719,6 +719,164 @@ TEST(TestOprCollectiveComm, ReduceSumWithGrad) {
    MGB_ASSERT_TENSOR_EQ(*host_grad, host_out_grad1);
}

TEST(TestOprCollectiveComm, Gather) {
    REQUIRE_GPU(2);
    auto cn0 = CompNode::load("gpu0");
    auto cn1 = CompNode::load("gpu1");

    HostTensorGenerator<> gen;
    auto host_x0 = gen({28, 28});
    auto host_x1 = gen({28, 28});
    HostTensorND host_y0, host_y1, host_y_expect;

    auto client = std::make_shared<test::MockGroupClient>();
    auto graph = ComputingGraph::make();

    auto x0 = opr::Host2DeviceCopy::make(*graph, host_x0, cn0);
    auto x1 = opr::Host2DeviceCopy::make(*graph, host_x1, cn0);
    auto x1c = opr::Copy::make(x1, cn1);

    auto y0 = opr::CollectiveComm::make({x0}, graph.get(), "gather",
            2, true, 0, client, {Mode::GATHER}, dtype::Float32(), "nccl")[0];
    auto y1 = opr::CollectiveComm::make({x1c}, graph.get(), "gather",
            2, false, 1, client, {Mode::GATHER}, dtype::Float32(), "nccl")[0];
    auto y_expect = opr::Concat::make({x0, x1}, 0);

    auto func = graph->compile({make_callback_copy(y0, host_y0),
                                make_callback_copy(y1, host_y1),
                                make_callback_copy(y_expect, host_y_expect)});
    func->execute();

    MGB_ASSERT_TENSOR_EQ(host_y_expect, host_y0);
}

TEST(TestOprCollectiveComm, GatherMultiThread) {
    REQUIRE_GPU(2);
    auto cn0 = CompNode::load("gpu0");
    auto cn1 = CompNode::load("gpu1");

    HostTensorGenerator<> gen;
    auto host_x0 = gen({28, 28});
    auto host_x1 = gen({28, 28});
    HostTensorND host_y0, host_y_expect;

    auto client = std::make_shared<test::MockGroupClient>();

    auto run_0 = [&]() { // rank 0
        auto graph0 = ComputingGraph::make();
        auto x0 = opr::Host2DeviceCopy::make(*graph0, host_x0, cn0);
        auto y0 = opr::CollectiveComm::make({x0}, graph0.get(), "gather", 2, true, 0, client,
                {Mode::GATHER}, dtype::Float32(), "nccl")[0];
        auto func0 = graph0->compile({make_callback_copy(y0, host_y0)});
        func0->execute();
    };

    auto run_1 = [&]() { // rank 1
        auto graph1 = ComputingGraph::make();
        auto x1 = opr::Host2DeviceCopy::make(*graph1, host_x1, cn1);
        auto y1 = opr::CollectiveComm::make({x1}, graph1.get(), "gather", 2, false, 1, client,
                {Mode::GATHER}, dtype::Float32(), "nccl")[0];
        auto func1 = graph1->compile({{y1, nullptr}});
        func1->execute();
    };

    auto run_2 = [&]() { // check
        auto graph2 = ComputingGraph::make();
        auto x0 = opr::Host2DeviceCopy::make(*graph2, host_x0, cn0);
        auto x1 = opr::Host2DeviceCopy::make(*graph2, host_x1, cn0);
        auto y_expect = opr::Concat::make({x0, x1}, 0);
        auto func2 = graph2->compile({make_callback_copy(y_expect, host_y_expect)});
        func2->execute();
    };

    std::thread t0(run_0);
    std::thread t1(run_1);
    std::thread t2(run_2);

    t0.join();
    t1.join();
    t2.join();

    MGB_ASSERT_TENSOR_EQ(host_y_expect, host_y0);
}
|
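// GatherWithGrad: the backward pass of GATHER should scatter the output gradient,
// so each rank receives its own slice (host_grad0 on rank 0, host_grad1 on rank 1).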
TEST(TestOprCollectiveComm, GatherWithGrad) {
    REQUIRE_GPU(2);
    auto cn0 = CompNode::load("gpu0");
    auto cn1 = CompNode::load("gpu1");

    HostTensorGenerator<> gen;
    TensorShape shape({28, 28});
    auto host_x0 = gen(shape);
    auto host_x1 = gen(shape);
    auto host_grad0 = gen(shape);
    auto host_grad1 = gen(shape);

    HostTensorND host_y0, host_y0_expect, host_out_grad0, host_out_grad1;

    auto client = std::make_shared<test::MockGroupClient>();

    auto run_0 = [&]() { // rank 0
        auto graph0 = ComputingGraph::make();
        graph0->options().graph_opt_level = 0;

        auto x0 = opr::Host2DeviceCopy::make(*graph0, host_x0, cn0);
        auto y0 = opr::CollectiveComm::make({x0}, graph0.get(), "gather", 2, true, 0, client,
                {Mode::GATHER}, dtype::Float32(), "nccl")[0];
        y0.node()->owner_opr()->node_prop().attribute().priority = -1;

        auto grad0 = opr::Host2DeviceCopy::make(*graph0, host_grad0, cn0);
        auto grad1 = opr::Host2DeviceCopy::make(*graph0, host_grad1, cn0);
        auto grad = opr::Concat::make({grad0, grad1}, 0);
        auto loss = opr::Dot::make(y0, grad);
        auto g = opr::VirtualGrad::make(loss, x0);

        auto func0 = graph0->compile(
                {make_callback_copy(y0, host_y0),
                 make_callback_copy(g, host_out_grad0)});
        func0->execute();
    };

    auto run_1 = [&]() { // rank 1
        auto graph1 = ComputingGraph::make();
        graph1->options().graph_opt_level = 0;

        auto x1 = opr::Host2DeviceCopy::make(*graph1, host_x1, cn1);
        auto y1 = opr::CollectiveComm::make({x1}, graph1.get(), "gather", 2, false, 1, client,
                {Mode::GATHER}, dtype::Float32(), "nccl")[0];
        y1.node()->owner_opr()->node_prop().attribute().priority = -1;

        auto grad = opr::Host2DeviceCopy::make(*graph1, gen({1}), cn1);
        auto loss = opr::Dot::make(y1, grad);
        auto g = opr::VirtualGrad::make(loss, x1);

        auto func1 = graph1->compile({{y1, nullptr}, make_callback_copy(g, host_out_grad1)});
        func1->execute();
    };

    auto run_2 = [&]() { // check
        auto graph2 = ComputingGraph::make();
        auto x0 = opr::Host2DeviceCopy::make(*graph2, host_x0, cn0);
        auto x1 = opr::Host2DeviceCopy::make(*graph2, host_x1, cn0);
        auto y0_expect = opr::Concat::make({x0, x1}, 0);
        auto func2 = graph2->compile({
                make_callback_copy(y0_expect, host_y0_expect)});
        func2->execute();
    };

    std::thread t0(run_0);
    std::thread t1(run_1);
    std::thread t2(run_2);

    t0.join();
    t1.join();
    t2.join();

    MGB_ASSERT_TENSOR_EQ(host_y0_expect, host_y0);
    MGB_ASSERT_TENSOR_EQ(*host_grad0, host_out_grad0);
    MGB_ASSERT_TENSOR_EQ(*host_grad1, host_out_grad1);
}

TEST(TestOprCollectiveComm, Broadcast) {
    REQUIRE_GPU(2);
    auto cn0 = CompNode::load("gpu0");

@@ -863,3 +1021,349 @@ TEST(TestOprCollectiveComm, BroadcastWithGrad) {
    MGB_ASSERT_TENSOR_EQ(*host_x0, host_y1);
    MGB_ASSERT_TENSOR_EQ(host_out_grad_expect, host_out_grad);
}

TEST(TestOprCollectiveComm, Scatter) {
    REQUIRE_GPU(2);
    auto cn0 = CompNode::load("gpu0");
    auto cn1 = CompNode::load("gpu1");

    HostTensorGenerator<> gen;
    auto host_x0 = gen({28, 28});
    auto host_x1 = gen({28, 28});
    HostTensorND host_y0, host_y1;

    auto client = std::make_shared<test::MockGroupClient>();
    auto graph = ComputingGraph::make();

    auto x0 = opr::Host2DeviceCopy::make(*graph, host_x0, cn0);
    auto x1 = opr::Host2DeviceCopy::make(*graph, host_x1, cn0);
    auto x = opr::Concat::make({x0, x1}, 0);
    auto y0 = opr::CollectiveComm::make({x}, graph.get(), "scatter",
            2, true, 0, client, {Mode::SCATTER}, dtype::Float32(), "nccl")[0];
    auto y1 = opr::CollectiveComm::make({}, graph.get(), "scatter", 2, false, 1,
            client, {Mode::SCATTER}, dtype::Float32(), "nccl", {cn1})[0];

    auto func = graph->compile({make_callback_copy(y0, host_y0),
                                make_callback_copy(y1, host_y1)});
    func->execute();

    MGB_ASSERT_TENSOR_EQ(*host_x0, host_y0);
    MGB_ASSERT_TENSOR_EQ(*host_x1, host_y1);
}

TEST(TestOprCollectiveComm, ScatterMultiThread) {
    REQUIRE_GPU(2);
    auto cn0 = CompNode::load("gpu0");
    auto cn1 = CompNode::load("gpu1");

    HostTensorGenerator<> gen;
    auto host_x0 = gen({28, 28});
    auto host_x1 = gen({28, 28});
    HostTensorND host_y0, host_y1;

    auto client = std::make_shared<test::MockGroupClient>();

    auto run_0 = [&]() { // rank 0
        auto graph0 = ComputingGraph::make();
        auto x0 = opr::Host2DeviceCopy::make(*graph0, host_x0, cn0);
        auto x1 = opr::Host2DeviceCopy::make(*graph0, host_x1, cn0);
        auto x = opr::Concat::make({x0, x1}, 0);
        auto y0 = opr::CollectiveComm::make({x}, graph0.get(), "scatter", 2, true, 0, client,
                {Mode::SCATTER}, dtype::Float32(), "nccl")[0];
        auto func0 = graph0->compile({make_callback_copy(y0, host_y0)});
        func0->execute();
    };

    auto run_1 = [&]() { // rank 1
        auto graph1 = ComputingGraph::make();
        auto y1 = opr::CollectiveComm::make({}, graph1.get(), "scatter", 2, false, 1, client,
                {Mode::SCATTER}, dtype::Float32(), "nccl", {cn1})[0];
        auto func1 = graph1->compile({make_callback_copy(y1, host_y1)});
        func1->execute();
    };

    std::thread t0(run_0);
    std::thread t1(run_1);

    t0.join();
    t1.join();

    MGB_ASSERT_TENSOR_EQ(*host_x0, host_y0);
    MGB_ASSERT_TENSOR_EQ(*host_x1, host_y1);
}
|
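// ScatterWithGrad: the gradient of SCATTER is a gather; rank 1 has no autograd path here,
// so it joins the gradient gather explicitly via a CollectiveComm op keyed "scatter:grad",
// and rank 0 should recover the concatenated gradient.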
TEST(TestOprCollectiveComm, ScatterWithGrad) {
    REQUIRE_GPU(2);
    auto cn0 = CompNode::load("gpu0");
    auto cn1 = CompNode::load("gpu1");

    HostTensorGenerator<> gen;
    TensorShape shape({28, 28});
    auto host_x0 = gen(shape);
    auto host_x1 = gen(shape);
    auto host_grad0 = gen(shape);
    auto host_grad1 = gen(shape);

    HostTensorND host_y0, host_y1, host_out_grad, host_out_grad_expect;

    auto client = std::make_shared<test::MockGroupClient>();

    auto run_0 = [&]() { // rank 0
        auto graph0 = ComputingGraph::make();
        graph0->options().graph_opt_level = 0;

        auto x0 = opr::Host2DeviceCopy::make(*graph0, host_x0, cn0);
        auto x1 = opr::Host2DeviceCopy::make(*graph0, host_x1, cn0);
        auto x = opr::Concat::make({x0, x1}, 0);
        auto y0 = opr::CollectiveComm::make({x}, graph0.get(), "scatter", 2, true, 0, client,
                {Mode::SCATTER}, dtype::Float32(), "nccl")[0];
        y0.node()->owner_opr()->node_prop().attribute().priority = -1;

        auto grad0 = opr::Host2DeviceCopy::make(*graph0, host_grad0, cn0);
        auto loss = opr::Dot::make(y0, grad0);
        auto g = opr::VirtualGrad::make(loss, x);

        auto func0 = graph0->compile(
                {make_callback_copy(y0, host_y0),
                 make_callback_copy(g, host_out_grad)});
        func0->execute();
    };

    auto run_1 = [&]() { // rank 1
        auto graph1 = ComputingGraph::make();
        graph1->options().graph_opt_level = 0;

        auto y1 = opr::CollectiveComm::make({}, graph1.get(), "scatter", 2, false, 1, client,
                {Mode::SCATTER}, dtype::Float32(), "nccl", {cn1})[0];

        auto grad1 = opr::Host2DeviceCopy::make(*graph1, host_grad1, cn1);
        auto g = opr::CollectiveComm::make({grad1}, graph1.get(), "scatter:grad", 2, false, 1,
                client, {Mode::GATHER}, dtype::Float32(), "nccl")[0];
        g.node()->owner_opr()->node_prop().attribute().priority = 1;

        auto func1 = graph1->compile({make_callback_copy(y1, host_y1), {g, nullptr}});
        func1->execute();
    };

    auto run_2 = [&]() { // check
        auto graph2 = ComputingGraph::make();
        auto grad0 = opr::Host2DeviceCopy::make(*graph2, host_grad0, cn0);
        auto grad1 = opr::Host2DeviceCopy::make(*graph2, host_grad1, cn0);
        auto out_grad_expect = opr::Concat::make({grad0, grad1}, 0);
        auto func2 = graph2->compile({
                make_callback_copy(out_grad_expect, host_out_grad_expect)});
        func2->execute();
    };

    std::thread t0(run_0);
    std::thread t1(run_1);
    std::thread t2(run_2);

    t0.join();
    t1.join();
    t2.join();

    MGB_ASSERT_TENSOR_EQ(*host_x0, host_y0);
    MGB_ASSERT_TENSOR_EQ(*host_x1, host_y1);
    MGB_ASSERT_TENSOR_EQ(host_out_grad_expect, host_out_grad);
}

TEST(TestOprCollectiveComm, AllToAll) {
    REQUIRE_GPU(2);
    auto cn0 = CompNode::load("gpu0");
    auto cn1 = CompNode::load("gpu1");

    HostTensorGenerator<> gen;
    TensorShape shape({10});
    auto host_x00 = gen(shape);
    auto host_x01 = gen(shape);
    auto host_x10 = gen(shape);
    auto host_x11 = gen(shape);
    HostTensorND host_y0, host_y1, host_expect_y0, host_expect_y1;

    auto client = std::make_shared<test::MockGroupClient>();
    auto graph = ComputingGraph::make();

    auto x00 = opr::Host2DeviceCopy::make(*graph, host_x00, cn0);
    auto x01 = opr::Host2DeviceCopy::make(*graph, host_x01, cn0);
    auto x0 = opr::Concat::make({x00, x01}, 0);
    auto x10 = opr::Host2DeviceCopy::make(*graph, host_x10, cn1);
    auto x11 = opr::Host2DeviceCopy::make(*graph, host_x11, cn1);
    auto x1 = opr::Concat::make({x10, x11}, 0);

    auto x01c = opr::Copy::make(x01, {cn1});
    auto x10c = opr::Copy::make(x10, {cn0});

    auto expect_y0 = opr::Concat::make({x00, x10c}, 0);
    auto expect_y1 = opr::Concat::make({x01c, x11}, 0);

    auto y0 = opr::CollectiveComm::make({x0}, graph.get(), "alltoall",
            2, false, 0, client, {Mode::ALL_TO_ALL}, dtype::Float32(), "nccl")[0];
    auto y1 = opr::CollectiveComm::make({x1}, graph.get(), "alltoall", 2, false, 1,
            client, {Mode::ALL_TO_ALL}, dtype::Float32(), "nccl")[0];

    auto func = graph->compile({make_callback_copy(y0, host_y0),
                                make_callback_copy(y1, host_y1),
                                make_callback_copy(expect_y0, host_expect_y0),
                                make_callback_copy(expect_y1, host_expect_y1)});
    func->execute();

    MGB_ASSERT_TENSOR_EQ(host_expect_y0, host_y0);
    MGB_ASSERT_TENSOR_EQ(host_expect_y1, host_y1);
}

TEST(TestOprCollectiveComm, AllToAllMultiThread) {
    REQUIRE_GPU(2);
    auto cn0 = CompNode::load("gpu0");
    auto cn1 = CompNode::load("gpu1");

    HostTensorGenerator<> gen;
    TensorShape shape({10});
    auto host_x00 = gen(shape);
    auto host_x01 = gen(shape);
    auto host_x10 = gen(shape);
    auto host_x11 = gen(shape);
    HostTensorND host_y0, host_y1, host_expect_y0, host_expect_y1;

    auto client = std::make_shared<test::MockGroupClient>();

    auto run_0 = [&]() { // rank 0
        auto graph0 = ComputingGraph::make();
        auto x00 = opr::Host2DeviceCopy::make(*graph0, host_x00, cn0);
        auto x01 = opr::Host2DeviceCopy::make(*graph0, host_x01, cn0);
        auto x10 = opr::Host2DeviceCopy::make(*graph0, host_x10, cn0);
        auto x0 = opr::Concat::make({x00, x01}, 0);
        auto expect_y0 = opr::Concat::make({x00, x10}, 0);
        auto y0 = opr::CollectiveComm::make({x0}, graph0.get(), "alltoall", 2, false, 0, client,
                {Mode::ALL_TO_ALL}, dtype::Float32(), "nccl")[0];
        auto func0 = graph0->compile(
                {make_callback_copy(y0, host_y0),
                 make_callback_copy(expect_y0, host_expect_y0)});
        func0->execute();
    };

    auto run_1 = [&]() { // rank 1
        auto graph1 = ComputingGraph::make();
        auto x10 = opr::Host2DeviceCopy::make(*graph1, host_x10, cn1);
        auto x11 = opr::Host2DeviceCopy::make(*graph1, host_x11, cn1);
        auto x01 = opr::Host2DeviceCopy::make(*graph1, host_x01, cn1);
        auto x1 = opr::Concat::make({x10, x11}, 0);
        auto expect_y1 = opr::Concat::make({x01, x11}, 0);
        auto y1 = opr::CollectiveComm::make({x1}, graph1.get(), "alltoall", 2, false, 1, client,
                {Mode::ALL_TO_ALL}, dtype::Float32(), "nccl")[0];
        auto func1 = graph1->compile(
                {make_callback_copy(y1, host_y1),
                 make_callback_copy(expect_y1, host_expect_y1)});
        func1->execute();
    };

    std::thread t0(run_0);
    std::thread t1(run_1);

    t0.join();
    t1.join();

    MGB_ASSERT_TENSOR_EQ(host_expect_y0, host_y0);
    MGB_ASSERT_TENSOR_EQ(host_expect_y1, host_y1);
}
|
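// AllToAllWithGrad: the gradient of ALL_TO_ALL is another all-to-all, so each rank should
// receive the gradient slices that correspond to its own input chunks
// (rank 0: {grad00, grad01}, rank 1: {grad10, grad11}).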
TEST(TestOprCollectiveComm, AllToAllWithGrad) {
    REQUIRE_GPU(2);
    auto cn0 = CompNode::load("gpu0");
    auto cn1 = CompNode::load("gpu1");

    HostTensorGenerator<> gen;
    TensorShape shape({10});
    auto host_x00 = gen(shape);
    auto host_x01 = gen(shape);
    auto host_x10 = gen(shape);
    auto host_x11 = gen(shape);
    auto host_grad00 = gen(shape);
    auto host_grad01 = gen(shape);
    auto host_grad10 = gen(shape);
    auto host_grad11 = gen(shape);

    HostTensorND host_y0, host_y1, host_expect_y0, host_expect_y1, host_grad0,
            host_grad1, host_expect_grad0, host_expect_grad1;

    auto client = std::make_shared<test::MockGroupClient>();

    auto run_0 = [&]() { // rank 0
        auto graph0 = ComputingGraph::make();
        graph0->options().graph_opt_level = 0;

        auto x00 = opr::Host2DeviceCopy::make(*graph0, host_x00, cn0);
        auto x01 = opr::Host2DeviceCopy::make(*graph0, host_x01, cn0);
        auto x10 = opr::Host2DeviceCopy::make(*graph0, host_x10, cn0);
        auto x0 = opr::Concat::make({x00, x01}, 0);
        auto expect_y0 = opr::Concat::make({x00, x10}, 0);
        auto y0 = opr::CollectiveComm::make({x0}, graph0.get(), "alltoall", 2, false, 0, client,
                {Mode::ALL_TO_ALL}, dtype::Float32(), "nccl")[0];
        y0.node()->owner_opr()->node_prop().attribute().priority = -1;

        auto grad00 = opr::Host2DeviceCopy::make(*graph0, host_grad00, cn0);
        auto grad10 = opr::Host2DeviceCopy::make(*graph0, host_grad10, cn0);
        auto grad_y0 = opr::Concat::make({grad00, grad10}, 0);
        auto loss = opr::Dot::make(y0, grad_y0);
        auto g = opr::VirtualGrad::make(loss, x0);

        auto func0 = graph0->compile(
                {make_callback_copy(y0, host_y0),
                 make_callback_copy(g, host_grad0),
                 make_callback_copy(expect_y0, host_expect_y0)});
        func0->execute();
    };

    auto run_1 = [&]() { // rank 1
        auto graph1 = ComputingGraph::make();
        graph1->options().graph_opt_level = 0;

        auto x10 = opr::Host2DeviceCopy::make(*graph1, host_x10, cn1);
        auto x11 = opr::Host2DeviceCopy::make(*graph1, host_x11, cn1);
        auto x01 = opr::Host2DeviceCopy::make(*graph1, host_x01, cn1);
        auto x1 = opr::Concat::make({x10, x11}, 0);
        auto expect_y1 = opr::Concat::make({x01, x11}, 0);
        auto y1 = opr::CollectiveComm::make({x1}, graph1.get(), "alltoall", 2, false, 1, client,
                {Mode::ALL_TO_ALL}, dtype::Float32(), "nccl")[0];
        y1.node()->owner_opr()->node_prop().attribute().priority = -1;

        auto grad01 = opr::Host2DeviceCopy::make(*graph1, host_grad01, cn1);
        auto grad11 = opr::Host2DeviceCopy::make(*graph1, host_grad11, cn1);
        auto grad_y1 = opr::Concat::make({grad01, grad11}, 0);
        auto loss = opr::Dot::make(y1, grad_y1);
        auto g = opr::VirtualGrad::make(loss, x1);

        auto func1 = graph1->compile(
                {make_callback_copy(y1, host_y1),
                 make_callback_copy(g, host_grad1),
                 make_callback_copy(expect_y1, host_expect_y1)});
        func1->execute();
    };

    auto run_2 = [&]() { // check
        auto graph2 = ComputingGraph::make();
        auto grad00 = opr::Host2DeviceCopy::make(*graph2, host_grad00, cn0);
        auto grad01 = opr::Host2DeviceCopy::make(*graph2, host_grad01, cn0);
        auto grad10 = opr::Host2DeviceCopy::make(*graph2, host_grad10, cn0);
        auto grad11 = opr::Host2DeviceCopy::make(*graph2, host_grad11, cn0);
        auto out_grad0_expect = opr::Concat::make({grad00, grad01}, 0);
        auto out_grad1_expect = opr::Concat::make({grad10, grad11}, 0);
        auto func2 = graph2->compile({
                make_callback_copy(out_grad0_expect, host_expect_grad0),
                make_callback_copy(out_grad1_expect, host_expect_grad1)});
        func2->execute();
    };

    std::thread t0(run_0);
    std::thread t1(run_1);
    std::thread t2(run_2);

    t0.join();
    t1.join();
    t2.join();

    MGB_ASSERT_TENSOR_EQ(host_expect_y0, host_y0);
    MGB_ASSERT_TENSOR_EQ(host_expect_y1, host_y1);
    MGB_ASSERT_TENSOR_EQ(host_expect_grad0, host_grad0);
    MGB_ASSERT_TENSOR_EQ(host_expect_grad1, host_grad1);
}