# -*- coding: utf-8 -*-
import platform

import numpy as np
import pytest

import megengine as mge
import megengine.distributed as dist
from megengine import Parameter, tensor
from megengine.core._imperative_rt.core2 import sync
from megengine.device import get_default_device, set_default_device
from megengine.functional.distributed import (
    all_gather,
    all_reduce_max,
    all_reduce_min,
    all_reduce_sum,
    all_to_all,
    broadcast,
    gather,
    reduce_scatter_sum,
    reduce_sum,
    remote_recv,
    remote_send,
    scatter,
)


def run_reduce_sum(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = reduce_sum(inp)
        if rank == 0:
            assert np.allclose(output.numpy(), expect[rank])
        else:
            assert output is None

    x = np.random.random_sample(shape).astype(dtype)
    y = np.random.random_sample(shape).astype(dtype)
    z = x + y
    data = (x, y)
    expect = (z, None)
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(), (1,), (2, 3), (8, 10), (99, 77)], ids=str)
@pytest.mark.isolated_distributed
def test_reduce_sum_multishape(shape):
    run_reduce_sum(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_reduce_sum_multidtype(dtype):
    run_reduce_sum((8, 10), dtype)


def run_broadcast(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = broadcast(inp)
        assert np.allclose(output.numpy(), expect[rank])

    x = np.random.random_sample(shape).astype(dtype)
    y = x + 1
    data = (x, y)
    expect = (x, x)
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(), (1,), (2, 3), (8, 10), (99, 77)], ids=str)
@pytest.mark.isolated_distributed
def test_broadcast_multishape(shape):
    run_broadcast(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_broadcast_multidtype(dtype):
    run_broadcast((8, 10), dtype)


def run_all_gather(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = all_gather(inp)
        assert np.allclose(output.numpy(), expect[rank])

    x = np.random.random_sample(shape).astype(dtype)
    y = np.random.random_sample(shape).astype(dtype)
    z = np.concatenate((x, y))
    data = (x, y)
    expect = (z, z)
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(1,), (2, 3), (8, 10), (99, 77)], ids=str)
@pytest.mark.isolated_distributed
def test_all_gather_multishape(shape):
    run_all_gather(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_all_gather_multidtype(dtype):
    run_all_gather((8, 10), dtype)


def run_reduce_scatter_sum(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = reduce_scatter_sum(inp)
        assert np.allclose(output.numpy(), expect[rank])

    x = np.random.random_sample(shape).astype(dtype)
    y = np.random.random_sample(shape).astype(dtype)
    z = x + y
    data = (x, y)
    expect = (z[: shape[0] // 2], z[shape[0] // 2 :])
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(2, 3), (8, 10), (88, 44)], ids=str)
@pytest.mark.isolated_distributed
def test_reduce_scatter_sum_multishape(shape):
    run_reduce_scatter_sum(shape, "float32")
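

# Illustrative NumPy sketch (an added helper, not part of the original test
# suite; the underscore prefix keeps pytest from collecting it): for two
# ranks, reduce_scatter_sum sums the per-rank tensors elementwise and then
# hands each rank one half of the result along axis 0, which is how
# `expect` is built in run_reduce_scatter_sum above.
def _reference_reduce_scatter_sum(x, y):
    z = x + y  # elementwise reduction across the two ranks
    half = z.shape[0] // 2
    return z[:half], z[half:]  # (slice for rank 0, slice for rank 1)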


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_reduce_scatter_sum_multidtype(dtype):
    run_reduce_scatter_sum((8, 10), dtype)


def run_all_reduce_sum(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = all_reduce_sum(inp)
        assert np.allclose(output.numpy(), expect[rank])

    x = np.random.random_sample(shape).astype(dtype)
    y = np.random.random_sample(shape).astype(dtype)
    z = x + y
    data = (x, y)
    expect = (z, z)
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(), (1,), (2, 3), (8, 10), (99, 77)], ids=str)
@pytest.mark.isolated_distributed
def test_all_reduce_sum_multishape(shape):
    run_all_reduce_sum(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_all_reduce_sum_multidtype(dtype):
    run_all_reduce_sum((8, 10), dtype)


def run_all_reduce_max(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = all_reduce_max(inp)
        assert np.allclose(output.numpy(), expect[rank])

    x = np.random.random_sample(shape).astype(dtype)
    y = np.random.random_sample(shape).astype(dtype)
    z = np.maximum(x, y)
    data = (x, y)
    expect = (z, z)
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(), (1,), (2, 3), (8, 10), (99, 77)], ids=str)
@pytest.mark.isolated_distributed
def test_all_reduce_max_multishape(shape):
    run_all_reduce_max(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_all_reduce_max_multidtype(dtype):
    run_all_reduce_max((8, 10), dtype)


def run_all_reduce_min(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = all_reduce_min(inp)
        assert np.allclose(output.numpy(), expect[rank])

    x = np.random.random_sample(shape).astype(dtype)
    y = np.random.random_sample(shape).astype(dtype)
    z = np.minimum(x, y)
    data = (x, y)
    expect = (z, z)
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(), (1,), (2, 3), (8, 10), (99, 77)], ids=str)
@pytest.mark.isolated_distributed
def test_all_reduce_min_multishape(shape):
    run_all_reduce_min(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_all_reduce_min_multidtype(dtype):
    run_all_reduce_min((8, 10), dtype)


def run_gather(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = gather(inp)
        if rank == 0:
            assert np.allclose(output.numpy(), expect[rank])
        else:
            assert output is None

    x = np.random.random_sample(shape).astype(dtype)
    y = np.random.random_sample(shape).astype(dtype)
    z = np.concatenate((x, y))
    data = (x, y)
    expect = (z, None)
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(2, 3), (8, 10), (99, 77)], ids=str)
@pytest.mark.isolated_distributed
def test_gather_multishape(shape):
    run_gather(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_gather_multidtype(dtype):
    run_gather((8, 10), dtype)
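

# Illustrative NumPy sketch (an added helper, not part of the original test
# suite): the all_reduce_* collectives exercised above reduce elementwise
# across ranks and return the same tensor on every rank. The `mode`
# parameter exists only in this sketch.
def _reference_all_reduce(x, y, mode="sum"):
    if mode == "sum":
        return x + y
    if mode == "max":
        return np.maximum(x, y)
    return np.minimum(x, y)  # mode == "min"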


def run_scatter(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = scatter(inp)
        assert np.allclose(output.numpy(), expect[rank])

    x = np.random.random_sample(shape).astype(dtype)
    y = x + 1
    data = (x, y)
    expect = (x[: shape[0] // 2], x[shape[0] // 2 :])
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(2, 3), (8, 10), (100, 77)], ids=str)
@pytest.mark.isolated_distributed
def test_scatter_multishape(shape):
    run_scatter(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_scatter_multidtype(dtype):
    run_scatter((8, 10), dtype)


def run_all_to_all(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(data, expect):
        rank = dist.get_rank()
        inp = tensor(data[rank])
        output = all_to_all(inp)
        assert np.allclose(output.numpy(), expect[rank])

    x = np.random.random_sample(shape).astype(dtype)
    y = np.random.random_sample(shape).astype(dtype)
    a = np.concatenate((x[: shape[0] // 2], y[: shape[0] // 2]))
    b = np.concatenate((x[shape[0] // 2 :], y[shape[0] // 2 :]))
    data = (x, y)
    expect = (a, b)
    worker(data, expect)


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("shape", [(2, 3), (8, 10), (100, 77)], ids=str)
@pytest.mark.isolated_distributed
def test_all_to_all_multishape(shape):
    run_all_to_all(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
@pytest.mark.isolated_distributed
def test_all_to_all_multidtype(dtype):
    run_all_to_all((8, 10), dtype)


def run_io_remote(shape, dtype):
    @dist.launcher(n_gpus=2)
    def worker(val, shape):
        rank = dist.get_rank()
        if rank == 0:  # remote send
            x = tensor(val, device="xpu0")
            remote_send(x, 1)
            sync()
        else:  # remote recv
            y = remote_recv(0)
            assert y.device == get_default_device()
            np.testing.assert_almost_equal(val, y.numpy())

    val = np.random.random_sample(shape).astype(dtype)
    worker(val, shape)


@pytest.mark.require_ngpu(2)
@pytest.mark.isolated_distributed
@pytest.mark.parametrize("shape", [(), (1,), (4, 5)], ids=str)
def test_io_remote_multishape(shape):
    run_io_remote(shape, "float32")


@pytest.mark.require_ngpu(2)
@pytest.mark.isolated_distributed
@pytest.mark.parametrize("dtype", ["float32", "int32", "int8", "uint8"], ids=str)
def test_io_remote_multidtype(dtype):
    run_io_remote((8, 10), dtype)


@pytest.mark.require_ngpu(2)
def test_cuda_init_before_fork():
    # Creating a CUDA tensor here initializes the CUDA context in the
    # parent process, so the launcher's fork below is expected to be
    # rejected with an AssertionError before any worker body runs.
    a = mge.tensor(1, device="gpu0")

    @dist.launcher(n_gpus=2)
    def worker():
        a += 1
        b = mge.tensor(2)

    with pytest.raises(AssertionError):
        worker()
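

# Illustrative NumPy sketch (an added helper, not part of the original test
# suite): in the two-rank all_to_all exercised above, each rank splits its
# tensor along axis 0 and exchanges halves, so rank 0 receives both first
# halves and rank 1 both second halves.
def _reference_all_to_all(x, y):
    half = x.shape[0] // 2
    return (
        np.concatenate((x[:half], y[:half])),  # tensor seen by rank 0
        np.concatenate((x[half:], y[half:])),  # tensor seen by rank 1
    )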