import os
import platform
import weakref

import numpy as np
import pytest

import megengine as mge
import megengine.core.tensor.dtype as dtype
import megengine.distributed as dist
import megengine.functional as F
import megengine.module as M
import megengine.optimizer as optim
from megengine.autodiff import Function, GradManager
from megengine.jit import trace


def test_basic():
    x = mge.tensor([1.0, 3.0, 5.0]).reshape(1, 3)
    w = mge.tensor([2.0, 4.0, 6.0]).reshape(3, 1)
    b = mge.tensor(-1.0)

    gm = GradManager().attach([w, b])
    gm.record()

    p = F.matmul(x, w)
    y = p + b
    gm.backward(y)
    gm.release()  # not strictly necessary; shown for completeness
    np.testing.assert_equal(w.grad.numpy(), [[1], [3], [5]])
    np.testing.assert_equal(b.grad.numpy(), [1])

    w.grad = None
    b.grad = None
    with gm:
        p = F.matmul(x, w)
        y = p + b
        gm.backward(y)
    np.testing.assert_equal(w.grad.numpy(), [[1], [3], [5]])
    np.testing.assert_equal(b.grad.numpy(), [1])


def test_dy():
    x = mge.tensor([1.0, 3.0, 5.0]).reshape(1, 3)
    w = mge.tensor([2.0, 4.0, 6.0]).reshape(3, 1)
    b = mge.tensor(-1.0)

    gm = GradManager().attach([w, b])

    def get_grad(grad, dy, idx):
        if isinstance(dy, (list, tuple)):
            return np.array(grad) * dy[idx]
        else:
            return np.array(grad) * dy

    # dy's shape should be the same as y's
    dy = mge.tensor(2.5).reshape(1, 1)
    w.grad = None
    b.grad = None
    with gm:
        p = F.matmul(x, w)
        y = p + b
        gm.backward(y, dy=dy)

    np.testing.assert_equal(w.grad.numpy(), [[1], [3], [5]] * dy.numpy())
    np.testing.assert_equal(b.grad.numpy(), [1] * dy.numpy())


def test_attach_in_with_block():
    a = mge.Parameter([1.0])
    gm = GradManager()
    with gm:
        b = a * 3
        gm.attach(b)
        c = b + 1
        gm.backward(c)
    assert int(b.grad.numpy()) == 1


def test_attach_temporary():
    w = mge.Parameter(2.0)
    gm = GradManager()
    gm.attach(w)

    def cb(x, g):
        assert x is ref()
        cb.called = True

    for i in range(3):
        with gm:
            cb.called = False
            x = mge.Tensor(i, dtype="float32")
            gm.attach(x, callbacks=cb)
            ref = weakref.ref(x)
            y = x * w
            gm.backward(y)
            assert cb.called
        del x
        assert ref() is None

    # NOTE: does not guarantee timely release when recording
    # for i in range(3):
    #     with gm:
    #         x = mge.Tensor(i, dtype='float32')
    #         gm.attach(x)
    #         ref = weakref.ref(x)
    #         y = x * w
    #         del x
    #         assert ref() is None
    #         gm.backward(y)


def test_attached_tensors():
    w1 = mge.Parameter(2.0)
    w2 = mge.Parameter(2.0)
    gm = GradManager()

    def check(expected):
        actual = gm.attached_tensors()
        assert len(expected) == len(actual)
        for exp, act in zip(expected, actual):
            assert exp is act

    gm.attach(w1)
    check([w1])
    gm.attach(w2)
    check([w1, w2])
    gm.attach(w1)
    check([w1, w2])


def test_no_dependency():
    x = mge.tensor(3)

    w = mge.Parameter(1.0)
    w_no_dep = mge.Parameter(1.0)
    gm = GradManager()
    gm.attach(w)
    gm.attach(w_no_dep)

    with gm:
        out1 = x * w
        out2 = w_no_dep * out1
        gm.backward(out1.sum())

    assert w.grad is not None
    assert w_no_dep.grad is None


def test_regression_1762():
    x = F.ones((10, 10, 3, 3))

    conv = M.Conv2d(10, 10, kernel_size=3, padding=1)

    t_shape = (1, 10, 1, 1)
    weight = mge.Parameter(np.ones(t_shape, dtype=np.float32))
    bias = mge.Parameter(np.zeros(t_shape, dtype=np.float32))

    gm = GradManager()
    gm.attach(list(conv.parameters()) + [weight, bias])

    with gm:
        out1 = conv(x)
        out2 = F.batch_norm(out1, None, None, weight, bias, training=True)

        # The error this regression test guards against only occurred when this
        # op was placed after batch norm; the op type itself is not relevant.
        loss = out1 + 1
        gm.backward(loss)


def test_empty_grad_in_backward():
    x = mge.Parameter(F.full(100, 0.5))
    y = mge.Parameter(F.ones(100))

    gm = GradManager()
    gm.attach([x, y])

    with gm:
        z = F.where(x > 0.7, x, y)
        loss = z.sum()
        gm.backward(loss)
        assert np.all(x.grad.numpy() == 0)
        assert np.all(y.grad.numpy() == 1)
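

# A minimal sketch, not part of the original suite: it illustrates why several
# tests above reset ``w.grad = None`` between runs.  Assuming gradients
# accumulate into ``.grad`` across recording sessions (as the repeated resets
# in test_basic suggest), two identical backward passes double the gradient.
def test_grad_accumulation_sketch():
    w = mge.Parameter(2.0)
    gm = GradManager().attach([w])
    for _ in range(2):
        with gm:
            y = w * 3.0
            gm.backward(y)
    # d(3w)/dw == 3, accumulated over two recording sessions
    np.testing.assert_almost_equal(w.grad.numpy(), 6.0, decimal=5)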


@pytest.mark.require_ngpu(2)
@pytest.mark.isolated_distributed
@pytest.mark.parametrize(
    "trace_mode", [True, False, None], ids=["symbolic", "trace", "no_trace"]
)
def test_remote_grad(trace_mode):
    @dist.launcher
    def worker():
        rank = dist.get_rank()
        size = dist.get_world_size()
        x = mge.tensor(np.random.randn(1, rank * 2 + 2), dtype=np.float32)
        m = M.Linear(rank * 2 + 2, rank * 2 + 4)
        gm = GradManager().attach(m.parameters())
        opt = optim.SGD(m.parameters(), 1e-3, momentum=0.9)

        def train_func(x):
            with gm:
                if rank != 0:
                    x = dist.functional.remote_recv(rank - 1)
                y = m(x)
                if rank != size - 1:
                    x = dist.functional.remote_send(y, dest_rank=rank + 1)
                    gm.backward()
                else:
                    y = y.mean()
                    gm.backward(y)
                opt.step().clear_grad()

        if trace_mode is not None:
            train_func = trace(symbolic=trace_mode)(train_func)

        for i in range(1):
            train_func(x)

    worker()


@pytest.mark.require_ngpu(3)
@pytest.mark.isolated_distributed
@pytest.mark.parametrize(
    "trace_mode", [True, False, None], ids=["symbolic", "trace", "no_trace"]
)
def test_gather_grad(trace_mode):
    @dist.launcher(n_gpus=3)
    def worker():
        m = M.Linear(10, 10)
        x = F.ones([3, 10], dtype="float32")

        def func():
            with GradManager().attach(m.parameters()) as gm:
                y = m(x)
                y = F.distributed.gather(y)
                if dist.get_rank() == 0:
                    loss = (2 * y + 1).mean()
                    gm.backward(loss)
                else:
                    gm.backward()

        if trace_mode is not None:
            func = trace(symbolic=trace_mode)(func)
        func()

    worker()


@pytest.mark.require_ngpu(3)
@pytest.mark.isolated_distributed
@pytest.mark.parametrize(
    "trace_mode", [True, False, None], ids=["symbolic", "trace", "no_trace"]
)
def test_scatter_grad(trace_mode):
    @dist.launcher(n_gpus=3)
    def worker():
        x = F.ones([3, 10], dtype="float32")
        m = M.Linear(10, 10)

        def func():
            with GradManager().attach(m.parameters()) as gm:
                if dist.get_rank() == 0:
                    y = m(x)
                else:
                    y = x
                y = F.distributed.scatter(y)
                gm.backward(y)

        if trace_mode is not None:
            func = trace(symbolic=trace_mode)(func)
        func()

    worker()


@pytest.mark.require_ngpu(3)
@pytest.mark.isolated_distributed
@pytest.mark.parametrize(
    "trace_mode", [True, False, None], ids=["symbolic", "trace", "no_trace"]
)
def test_reduce_grad(trace_mode):
    @dist.launcher(n_gpus=3)
    def worker():
        m = M.Linear(10, 10)
        x = F.ones([3, 10], dtype="float32")

        def func():
            with GradManager().attach(m.parameters()) as gm:
                y = m(x)
                y = F.distributed.reduce_sum(y)
                if dist.get_rank() == 0:
                    loss = (2 * y + 1).mean()
                    gm.backward(loss)
                else:
                    gm.backward()

        if trace_mode is not None:
            func = trace(symbolic=trace_mode)(func)
        func()

    worker()


@pytest.mark.require_ngpu(3)
@pytest.mark.isolated_distributed
@pytest.mark.parametrize(
    "trace_mode", [True, False, None], ids=["symbolic", "trace", "no_trace"]
)
def test_broadcast_grad(trace_mode):
    @dist.launcher(n_gpus=3)
    def worker():
        x = F.ones([3, 10], dtype="float32")
        m = M.Linear(10, 10)

        def func():
            with GradManager().attach(m.parameters()) as gm:
                if dist.get_rank() == 0:
                    y = m(x)
                else:
                    y = x
                y = F.distributed.broadcast(y)
                gm.backward(y)

        if trace_mode is not None:
            func = trace(symbolic=trace_mode)(func)
        func()

    worker()


def test_2nd_grad_with_manager():
    x_np = np.random.rand(10).astype("float32")
    x = mge.tensor(x_np)

    gm = GradManager().attach([x])
    gm2 = GradManager().attach([x])

    with gm:
        with gm2:
            y = F.cos(x)
            gm2.backward(y)
            np.testing.assert_almost_equal(x.grad.numpy(), -np.sin(x_np), decimal=5)
        gm.backward(x.grad)
    np.testing.assert_almost_equal(
        x.grad.numpy(), -np.sin(x_np) - np.cos(x_np), decimal=5
    )
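

# A minimal sketch, not part of the original suite: the nested-GradManager
# pattern from test_2nd_grad_with_manager above can also read off a *pure*
# second derivative if the first-order gradient is cleared before the outer
# backward.  This assumes, as the tests above do, that the outer manager
# records the operations performed by the inner backward pass and that
# gradients accumulate into ``.grad``.  d^2/dx^2 cos(x) == -cos(x).
def test_pure_2nd_derivative_sketch():
    x_np = np.random.rand(10).astype("float32")
    x = mge.tensor(x_np)

    gm = GradManager().attach([x])
    gm2 = GradManager().attach([x])

    with gm:
        with gm2:
            y = F.cos(x)
            gm2.backward(y)  # x.grad is now -sin(x)
        first_order = x.grad
        x.grad = None  # drop the first-order part so only d2y/dx2 remains
        gm.backward(first_order)  # differentiate -sin(x) once more
    np.testing.assert_almost_equal(x.grad.numpy(), -np.cos(x_np), decimal=5)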


def test_grad_manager_group():
    x_np = np.random.rand(10).astype("float32")
    x = mge.tensor(x_np)

    gm = GradManager().attach([x])
    gm2 = GradManager().attach([x])

    with gm | gm2:
        y = F.cos(x)
        gm.backward(y)
        gm2.backward(y)
    np.testing.assert_almost_equal(x.grad.numpy(), -2 * np.sin(x_np), decimal=5)

    x.grad = None


def test_grad_manager_group_visibility():
    x_np = np.random.rand(10).astype("float32")
    x = mge.tensor(x_np)

    gm = GradManager().attach([x])
    gm2 = GradManager().attach([x])

    with gm | gm2:
        y = F.cos(x)
        gm2.backward(y)
        np.testing.assert_almost_equal(x.grad.numpy(), -np.sin(x_np), decimal=5)
        gm.backward(x.grad)
        np.testing.assert_almost_equal(x.grad.numpy(), -np.sin(x_np), decimal=5)


def test_grad_manager_visibility_by_order():
    x_np = np.random.rand(10).astype("float32")
    x = mge.tensor(x_np)

    gm = GradManager().attach([x])
    gm2 = GradManager().attach([x])

    with gm2:
        with gm:
            y = F.cos(x)
            gm2.backward(y)
            np.testing.assert_almost_equal(x.grad.numpy(), -np.sin(x_np), decimal=5)
            gm.backward(x.grad)
        np.testing.assert_almost_equal(x.grad.numpy(), -np.sin(x_np), decimal=5)


@pytest.mark.parametrize("target", [F.cos, F.sin, lambda x: x * 2 + 1])
def test_emulate_forward_mode_with_reverse_mode(target):
    def jvp(inp, expr):
        with GradManager() as gm:
            with GradManager().attach([inp]) as gm2:
                oup = expr(inp)
                oup_grad = F.zeros_like(oup)
                gm.attach(oup_grad)
                gm2.backward(oup, oup_grad)
            gm.backward(inp.grad)
        return oup, oup_grad.grad

    def fake_jvp(inp, expr):
        delta = 0.001
        return expr(inp), (expr(inp + delta) - expr(inp - delta)) / (2 * delta)

    x_np = np.random.rand(10).astype("float32")
    x = mge.tensor(x_np)
    y, dy = jvp(x, target)
    y1, dy1 = fake_jvp(x, target)

    np.testing.assert_almost_equal(y.numpy(), y1.numpy(), decimal=5)
    np.testing.assert_almost_equal(dy.numpy(), dy1.numpy(), decimal=3)


def test_2nd_grad_with_custom_gradient():
    class MySin(Function):
        def forward(self, x):
            self.inp = x
            x = mge.Tensor(x.numpy())
            y = F.sin(x)
            return y

        def backward(self, dy):
            dx = F.cos(self.inp) * dy
            return dx

    class MyCos(Function):
        def forward(self, x):
            self.inp = x
            x = mge.Tensor(x.numpy())
            y = F.cos(x)
            return y

        def backward(self, dy):
            if dy is None:
                return None
            dx = -MySin()(self.inp) * dy
            return dx

    x_np = np.random.rand(10).astype("float32")
    x = mge.tensor(x_np)

    gm = GradManager().attach([x])
    gm2 = GradManager().attach([x])

    with gm:
        with gm2:
            y = MyCos()(x)
            gm2.backward(y)
            np.testing.assert_almost_equal(x.grad.numpy(), -np.sin(x_np), decimal=5)
        gm.backward(x.grad)
    np.testing.assert_almost_equal(
        x.grad.numpy(), -np.sin(x_np) - np.cos(x_np), decimal=5
    )


@pytest.mark.parametrize("invalid_dtype", [np.uint8, np.int8, np.int32])
def test_attach_invalid_tensor_dtype(invalid_dtype):
    gm = GradManager()
    x = mge.tensor([1], dtype=invalid_dtype)
    with pytest.raises(AssertionError):
        gm.attach([x])


@pytest.mark.parametrize("differentiable_dtype", [np.float32, np.float16])
def test_attach_differentiable_tensor_dtype(differentiable_dtype):
    gm = GradManager()
    x = mge.tensor([1], dtype=differentiable_dtype)
    gm.attach([x])
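

# A minimal sketch, not part of the original suite: ``attach`` also accepts a
# ``callbacks=`` argument (exercised in test_attach_temporary above).  Assuming
# the callback is invoked during backward with ``(attached_tensor, grad)``, it
# can be used to observe gradients as they are produced; returning the gradient
# unchanged avoids relying on whether the return value replaces it.
def test_attach_callback_observation_sketch():
    seen = {}

    def record_cb(param, grad):
        # stash the raw gradient for inspection; do not transform it
        seen["grad"] = grad.numpy()
        return grad

    w = mge.Parameter(2.0)
    gm = GradManager()
    gm.attach(w, callbacks=record_cb)
    with gm:
        y = w * 4.0
        gm.backward(y)
    # d(4w)/dw == 4 should have been observed by the callback
    np.testing.assert_almost_equal(seen["grad"], 4.0, decimal=5)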