# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from typing import Optional, Tuple

from ..core._imperative_rt.core2 import apply
from ..core.autodiff.grad import _grad_manager_dict
from ..core.ops.builtin import CollectiveComm, Copy, PyOpBase, RemoteRecv, RemoteSend
from ..core.tensor.utils import isscalar, setscalar
from ..device import get_default_device
from ..tensor import Tensor
from .group import WORLD, Group, get_backend, get_client, get_mm_server_addr, get_rank

__all__ = [
    "reduce_sum",
    "broadcast",
    "all_gather",
    "reduce_scatter_sum",
    "all_reduce_sum",
    "all_reduce_max",
    "all_reduce_min",
    "gather",
    "scatter",
    "all_to_all",
    "remote_send",
    "remote_recv",
]


def collective_comm(inp, mode, group, device):
    """Helper function for applying collective communication functions."""
    assert isinstance(group, Group)
    if group is None:
        return inp
    addr, port = get_mm_server_addr()
    op = CollectiveComm(
        key=group.key,
        nr_devices=group.size,
        rank=group.rank,
        is_root=(group.rank == 0),
        local_grad=False,
        addr=addr,
        port=port,
        mode=mode,
        dtype=inp.dtype,
        backend=get_backend(),
        comp_node=device,
    )
    (result,) = apply(op, inp)
    # assume all workers have homogeneous shape
    if mode in (
        CollectiveComm.Mode.REDUCE_SUM,
        CollectiveComm.Mode.BROADCAST,
        CollectiveComm.Mode.ALL_REDUCE_SUM,
        CollectiveComm.Mode.ALL_REDUCE_MAX,
        CollectiveComm.Mode.ALL_REDUCE_MIN,
    ):
        if isscalar(inp):
            setscalar(result)
    return result


def reduce_sum(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create reduce_sum operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.REDUCE_SUM
    return collective_comm(inp, mode, group, device)


def broadcast(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create broadcast operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.BROADCAST
    return collective_comm(inp, mode, group, device)


def all_gather(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create all_gather operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.ALL_GATHER
    return collective_comm(inp, mode, group, device)


def reduce_scatter_sum(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create reduce_scatter_sum operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.REDUCE_SCATTER_SUM
    return collective_comm(inp, mode, group, device)


def all_reduce_sum(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create all_reduce_sum operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.ALL_REDUCE_SUM
    return collective_comm(inp, mode, group, device)


def all_reduce_max(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create all_reduce_max operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.ALL_REDUCE_MAX
    return collective_comm(inp, mode, group, device)


def all_reduce_min(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create all_reduce_min operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.ALL_REDUCE_MIN
    return collective_comm(inp, mode, group, device)


def gather(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create gather operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.GATHER
    return collective_comm(inp, mode, group, device)


def scatter(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create scatter operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.SCATTER
    return collective_comm(inp, mode, group, device)


def all_to_all(
    inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
    """
    Create all_to_all operator for collective communication.

    :param inp: input tensor.
    :param group: communication group.
    :param device: execution device.
    """
    mode = CollectiveComm.Mode.ALL_TO_ALL
    return collective_comm(inp, mode, group, device)


class _RemoteSend(PyOpBase):
    def __init__(self, op: RemoteSend):
        self.op = op

    def _default_rule(self, data):
        return apply(self.op, data)

    def _grad_rule(self, data):
        self.dtype = data.dtype
        self.shape = data.shape
        self.device = data.device
        (self.dummy,) = self._default_rule(data)
        return self.dummy, self.backward

    def backward(self, grad):
        assert grad is None
        if get_client().check_is_grad(self.op.key):
            return remote_recv(
                self.op.rank_to,
                self.shape,
                self.dtype,
                device=str(self.device),
                inp=self.dummy,
            )


class _RemoteRecv(PyOpBase):
    def __init__(self, op: RemoteRecv):
        self.op = op

    def _default_rule(self, dummy):
        return apply(self.op, dummy)

    def _grad_rule(self, dummy):
        return self._default_rule(dummy), self.backward

    def backward(self, grad):
        get_client().set_is_grad(self.op.key, grad is not None)
        if grad is not None:
            remote_send(grad, self.op.rank_from)


def remote_send(inp: Tensor, dest_rank: int) -> Tensor:
    """
    Send a Tensor to a remote process.

    :param inp: tensor to send.
    :param dest_rank: destination process rank.
    """
    key = "{}->{}".format(get_rank(), dest_rank)
    grad_keys = {}
    for n, g in _grad_manager_dict.items():
        if g._is_attached_to(inp):
            grad_keys[n] = g
    get_client().set_remote_tracer(key, grad_keys)

    op = RemoteSend()
    op.key = key
    op.addr, op.port = get_mm_server_addr()
    op.rank_to = dest_rank
    op.backend = get_backend()
    (dummy,) = apply(_RemoteSend(op), inp)

    for g in grad_keys.values():
        g._refkeeper.append(dummy)


def remote_recv(
    src_rank: int,
    shape: Tuple[int],
    dtype: type,
    device: Optional[str] = None,
    inp=None,
) -> Tensor:
    """
    Receive a Tensor from a remote process.

    :param src_rank: source process rank.
    :param shape: the shape of the tensor to receive.
    :param dtype: the data type of the tensor to receive.
    :param device: the device to place the received tensor.
    :param inp: dummy input to determine recved tensor type
    """
    key = "{}->{}".format(src_rank, get_rank())

    if device is None:
        device = get_default_device()
    # dummy input
    if inp is None:
        inp = Tensor([0], device=device)
    tracer_set = get_client().check_remote_tracer(key)
    for n in tracer_set:
        g = _grad_manager_dict.get(n)
        if g is not None:
            g.wrt(inp)
            g._refkeeper.append(inp)

    _isscalar = False
    if len(shape) == 0:
        shape = (1,)
        _isscalar = True

    op = RemoteRecv()
    op.key = key
    op.cn = device
    op.shape = shape
    op.dtype = dtype
    op.addr, op.port = get_mm_server_addr()
    op.rank_from = src_rank
    op.backend = get_backend()

    (ret,) = apply(_RemoteRecv(op), inp)
    if _isscalar:
        setscalar(ret)
    return ret