# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import numpy as np

from ..core.tensor.utils import make_shape_tuple
from ..tensor import Tensor
from .elemwise import abs, eq, exp, log, maximum, pow, relu
from .nn import assert_equal, indexing_one_hot
from .tensor import where
from .utils import zero_grad


def l1_loss(pred: Tensor, label: Tensor) -> Tensor:
    r"""
    Calculates the mean absolute error (MAE) between
    each element in the pred :math:`x` and label :math:`y`.

    The mean absolute error can be described as:

    .. math:: \ell(x,y) = mean\left(L \right)

    where

    .. math::

        L = \{l_1,\dots,l_N\}, \quad
        l_n = \left| x_n - y_n \right|,

    :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
    of :math:`N` elements each. :math:`N` is the batch size.

    :param pred: The predicted result from model.
    :param label: The ground truth to compare.

    Examples:

    .. testcode::

        import numpy as np
        import megengine as mge
        import megengine.functional as F
        ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32))
        tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32))
        loss = F.l1_loss(ipt,tgt)
        print(loss.numpy())

    Outputs:

    .. testoutput::

        [2.75]

    """
    diff = pred - label
    return abs(diff).mean()


def square_loss(pred: Tensor, label: Tensor) -> Tensor:
    r"""
    Calculates the mean squared error (squared L2 norm) between
    each element in the pred :math:`x` and label :math:`y`.

    The mean squared error can be described as:

    .. math:: \ell(x, y) = mean\left( L \right)

    where

    .. math::

        L = \{l_1,\dots,l_N\}, \quad
        l_n = \left( x_n - y_n \right)^2,

    :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
    of :math:`N` elements each. :math:`N` is the batch size.

    :param pred: The predicted result from model.
    :param label: The ground truth to compare.

    Shape:
        - pred: :math:`(N, *)` where :math:`*` means any number of additional
          dimensions
        - label: :math:`(N, *)`. Same shape as ``pred``

    """
    diff = pred - label
    return (diff ** 2).mean()


def cross_entropy(
    inp: Tensor, target: Tensor, axis: int = 1, ignore_index: int = -1
) -> Tensor:
    r"""
    Returns the cross entropy loss in a classification problem.

    .. math:: \textrm{CrossEntropy}(x, y) = - \sum_{i} y_i\log(x_i)

    :param inp: The input tensor representing the predicted probability.
    :param label: The input tensor representing the classification label.
    :param axis: An axis along which cross_entropy will be applied. Default: 1
    :param ignore_index: Specifies a target value that is ignored and does not contribute to the input gradient. Default: -1

    Examples:

    .. testcode::

        import numpy as np
        from megengine import tensor
        import megengine.functional as F

        data_shape = (1, 2)
        label_shape = (1, )

        pred = tensor(np.array([0.5, 0.5], dtype=np.float32).reshape(data_shape))
        label = tensor(np.ones(label_shape, dtype=np.int32))
        loss = F.cross_entropy(pred, label)
        print(loss.numpy())

    Outputs:

    .. testoutput::

        [0.69]

    """
    raise NotImplementedError
    # n0 = inp.ndim
    # n1 = target.ndim
    # assert n0 == n1 + 1, (
    #     "target ndim must be one less than input ndim; input_ndim={} "
    #     "target_ndim={}".format(n0, n1)
    # )

    # if ignore_index != -1:
    #     mask = 1 - equal(target, ignore_index)
    #     target = target * mask
    #     loss = -log(indexing_one_hot(inp, target, axis)) * mask
    #     return loss.sum() / maximum(mask.sum(), 1.0)
    # else:
    #     return -log(indexing_one_hot(inp, target, axis)).mean()


def cross_entropy_with_softmax(
    pred: Tensor, label: Tensor, axis: int = 1, label_smooth: float = 0
) -> Tensor:
    r"""
    Returns loss after applying :func:`~.softmax` + :func:`~.cross_entropy`.

    It has better numerical stability compared with sequential calls to :func:`~.softmax` and :func:`~.cross_entropy`.

    When using label smoothing, the label distribution is as follows:

    .. math:: y^{LS}_{k}=y_{k}\left(1-\alpha\right)+\alpha/K

    where :math:`y^{LS}` and :math:`y` are new label distribution and origin label distribution respectively.
    k is the index of label distribution. :math:`\alpha` is label_smooth and :math:`K` is the number of classes.

    :param pred: The input tensor representing the predicted probability.
    :param label: The input tensor representing the classification label.
    :param axis: An axis along which softmax will be applied. Default: 1.
    :param label_smooth: A label smoothing of parameter that can re-distribute target distribution. Default: 0.
    """
    n0 = pred.ndim
    n1 = label.ndim
    assert n0 == n1 + 1, (
        "target ndim must be one less than input ndim; input_ndim={} "
        "target_ndim={}".format(n0, n1)
    )

    num_classes = pred.shape[axis]

    # Denominator of the softmax
    offset = pred.max(axis=axis).detach()
    pred = pred - offset
    down = exp(pred).sum(axis=axis)

    up = indexing_one_hot(pred, label, axis)

    if label_smooth != 0:
        factor = label_smooth / num_classes
        up = up * (1 - label_smooth) + pred.sum(axis=axis) * factor

    return (log(down) - up).mean()


def triplet_margin_loss(
    anchor: Tensor, positive: Tensor, negative: Tensor, margin: float = 1.0, p: int = 2
) -> Tensor:
    r"""
    Creates a criterion that measures the triplet loss given an input tensors.

    .. math::

        L(a, p, n) = max\left\{d\left(a_{i},p_{i}\right)-d\left(a_{i}, n_{i}\right)+margin, 0\right\},\
        d\left(x_{i},y_{i}\right)=\left\|x_{i}-y_{i}\right\|_{p}

    :param anchor: The input tensor representing the anchor samples.
    :param positive: The input tensor representing the positive samples.
    :param negative: The input tensor representing the negative samples.
    :param margin: Default: 1.0
    :param p: The norm degree for pairwise distance. Default: 2.0
    """
    s0 = anchor.shapeof()
    s1 = positive.shapeof()
    s2 = negative.shapeof()
    assert_equal(s0, s1)
    assert_equal(s1, s2)

    n0 = anchor.ndim
    n1 = positive.ndim
    n2 = negative.ndim
    assert n0 == 2 and n1 == 2 and n2 == 2, (
        "anchor ndim, positive ndim, and negative ndim must be 2; "
        "anchor_ndim={} positive_ndim={} negative_ndim={}".format(n0, n1, n2)
    )
    assert p > 0, "a margin with a value greater than 0; p={}".format(p)

    diff0 = abs(anchor - positive)
    diff1 = abs(anchor - negative)

    d1 = power(power(diff0, p).sum(axis=1, keepdims=True), 1 / p)
    d2 = power(power(diff1, p).sum(axis=1, keepdims=True), 1 / p)

    loss = maximum(d1 - d2 + margin, 0)

    return loss.mean()


def binary_cross_entropy(pred: Tensor, label: Tensor) -> Tensor:
    r"""Function that measures the Binary Cross Entropy between the target and the prediction.

    :param pred: (N,*) where * means, any number of additional dimensions.
    :param label: (N,*), same shape as the input.

    """
    assert make_shape_tuple(pred.shape) == make_shape_tuple(label.shape)

    return -1.0 * (label * log(pred) + (1.0 - label) * log(1 - pred)).mean()


def nll_loss(
    pred: Tensor, label: Tensor, axis: int = 1, ignore_index: int = -1
) -> Tensor:
    r"""
    The negative log likelihood loss.

    :param pred: The predicted result from model.
    :param label: The ground truth to compare.

    Examples:

    .. testcode::

        import numpy as np
        from megengine import tensor
        import megengine.functional as F
        data_shape = (2, 2)
        label_shape = (2, )

        data = tensor(
            np.array([[1, 0.5], [0.3, 1.2]], dtype=np.float32).reshape(data_shape),
        )
        label = tensor(
            np.ones(label_shape, dtype=np.int32)
        )
        pred = F.log(F.softmax(data))
        loss1 = F.nll_loss(pred, label)
        loss2 = F.cross_entropy_with_softmax(data, label)
        print(loss1.numpy(), loss2.numpy())

    Outputs:

    .. testoutput::

        [0.6576154] [0.6576154]

    """
    raise NotImplementedError
    # n0 = pred.ndim
    # n1 = label.ndim
    # assert n0 == n1 + 1, (
    #     "target ndim must be one less than input ndim; input_ndim={} "
    #     "target_ndim={}".format(n0, n1)
    # )

    # mask = 1.0 - equal(label, ignore_index)
    # label = label * mask

    # loss = indexing_one_hot(pred, label, axis) * mask

    # return -1.0 * loss.sum() / maximum(mask.sum(), 1.0)


def hinge_loss(pred: Tensor, label: Tensor, norm: str = "L1") -> Tensor:
    r"""
    Caculate the hinge loss which is often used in SVMs.

    The hinge loss can be described as:

    .. math:: loss(x, y) = \frac{1}{N}\sum_i\sum_j(max(0, 1 - x_i_j*y_i_j))

    :param pred: The input tensor representing the predicted probability, shape is (N, C).
    :param label: The input tensor representing the binary classification label, shape is (N, C).
    :param norm: Specify the norm to caculate the loss, should be "L1" or "L2".

    Examples:

    .. testcode::

        from megengine import tensor
        import megengine.functional as F

        pred = tensor([[0.5, -0.5, 0.1], [-0.6, 0.7, 0.8]], dtype="float32")
        label = tensor([[1, -1, -1], [-1, 1, 1]], dtype="float32")

        loss = F.hinge_loss(pred, label)

        print(loss.numpy())

    Outputs:

    .. testoutput::

        [1.5]

    """
    assert norm in ["L1", "L2"], "norm must be L1 or L2"
    # Converts binary labels to -1/1 labels.
    loss = relu(1.0 - pred * label)
    if norm == "L1":
        return loss.sum(axis=1).mean()
    else:
        return (loss ** 2).sum(axis=1).mean()


def smooth_l1_loss(pred: Tensor, label: Tensor) -> Tensor:
    r"""
    Caculate the smooth l1 loss proposed in `Fast R-CNN paper by Ross Girshick`.

    The smooth l1 loss can be described as:

    .. math::
        \text{loss}(x, y) = \frac{1}{n} \sum_{i} l_{i}

    where :math:`l_{i}` is given by:

    .. math::
        l_{i} =
        \begin{cases}
        0.5 (x_i - y_i)^2, & \text{if } |x_i - y_i| < 1 \\
        |x_i - y_i| - 0.5, & \text{otherwise }
        \end{cases}

    :param pred: The predicted result from model.
    :param label: The ground truth to compare.

    Examples:

    .. testcode::

        from megengine import tensor
        import megengine.functional as F

        pred = tensor([[0.5, -0.5, 0.1], [-0.6, 0.7, 0.8]])
        label = tensor([[0.4, 1.5, 1.2], [0., 0.1, 2.2]])

        loss = F.smooth_l1_loss(pred, label)

        print(loss.numpy())

    Outputs:

    .. testoutput::

        [0.5608334]
    """
    raise NotImplementedError
    # diff = abs(pred - label)
    # l2_loss = 0.5 * (diff ** 2)
    # l1_loss = diff - 0.5
    # mask = diff < 1
    # loss = where(mask, l2_loss, l1_loss)
    # return loss.mean()