"""
 Copyright 2020 Tianshu AI Platform. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 =============================================================
"""

from kamal.core.engine.trainer import Engine
from kamal.core.tasks.loss import kldiv
import torch.nn.functional as F
from kamal.utils.logger import get_logger
from kamal.utils import set_mode, move_to_device
import weakref

import torch
import torch.nn as nn

import time
import numpy as np

class KDDistiller(Engine):
    def __init__( self, 
                  logger=None,
                  tb_writer=None):
        super(KDDistiller, self).__init__(logger=logger, tb_writer=tb_writer)

    def setup(self, student, teacher, dataloader, optimizer, T=1.0, alpha=1.0, beta=1.0, gamma=1.0, device=None):
        self.model = self.student = student
        self.teacher = teacher
        self.dataloader = dataloader
        self.optimizer = optimizer
        if device is None:
            device = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu' )
        self.device = device

        self.T = T
        self.gamma = gamma
        self.alpha = alpha
        self.beta = beta

        self.student.to(self.device)
        self.teacher.to(self.device)

    def run( self, max_iter, start_iter=0, epoch_length=None):
        with set_mode(self.student, training=True), \
             set_mode(self.teacher, training=False):
            super( KDDistiller, self ).run( self.step_fn, self.dataloader, start_iter=start_iter, max_iter=max_iter, epoch_length=epoch_length)

    def additional_kd_loss(self, engine, batch):
        return batch[0].new_zeros(1)

    def step_fn(self, engine, batch):
        student = self.student
        teacher = self.teacher
        start_time = time.perf_counter()
        batch = move_to_device(batch, self.device)
        inputs, targets = batch
        outputs = student(inputs)
        with torch.no_grad():
            soft_targets = teacher(inputs)
    
        loss_dict = { "loss_kld":        self.alpha * kldiv(outputs, soft_targets, T=self.T),
                      "loss_ce":         self.beta * F.cross_entropy( outputs, targets ),
                      "loss_additional": self.gamma * self.additional_kd_loss(engine, batch) }
        
        loss = sum( loss_dict.values() )
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        step_time = time.perf_counter() - start_time
        metrics = { loss_name: loss_value.item() for (loss_name, loss_value) in loss_dict.items() }
        metrics.update({
            'total_loss': loss.item(),
            'step_time': step_time,
            'lr': float( self.optimizer.param_groups[0]['lr'] )
        })
        return metrics