
adadelta.py

# -*- coding: utf-8 -*-
from typing import Iterable, Union

import numpy as np

from ..tensor import Parameter, tensor
from .optimizer import Optimizer


class Adadelta(Optimizer):
    r"""Implements Adadelta algorithm.

    It has been proposed in `"ADADELTA: An Adaptive Learning Rate Method"
    <https://arxiv.org/abs/1212.5701>`_.

    Args:
        params: iterable of parameters to optimize or dicts defining
            parameter groups.
        lr: coefficient that scales delta before it is applied
            to the parameters. Default: 1.0
        rho: coefficient used for computing a running average
            of squared gradients. Default: 0.9
        eps: term added to the denominator to improve
            numerical stability. Default: 1e-6
        weight_decay: weight decay (L2 penalty). Default: 0
    """

    def __init__(
        self,
        params: Union[Iterable[Parameter], dict],
        lr: float = 1.0,
        rho: float = 0.9,
        eps: float = 1e-6,
        weight_decay: float = 0.0,
    ):
        assert lr >= 0.0, "Invalid learning rate: {}".format(lr)
        assert rho >= 0.0 and rho <= 1.0, "Invalid rho value: {}".format(rho)
        assert eps >= 0.0, "Invalid epsilon value: {}".format(eps)
        assert weight_decay >= 0.0, "Invalid weight_decay value: {}".format(
            weight_decay
        )

        defaults = dict(lr=lr, rho=rho, eps=eps, weight_decay=weight_decay)
        super().__init__(params, defaults)
        self._disable_type_convert = True

    def _create_state(self, param_group):
        for param in param_group["params"]:
            self._add_state(param, "square_avg")
            self._add_state(param, "acc_delta")
            self._add_state(param, "step", initializer=0.0)

    def _updates(self, param_group):
        lr = param_group["lr"]
        weight_decay = param_group["weight_decay"]
        rho = param_group["rho"]
        eps = param_group["eps"]

        def make_scalar(val):
            return tensor(val, dtype="float32")

        # since input type conversion is disabled for parameter updates,
        # scalars must be explicitly converted to tensors
        _lr = make_scalar(lr)
        _weight_decay = make_scalar(weight_decay)
        _rho = make_scalar(rho)
        _eps = make_scalar(eps)

        c1, c2, c05 = map(make_scalar, (1.0, 2.0, 0.5))

        for param in param_group["params"]:
            if param.grad is None:
                continue

            states = self._state[param]

            step = states["step"]
            step += c1

            grad = param.grad
            if weight_decay != 0.0:
                grad = grad + param * _weight_decay

            square_avg = states["square_avg"]
            acc_delta = states["acc_delta"]
            # running average of squared gradients
            square_avg = _rho * square_avg + (c1 - _rho) * grad ** c2
            # scale the gradient by the ratio of the RMS of accumulated
            # updates to the RMS of accumulated gradients
            std = (square_avg + _eps) ** c05
            delta = (acc_delta + _eps) ** c05 / std * grad
            param -= _lr * delta
            # running average of squared parameter updates
            acc_delta = _rho * acc_delta + (c1 - _rho) * delta ** c2
            states["square_avg"]._reset(square_avg)
            states["acc_delta"]._reset(acc_delta)
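
For reference, the update performed by `_updates` above can be restated without any of this module's machinery. The sketch below is a minimal NumPy illustration of the same rule (running average of squared gradients, RMS-scaled step, running average of squared updates); the helper name `adadelta_step` is hypothetical and is not part of this file or its `Optimizer` API.

    import numpy as np

    def adadelta_step(param, grad, square_avg, acc_delta,
                      lr=1.0, rho=0.9, eps=1e-6, weight_decay=0.0):
        """One Adadelta update on NumPy arrays; returns updated copies."""
        if weight_decay != 0.0:
            grad = grad + weight_decay * param          # L2 penalty
        square_avg = rho * square_avg + (1 - rho) * grad ** 2
        std = np.sqrt(square_avg + eps)
        delta = np.sqrt(acc_delta + eps) / std * grad   # RMS-scaled step
        param = param - lr * delta
        acc_delta = rho * acc_delta + (1 - rho) * delta ** 2
        return param, square_avg, acc_delta

    # toy usage: run Adadelta steps on f(x) = x**2 starting from x = 3
    x = np.array([3.0])
    s_avg = np.zeros_like(x)
    a_delta = np.zeros_like(x)
    for _ in range(1000):
        g = 2 * x                                       # gradient of x**2
        x, s_avg, a_delta = adadelta_step(x, g, s_avg, a_delta)

Note that with zero-initialized accumulators the first steps have magnitude on the order of sqrt(eps), so Adadelta usually needs many iterations even on toy problems. The original paper applies no global learning rate, which is why the default `lr` here is 1.0.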