@@ -189,72 +189,70 @@ def test_adam():
         _test_optimizer("Adam", case, CheckValue, update_lr=True)


-def test_adam():
+def test_adagrad():
     class CheckValue:
         def __init__(self, net, **kwarg):
-            self.m_slots = TensorDict()
-            self.v_slots = TensorDict()
+            self.s_slots = TensorDict()
             for param in net.parameters():
-                self.m_slots[param] = np.zeros(param.shape).astype(np.float32)
-                self.v_slots[param] = np.zeros(param.shape).astype(np.float32)
+                self.s_slots[param] = np.zeros(param.shape).astype(np.float32)
             for k, v in kwarg.items():
                 setattr(self, k, v)

         def __call__(self, ori_params, new_params, step):
             for param in new_params:
                 grad = param.grad.numpy()
-                m = self.m_slots[param]
-                v = self.v_slots[param]
-                m *= self.betas[0]
-                m += (1 - self.betas[0]) * grad
-                v *= self.betas[1]
-                v += (1 - self.betas[1]) * grad * grad
-                delta = (m / (1 - self.betas[0] ** step)) / (
-                    np.sqrt(v / (1 - self.betas[1] ** step)) + self.eps
-                )
-                assertTensorClose(param.numpy(), ori_params[param] - self.lr * delta)
+                self.s_slots[param] += grad ** 2
+                delta = grad / (self.s_slots[param] + self.eps) ** 0.5
+                delta *= -(self.lr / (1 + (step - 1) * self.lr_decay))
+                assertTensorClose(param.numpy(), ori_params[param] + delta)

     cases = [
-        {"betas": (0.8, 0.9), "eps": 1e-04, "lr": 0.01},
+        {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.01},
+        {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.0},  # without lr_decay
         {
-            "betas": (0.8, 0.9),
-            "eps": 1e-04,
             "lr": 0.01,
+            "eps": 1e-06,
+            "lr_decay": 0.01,
             "weight_decay": 0.1,
         },  # with weight_decay
     ]
     for case in cases:
-        _test_optimizer("Adam", case, CheckValue)
-        _test_optimizer("Adam", case, CheckValue, update_lr=True)
+        _test_optimizer("Adagrad", case, CheckValue)
+        _test_optimizer("Adagrad", case, CheckValue, update_lr=True)


-def test_adagrad():
+def test_adadelta():
     class CheckValue:
         def __init__(self, net, **kwarg):
             self.s_slots = TensorDict()
+            self.a_slots = TensorDict()
             for param in net.parameters():
                 self.s_slots[param] = np.zeros(param.shape).astype(np.float32)
+                self.a_slots[param] = np.zeros(param.shape).astype(np.float32)
             for k, v in kwarg.items():
                 setattr(self, k, v)

         def __call__(self, ori_params, new_params, step):
             for param in new_params:
                 grad = param.grad.numpy()
-                self.s_slots[param] += grad ** 2
-                delta = grad / (self.s_slots[param] + self.eps) ** 0.5
-                delta *= -(self.lr / (1 + (step - 1) * self.lr_decay))
+                self.s_slots[param] = self.s_slots[param] * self.rho + grad ** 2 * (
+                    1 - self.rho
+                )
+                delta = (
+                    grad
+                    * ((self.a_slots[param] + self.eps) ** 0.5)
+                    / (self.s_slots[param] + self.eps) ** 0.5
+                )
+                self.a_slots[param] = self.a_slots[param] * self.rho + delta ** 2 * (
+                    1 - self.rho
+                )
+                delta *= -self.lr
                 assertTensorClose(param.numpy(), ori_params[param] + delta)

     cases = [
-        {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.01},
-        {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.0},  # without lr_decay
-        {
-            "lr": 0.01,
-            "eps": 1e-06,
-            "lr_decay": 0.01,
-            "weight_decay": 0.1,
-        },  # with weight_decay
+        {"lr": 1.0, "eps": 1e-06, "rho": 0.9},
+        {"lr": 1.0, "eps": 1e-06, "rho": 0.9, "weight_decay": 0.9},  # with weight_decay
     ]
     for case in cases:
-        _test_optimizer("Adagrad", case, CheckValue)
-        _test_optimizer("Adagrad", case, CheckValue, update_lr=True)
+        _test_optimizer("Adadelta", case, CheckValue)
+        _test_optimizer("Adadelta", case, CheckValue, update_lr=True)
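
Note for reviewers: the new test_adagrad checker mirrors the textbook Adagrad update with a decayed learning rate, with eps added inside the square root as in the diff above. Below is a minimal standalone sketch of the same rule in plain NumPy; the name adagrad_reference and the example values are illustrative only, not part of this test file:

import numpy as np

def adagrad_reference(param, grad, state_sum, lr, eps, lr_decay, step):
    # Accumulate squared gradients, as CheckValue.s_slots does.
    state_sum = state_sum + grad ** 2
    # Decayed learning rate: lr / (1 + (step - 1) * lr_decay).
    clr = lr / (1 + (step - 1) * lr_decay)
    # One Adagrad step: param <- param - clr * grad / sqrt(sum + eps).
    return param - clr * grad / np.sqrt(state_sum + eps), state_sum

# Illustrative one-step call (values hypothetical):
p = np.ones(3, dtype=np.float32)
g = np.full(3, 0.5, dtype=np.float32)
s = np.zeros(3, dtype=np.float32)
p, s = adagrad_reference(p, g, s, lr=0.01, eps=1e-06, lr_decay=0.01, step=1)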
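The decay factor is worth a worked number: for the first Adagrad case, {"lr": 0.01, "lr_decay": 0.01}, the expression lr / (1 + (step - 1) * lr_decay) leaves step 1 undecayed and shrinks slowly afterwards, which is exactly what the delta *= -(self.lr / (1 + (step - 1) * self.lr_decay)) line encodes. A quick check:

lr, lr_decay = 0.01, 0.01
for step in (1, 2, 10):
    print(step, lr / (1 + (step - 1) * lr_decay))
# step 1  -> 0.01 (no decay on the first step)
# step 2  -> ~0.00990099
# step 10 -> ~0.00917431

The {"lr_decay": 0.0} case pins this factor at lr for every step, hence the "without lr_decay" tag.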
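Likewise, the new test_adadelta checker follows the usual Adadelta recursion: a running average of squared gradients (s_slots) and a running average of squared updates (a_slots), with eps inside both square roots as in the diff. A standalone NumPy sketch of one step; adadelta_reference is an illustrative name, not a helper from this file:

import numpy as np

def adadelta_reference(param, grad, sq_avg, acc_delta, lr, rho, eps):
    # Running average of squared gradients (s_slots in the test).
    sq_avg = rho * sq_avg + (1 - rho) * grad ** 2
    # Step scaled by the ratio of accumulated update size to gradient size.
    delta = grad * np.sqrt(acc_delta + eps) / np.sqrt(sq_avg + eps)
    # Running average of squared updates (a_slots in the test).
    acc_delta = rho * acc_delta + (1 - rho) * delta ** 2
    # The new cases use lr=1.0 and rho=0.9.
    return param - lr * delta, sq_avg, acc_delta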