@@ -25,5 +25,14 @@ class Parameter(Tensor): | |||
def __init__(self, value, *, dtype=None, device=None, requires_grad=True): | |||
# pylint: disable=super-init-not-called | |||
t = tensor(value, dtype=dtype, device=device, requires_grad=requires_grad) | |||
if isinstance(value, Tensor): | |||
t = value | |||
else: | |||
t = tensor(value, dtype=dtype, device=device, requires_grad=requires_grad) | |||
self.__dict__.update(t.__dict__) | |||
@property | |||
def shape(self): | |||
r"""Return shape of parameter. | |||
""" | |||
return self._symvar.imm_shape |
@@ -16,3 +16,4 @@ from .linear import Linear | |||
from .module import Module | |||
from .pooling import AvgPool2d, MaxPool2d | |||
from .sequential import Sequential | |||
from .parampack import ParamPack |
@@ -168,6 +168,29 @@ class Module(metaclass=ABCMeta): | |||
""" | |||
yield from self._flatten(predicate=_is_buffer, recursive=recursive) | |||
def replace_param(self, | |||
params: dict, | |||
start_pos: int, | |||
seen: Optional[Set[int]] = None): | |||
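        r"""Replace parameters of this module and its submodules in place.

        ``params`` maps a traversal index (counted from ``start_pos``, in
        sorted attribute order) to the new :class:`Parameter`; shapes must
        match. Returns the number of parameters visited.
        """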
offset = 0 | |||
if seen is None: | |||
seen = set([id(self)]) | |||
module_dict = vars(self) | |||
for key in sorted(module_dict): | |||
hash_id = id(module_dict[key]) | |||
if hash_id in seen: | |||
continue | |||
seen.add(hash_id) | |||
if isinstance(module_dict[key], Parameter): | |||
if start_pos + offset in params: | |||
                    assert module_dict[key].shape == params[start_pos + offset].shape
module_dict[key] = params[start_pos + offset] | |||
offset += 1 | |||
if isinstance(module_dict[key], Module): | |||
offset += module_dict[key].replace_param(params, start_pos + offset, seen) | |||
return offset | |||
def named_buffers( | |||
self, prefix: str = "", recursive: bool = True | |||
) -> Iterable[Tuple[str, Buffer]]: | |||
@@ -0,0 +1,117 @@ | |||
# -*- coding: utf-8 -*- | |||
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
# | |||
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
# | |||
# Unless required by applicable law or agreed to in writing, | |||
# software distributed under the License is distributed on an | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
import collections | |||
from typing import Iterable, Optional | |||
import numpy as np | |||
from ..core import Parameter, Tensor | |||
from .module import Module | |||
from .._internal.opr import param_pack_split | |||
class ParamPack(Module): | |||
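    r"""Wrap a :class:`Module` and pack its parameters into a few flat
    parameters, grouped by dtype, device and ``requires_grad``.

    The first ``nr_ignore_first`` parameters are kept unpacked; every packed
    group holds at most ``max_size_per_group`` MiB and at most
    ``max_nr_params_per_group`` parameters.
    """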
def __init__(self, | |||
model: Module, | |||
                 nr_ignore_first: int = 8,
max_size_per_group: int = 10, | |||
max_nr_params_per_group: int = 100): | |||
super().__init__() | |||
self._model = model | |||
self._nr_ignore_first = nr_ignore_first | |||
self._max_size_per_group = max_size_per_group | |||
self._max_nr_params_per_group = max_nr_params_per_group | |||
self._grouped_params = [] | |||
self._packed_params = [] | |||
params = model.parameters() | |||
self._pack_params(params) | |||
def parameters(self, requires_grad: Optional[bool] = None) -> Iterable[Parameter]: | |||
for param in self._packed_params: | |||
if requires_grad is None or param.requires_grad == requires_grad: | |||
yield param | |||
def _pack_params(self, params: Iterable[Parameter]): | |||
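        # Keep the first ``nr_ignore_first`` parameters unpacked, then bucket
        # the rest by (dtype, device, requires_grad). Each bucket is cut into
        # groups bounded by ``max_size_per_group`` / ``max_nr_params_per_group``
        # and copied into one flat Parameter, with each member aligned to the
        # device's memory alignment.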
groups = collections.defaultdict(list) | |||
ignored = 0 | |||
param_id = 0 | |||
for param in params: | |||
if self._nr_ignore_first > ignored: | |||
ignored += 1 | |||
self._grouped_params.append([{'tensor': param, 'id': param_id}]) | |||
self._packed_params.append(param) | |||
else: | |||
key = (param.dtype, param.device, param.requires_grad) | |||
groups[key].append({'tensor': param, 'id': param_id}) | |||
param_id += 1 | |||
for (dtype, device, requires_grad) in groups.keys(): | |||
dtype_sz = np.dtype(dtype).itemsize | |||
align = device.mem_align | |||
if align < dtype_sz: | |||
align = 1 | |||
else: | |||
assert align % dtype_sz == 0 | |||
align //= dtype_sz | |||
group = groups[(dtype, device, requires_grad)] | |||
while group: | |||
aligned_pos = [] | |||
offset = 0 | |||
params = [] | |||
idx = 0 | |||
while idx < len(group): | |||
param = group[idx] | |||
assert param['tensor'].device == device | |||
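                    # round ``offset`` up to the next multiple of ``align``
                    # (the bit trick assumes ``align`` is a power of two)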
padding = (align - (offset & (align - 1))) & (align - 1) | |||
offset += padding | |||
aligned_pos.append(offset) | |||
params.append(param) | |||
offset += int(np.prod(param['tensor'].shape)) | |||
idx += 1 | |||
if (offset * dtype_sz >= | |||
self._max_size_per_group * 1024 * 1024 | |||
or idx >= self._max_nr_params_per_group): | |||
break | |||
group = group[idx:] | |||
if idx == 1: | |||
# ignore param packs with only one item | |||
self._packed_params.append(params[0]) | |||
self._grouped_params.append(params) | |||
continue | |||
packed_value = np.zeros((offset, ), dtype=dtype) | |||
for param, pos in zip(params, aligned_pos): | |||
val = param['tensor'].numpy() | |||
packed_value[pos:pos + val.size] = val.flatten() | |||
new_param = Parameter(value=packed_value, | |||
device=device, | |||
dtype=dtype, | |||
requires_grad=requires_grad) | |||
self._packed_params.append(new_param) | |||
self._grouped_params.append(params) | |||
def forward(self, *args, **kwargs): | |||
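        # Split every packed parameter back into tensors of the original
        # shapes, substitute them into the wrapped model (keyed by traversal
        # index), then run the model's own forward.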
replace_param = dict() | |||
for i in range(len(self._packed_params)): | |||
packed_param = self._packed_params[i] | |||
grouped_params = self._grouped_params[i] | |||
if len(grouped_params) == 1: | |||
continue | |||
split = param_pack_split(packed_param._symvar, | |||
[i['tensor'].shape for i in grouped_params]) | |||
split = [ | |||
Parameter(Tensor(i, requires_grad=packed_param.requires_grad)) | |||
for i in split | |||
] | |||
for j in range(len(split)): | |||
replace_param[grouped_params[j]['id']] = split[j] | |||
self._model.replace_param(replace_param, 0) | |||
return self._model.forward(*args, **kwargs) |
@@ -168,6 +168,8 @@ class Optimizer(metaclass=ABCMeta): | |||
cg = get_default_graph() | |||
grads = grad_func(loss, params, use_virtual_grad=not cg.is_eager()) | |||
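        # ``grads`` may come back as a single Tensor instead of a list;
        # normalize it so it can be zipped with ``params`` below.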
if not isinstance(grads, list): | |||
grads = [grads] | |||
assert len(grads) == len(params) | |||
for param, grad in zip(params, grads): | |||
@@ -0,0 +1,207 @@ | |||
# -*- coding: utf-8 -*- | |||
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
# | |||
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
# | |||
# Unless required by applicable law or agreed to in writing, | |||
# software distributed under the License is distributed on an | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
import itertools | |||
import numpy as np | |||
import pytest | |||
import megengine as mge | |||
from megengine.core import tensor | |||
from megengine.functional import cross_entropy_with_softmax, tanh | |||
from megengine.jit import trace | |||
from megengine.module import Linear, Module, ParamPack | |||
from megengine.optimizer import SGD | |||
batch_size = 64 | |||
data_shape = (batch_size, 2) | |||
label_shape = (batch_size,) | |||
def minibatch_generator(): | |||
while True: | |||
inp_data = np.zeros((batch_size, 2)) | |||
label = np.zeros(batch_size, dtype=np.int32) | |||
for i in range(batch_size): | |||
# [x0, x1], sampled from U[-1, 1] | |||
inp_data[i, :] = np.random.rand(2) * 2 - 1 | |||
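            # label is 0 when x0 and x1 have opposite signs, 1 otherwise (XOR-like rule)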
label[i] = 0 if np.prod(inp_data[i]) < 0 else 1 | |||
yield inp_data.astype(np.float32), label.astype(np.int32) | |||
def calculate_precision(data: np.ndarray, pred: np.ndarray) -> float: | |||
""" Calculate precision for given data and prediction. | |||
:type data: [[x, y], ...] | |||
:param data: Input data | |||
:type pred: [[x_pred, y_pred], ...] | |||
:param pred: Network output data | |||
""" | |||
correct = 0 | |||
assert len(data) == len(pred) | |||
for inp_data, pred_output in zip(data, pred): | |||
label = 0 if np.prod(inp_data) < 0 else 1 | |||
pred_label = np.argmax(pred_output) | |||
if pred_label == label: | |||
correct += 1 | |||
return float(correct) / len(data) | |||
class XORNet(Module): | |||
def __init__(self): | |||
self.mid_layers = 14 | |||
self.num_class = 2 | |||
super().__init__() | |||
self.fc0 = Linear(self.num_class, self.mid_layers, bias=True) | |||
self.fc1 = Linear(self.mid_layers, self.mid_layers, bias=True) | |||
self.fc2 = Linear(self.mid_layers, self.num_class, bias=True) | |||
def forward(self, x): | |||
x = self.fc0(x) | |||
x = tanh(x) | |||
x = self.fc1(x) | |||
x = tanh(x) | |||
x = self.fc2(x) | |||
return x | |||
@pytest.mark.slow | |||
def test_static_graph_parampack(): | |||
net = XORNet() | |||
net = ParamPack(net, | |||
nr_ignore_first=0, | |||
max_size_per_group=10, | |||
max_nr_params_per_group=100) | |||
opt = SGD( | |||
net.parameters(requires_grad=True), lr=0.01, momentum=0.9, weight_decay=5e-4 | |||
) | |||
@trace(symbolic=True) | |||
def train(data, label): | |||
pred = net(data) | |||
opt.zero_grad() | |||
loss = cross_entropy_with_softmax(pred, label) | |||
opt.backward(loss) | |||
return loss | |||
@trace(symbolic=True) | |||
def infer(data): | |||
return net(data) | |||
train_dataset = minibatch_generator() | |||
losses = [] | |||
for data, label in itertools.islice(train_dataset, 2000): | |||
loss = train(data, label) | |||
loss = loss[0][0] | |||
opt.step() | |||
losses.append(loss.numpy()) | |||
    assert np.mean(losses[-100:]) < 0.1, "Final training loss must be low enough"
data, _ = next(train_dataset) | |||
pred = infer(data).numpy() | |||
assert calculate_precision(data, pred) > 0.95, "Test precision must be high enough" | |||
@pytest.mark.slow | |||
def test_dynamic_graph_parampack(): | |||
net = XORNet() | |||
net = ParamPack(net, | |||
nr_ignore_first=0, | |||
max_size_per_group=10, | |||
max_nr_params_per_group=100) | |||
opt = SGD( | |||
net.parameters(requires_grad=True), lr=0.01, momentum=0.9, weight_decay=5e-4 | |||
) | |||
@trace(symbolic=False) | |||
def train(data, label): | |||
pred = net(data) | |||
opt.zero_grad() | |||
loss = cross_entropy_with_softmax(pred, label) | |||
opt.backward(loss) | |||
return loss | |||
@trace(symbolic=False) | |||
def infer(data): | |||
return net(data) | |||
train_dataset = minibatch_generator() | |||
losses = [] | |||
for data, label in itertools.islice(train_dataset, 2000): | |||
loss = train(data, label) | |||
loss = loss[0][0] | |||
opt.step() | |||
losses.append(loss.numpy()) | |||
    assert np.mean(losses[-100:]) < 0.1, "Final training loss must be low enough"
data, _ = next(train_dataset) | |||
pred = infer(data).numpy() | |||
assert calculate_precision(data, pred) > 0.95, "Test precision must be high enough" | |||
@pytest.mark.slow | |||
def test_correctness_parampack(): | |||
net1 = XORNet() | |||
net2 = XORNet() | |||
params1 = net1.parameters() | |||
params2 = net2.parameters() | |||
for param1, param2 in zip(params1, params2): | |||
param1.set_value(param2.numpy()) | |||
net1 = ParamPack(net1, | |||
nr_ignore_first=0, | |||
max_size_per_group=10, | |||
max_nr_params_per_group=100) | |||
opt1 = SGD( | |||
net1.parameters(requires_grad=True), lr=0.01, momentum=0.9, weight_decay=5e-4 | |||
) | |||
opt2 = SGD( | |||
net2.parameters(requires_grad=True), lr=0.01, momentum=0.9, weight_decay=5e-4 | |||
) | |||
@trace(symbolic=False) | |||
def train1(data, label): | |||
pred = net1(data) | |||
opt1.zero_grad() | |||
loss = cross_entropy_with_softmax(pred, label) | |||
opt1.backward(loss) | |||
return loss | |||
@trace(symbolic=False) | |||
def train2(data, label): | |||
pred = net2(data) | |||
opt2.zero_grad() | |||
loss = cross_entropy_with_softmax(pred, label) | |||
opt2.backward(loss) | |||
return loss | |||
@trace(symbolic=False) | |||
def infer1(data): | |||
return net1(data) | |||
@trace(symbolic=False) | |||
def infer2(data): | |||
return net2(data) | |||
train_dataset = minibatch_generator() | |||
for data, label in itertools.islice(train_dataset, 2000): | |||
train1(data, label) | |||
opt1.step() | |||
train2(data, label) | |||
opt2.step() | |||
data, _ = next(train_dataset) | |||
pred1 = infer1(data).numpy() | |||
pred2 = infer2(data).numpy() | |||
assert np.allclose(pred1, pred2) |