@@ -20,42 +20,42 @@ class GradManager:
the forward operations start and when all resources should be released. A typical usage of
GradManager is as follows:

.. code-block::

    gm = GradManager()
    gm.attach(model.parameters())
    with gm:
        # forward operations
        ...
        # backward gradients
        gm.backward(loss)

-You can also use `record()` and `release()` method instead of `with` context:
+You can also use the ``record()`` and ``release()`` methods instead of the ``with`` context:

.. code-block::

    gm = GradManager()
    gm.attach(model.parameters())

    gm.record()

    # forward operations
    ...
    # backward gradients
    gm.backward(loss)

    gm.release()

Typically, in data parallel, we would like to average the gradients across
processes. Users will finally get the averaged gradients if an "AllReduce"
callback is registered as follows:

.. code-block::

    import megengine.distributed as dist

    gm = GradManager()
    gm.attach(model.parameters(), callback=dist.make_allreduce_cb("MEAN"))

"""
@@ -50,7 +50,6 @@ class DataLoader:
:param dataset: dataset from which to load the minibatch.
:type sampler: Sampler
:param sampler: defines the strategy to sample data from the dataset.
-    If specified, :attr:`shuffle` must be ``False``.
:type transform: Transform
:param transform: defines the transforming strategy for a sampled batch.
    Default: None
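
A hedged construction sketch for the parameters above; ``RandomSampler`` and ``ToMode`` are assumed to be available from ``megengine.data`` and ``megengine.data.transform``, and ``dataset`` is a placeholder:

.. code-block::

    from megengine.data import DataLoader, RandomSampler
    from megengine.data.transform import ToMode

    sampler = RandomSampler(dataset, batch_size=64)   # sampling strategy for the dataset
    dataloader = DataLoader(dataset, sampler=sampler, transform=ToMode("CHW"))

    for batch_data, batch_label in dataloader:
        ...                                           # one transformed minibatch per iteration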
@@ -17,4 +17,4 @@ from . import distributed # isort:skip
# delete namespace
# pylint: disable=undefined-variable
-# del elemwise, graph, loss, math, nn, tensor # type: ignore[name-defined]
+del elemwise, graph, loss, math, nn, quantized, tensor, utils # type: ignore[name-defined]
@@ -127,9 +127,10 @@ def cross_entropy(
    with_logits: bool = True,
    label_smooth: float = 0,
) -> Tensor:
r"""Compute the multi-class cross entropy loss (using logits by default). | |||||
r"""Computes the multi-class cross entropy loss (using logits by default). | |||||
By default, prediction is assumed to be logits, whose softmax gives probabilities. | |||||
By default(``with_logitis`` is True), ``pred`` is assumed to be logits, | |||||
class probabilities are given by softmax. | |||||
It has better numerical stability compared with sequential calls to :func:`~.softmax` and :func:`~.cross_entropy`. | It has better numerical stability compared with sequential calls to :func:`~.softmax` and :func:`~.cross_entropy`. | ||||
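
To illustrate the ``with_logits`` switch described above, a hedged sketch; it assumes ``cross_entropy`` and ``softmax`` are re-exported at the top of ``megengine.functional`` and that ``megengine.tensor`` builds a Tensor from a NumPy array:

.. code-block::

    import numpy as np
    import megengine.functional as F
    from megengine import tensor

    pred = tensor(np.array([[0.3, 0.7], [0.6, 0.4]], dtype=np.float32))  # raw logits, shape (N, C)
    label = tensor(np.array([1, 0], dtype=np.int32))                     # class indices, shape (N,)

    loss_from_logits = F.cross_entropy(pred, label)                      # softmax applied internally
    loss_from_probs = F.cross_entropy(F.softmax(pred), label, with_logits=False)
    # the two losses are expected to match, up to numerical precision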
@@ -194,9 +195,10 @@ def cross_entropy(
def binary_cross_entropy(
    pred: Tensor, label: Tensor, with_logits: bool = True
) -> Tensor:
r"""Compute the binary cross entropy loss (using logits by default). | |||||
r"""Computes the binary cross entropy loss (using logits by default). | |||||
By default, prediction is assumed to be logits, whose sigmoid gives probabilities. | |||||
By default(``with_logitis`` is True), ``pred`` is assumed to be logits, | |||||
class probabilities are given by sigmoid. | |||||

    :param pred: `(N, *)`, where `*` means any number of additional dimensions.
    :param label: `(N, *)`, same shape as the input.
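
Similarly for the binary case, a hedged sketch assuming ``binary_cross_entropy`` and ``sigmoid`` are re-exported at the top of ``megengine.functional``:

.. code-block::

    import numpy as np
    import megengine.functional as F
    from megengine import tensor

    score = tensor(np.array([[0.9, -1.2, 0.3]], dtype=np.float32))   # raw scores (logits)
    target = tensor(np.array([[1.0, 0.0, 1.0]], dtype=np.float32))   # same shape as score

    loss_from_logits = F.binary_cross_entropy(score, target)         # sigmoid applied internally
    loss_from_probs = F.binary_cross_entropy(F.sigmoid(score), target, with_logits=False)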
@@ -335,8 +335,8 @@ def adaptive_max_pool2d(
    Refer to :class:`~.MaxAdaptivePool2d` for more information.

-    :param inp: The input tensor.
-    :param oshp: (OH, OW) size of the output shape.
+    :param inp: input tensor.
+    :param oshp: `(OH, OW)` size of the output shape.
    :return: output tensor.
    """
    assert isinstance(inp, (Tensor, megbrain_graph.VarNode)), "inp must be Tensor type"
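
A hedged usage sketch for the functional form; it assumes ``adaptive_max_pool2d`` is re-exported at the top of ``megengine.functional`` and accepts a tuple for ``oshp``:

.. code-block::

    import numpy as np
    import megengine.functional as F
    from megengine import tensor

    x = tensor(np.arange(16, dtype=np.float32).reshape(1, 1, 4, 4))
    y = F.adaptive_max_pool2d(x, (2, 2))   # only the target (OH, OW) is specified
    # expected: y has shape (1, 1, 2, 2) with values [[5, 7], [13, 15]],
    # i.e. the max over each 2x2 window of the 4x4 input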
@@ -356,8 +356,8 @@ def adaptive_avg_pool2d(
    Refer to :class:`~.AvgAdaptivePool2d` for more information.

-    :param inp: The input tensor.
-    :param oshp: (OH, OW) size of the output shape.
+    :param inp: input tensor.
+    :param oshp: `(OH, OW)` size of the output shape.
    :return: output tensor.
    """
    assert isinstance(inp, (Tensor, megbrain_graph.VarNode)), "inp must be Tensor type"
@@ -40,10 +40,10 @@ class AdaptiveMaxPool2d(_AdaptivePoolNd):
        \text{stride[1]} \times w + n)
    \end{aligned}

-Kernel_size and stride can be inferred from input shape and out shape:
-padding: (0, 0)
-stride: (floor(IH / OH), floor(IW / OW))
-kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)
+``kernel_size`` and ``stride`` can be inferred from input shape and out shape:
+* padding: (0, 0)
+* stride: (floor(IH / OH), floor(IW / OW))
+* kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)

Examples:
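
As a worked instance of these rules (hypothetical shapes): mapping a 5x5 input to a 2x2 output gives stride = (floor(5 / 2), floor(5 / 2)) = (2, 2) and kernel_size = (5 - (2 - 1) * 2, 5 - (2 - 1) * 2) = (3, 3), so each output element is taken over a 3x3 window moved with stride 2 and no padding.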
@@ -83,10 +83,10 @@ class AdaptiveAvgPool2d(_AdaptivePoolNd):
        out(N_i, C_j, h, w) = \frac{1}{kH * kW} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1}
            input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)

-Kernel_size and stride can be inferred from input shape and out shape:
-padding: (0, 0)
-stride: (floor(IH / OH), floor(IW / OW))
-kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)
+``kernel_size`` and ``stride`` can be inferred from input shape and out shape:
+* padding: (0, 0)
+* stride: (floor(IH / OH), floor(IW / OW))
+* kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)

Examples:
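
A hedged sketch of the module-style API for the class above; the constructor argument (the target output shape) and the availability of ``AdaptiveAvgPool2d`` under ``megengine.module`` are assumptions:

.. code-block::

    import numpy as np
    import megengine.module as M
    from megengine import tensor

    pool = M.AdaptiveAvgPool2d((2, 2))    # only the output spatial shape is given
    x = tensor(np.random.random((1, 3, 5, 5)).astype(np.float32))
    y = pool(x)                           # expected output shape: (1, 3, 2, 2)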
@@ -351,7 +351,7 @@ class Module(metaclass=ABCMeta):
    def replace_param(
        self, params: dict, start_pos: int, seen: Optional[Set[int]] = None
    ):
-        """Replaces module's parameters with `params`, used by :class:`~.ParamPack` to
+        """Replaces module's parameters with ``params``, used by :class:`~.ParamPack` to
        speedup multimachine training.
        """
        offset = 0
@@ -411,7 +411,7 @@ class Module(metaclass=ABCMeta):
If ``strict`` is ``True``, the keys of the given ``state_dict`` must exactly match
the keys returned by this module's :func:`state_dict`.

-Users can also pass a closure: `Function[key: str, var: Tensor] -> Optional[np.ndarray]`
+Users can also pass a closure: ``Function[key: str, var: Tensor] -> Optional[np.ndarray]``
as a `state_dict`, in order to handle complex situations. For example, load everything
except for the final linear classifier:
@@ -423,7 +423,7 @@ class Module(metaclass=ABCMeta):
        for k, v in state_dict.items()
    }, strict=False)

-Here returning `None` means skipping parameter `k`.
+Here returning ``None`` means skipping parameter ``k``.

To prevent shape mismatch (e.g. load PyTorch weights), we can reshape before loading:
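
The reshaping example itself falls outside this hunk; a hypothetical closure of the described form (``pretrained`` stands for a dict of NumPy arrays loaded elsewhere) might look like:

.. code-block::

    model.load_state_dict(
        # reshape the saved array to the current parameter's shape;
        # returning None instead would skip the parameter
        lambda k, var: pretrained[k].reshape(var.shape) if k in pretrained else None,
        strict=False,
    )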
@@ -485,9 +485,8 @@ class Module(metaclass=ABCMeta):
        )

    def _load_state_dict_with_closure(self, closure):
"""Advance state_dict load through callable `closure` whose signature is | |||||
`closure(key: str, var: Tensor) -> Union[np.ndarry, None]` | |||||
"""Advance state_dict load through callable ``closure`` whose signature is | |||||
``closure(key: str, var: Tensor) -> Union[np.ndarry, None]`` | |||||
""" | """ | ||||
assert callable(closure), "closure must be a function" | assert callable(closure), "closure must be a function" | ||||