@@ -20,42 +20,42 @@ class GradManager:
    the forward operations start and when all resources should be released. A typical usage of
    GradManager is as follows:

    .. code-block::

        gm = GradManager()
        gm.attach(model.parameters())
        with gm:
            # forward operations
            ...
            # backward gradients
            gm.backward(loss)
-    You can also use `record()` and `release()` method instead of `with` context:
+    You can also use the ``record()`` and ``release()`` methods instead of the ``with`` context:
    .. code-block::

        gm = GradManager()
        gm.attach(model.parameters())

        gm.record()

        # forward operations
        ...
        # backward gradients
        gm.backward(loss)

        gm.release()
    Typically, in data parallel, we would like to average the gradients across
    processes. Users will finally get the averaged gradients if an "AllReduce"
    callback is registered as follows:

    .. code-block::

        import megengine.distributed as dist

        gm = GradManager()
        gm.attach(model.parameters(), callback=dist.make_allreduce_cb("MEAN"))
    """
@@ -50,7 +50,6 @@ class DataLoader:
    :param dataset: dataset from which to load the minibatch.
    :type sampler: Sampler
    :param sampler: defines the strategy to sample data from the dataset.
-        If specified, :attr:`shuffle` must be ``False``.
    :type transform: Transform
    :param transform: defines the transforming strategy for a sampled batch.
        Default: None
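A minimal construction sketch of the parameters above; ``MyDataset`` is a hypothetical dataset implementation, and ``RandomSampler`` is assumed to be importable from ``megengine.data``:

.. code-block::

    from megengine.data import DataLoader, RandomSampler

    dataset = MyDataset()                              # hypothetical Dataset implementation
    sampler = RandomSampler(dataset, batch_size=64)    # strategy used to draw minibatches
    dataloader = DataLoader(dataset, sampler=sampler)  # transform and collator left at their defaults

    for batch_data, batch_label in dataloader:
        ...                                            # consume one minibatch per iteration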
@@ -17,4 +17,4 @@ from . import distributed  # isort:skip
# delete namespace
# pylint: disable=undefined-variable
-# del elemwise, graph, loss, math, nn, tensor  # type: ignore[name-defined]
+del elemwise, graph, loss, math, nn, quantized, tensor, utils  # type: ignore[name-defined]
@@ -127,9 +127,10 @@ def cross_entropy(
    with_logits: bool = True,
    label_smooth: float = 0,
) -> Tensor:
-    r"""Compute the multi-class cross entropy loss (using logits by default).
+    r"""Computes the multi-class cross entropy loss (using logits by default).

-    By default, prediction is assumed to be logits, whose softmax gives probabilities.
+    By default (``with_logits`` is True), ``pred`` is assumed to be logits,
+    and class probabilities are given by softmax.

    It has better numerical stability compared with sequential calls to :func:`~.softmax` and :func:`~.cross_entropy`.
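As a worked illustration of the default mode, a small NumPy sketch with made-up values of the quantity computed when the prediction is given as logits, assuming a mean reduction over the batch:

.. code-block::

    import numpy as np

    pred = np.array([[5.0, 1.0], [1.0, 5.0]])    # logits, shape (N, C)
    label = np.array([0, 1])                     # target class indices, shape (N,)

    # softmax turns logits into class probabilities
    prob = np.exp(pred) / np.exp(pred).sum(axis=1, keepdims=True)
    # cross entropy is the negative log-probability of the true class
    loss = -np.log(prob[np.arange(len(label)), label]).mean()
    print(loss)  # ~0.018, small because the logits already favour the true classes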
@@ -194,9 +195,10 @@ def cross_entropy(
def binary_cross_entropy(
    pred: Tensor, label: Tensor, with_logits: bool = True
) -> Tensor:
-    r"""Compute the binary cross entropy loss (using logits by default).
+    r"""Computes the binary cross entropy loss (using logits by default).

-    By default, prediction is assumed to be logits, whose sigmoid gives probabilities.
+    By default (``with_logits`` is True), ``pred`` is assumed to be logits,
+    and class probabilities are given by sigmoid.

    :param pred: `(N, *)`, where `*` means any number of additional dimensions.
    :param label: `(N, *)`, same shape as the input.
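Again as a sketch only, with hypothetical values and an assumed mean reduction, the binary counterpart applies a sigmoid and then the usual binary cross entropy:

.. code-block::

    import numpy as np

    pred = np.array([2.0, -1.0])          # logits
    label = np.array([1.0, 0.0])          # binary targets

    prob = 1.0 / (1.0 + np.exp(-pred))    # sigmoid gives per-element probabilities
    loss = -(label * np.log(prob) + (1 - label) * np.log(1 - prob)).mean()
    print(loss)  # ~0.22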
@@ -335,8 +335,8 @@ def adaptive_max_pool2d(
    Refer to :class:`~.AdaptiveMaxPool2d` for more information.

-    :param inp: The input tensor.
-    :param oshp: (OH, OW) size of the output shape.
+    :param inp: input tensor.
+    :param oshp: `(OH, OW)` size of the output shape.
    :return: output tensor.
    """
    assert isinstance(inp, (Tensor, megbrain_graph.VarNode)), "inp must be Tensor type"
@@ -356,8 +356,8 @@ def adaptive_avg_pool2d(
    Refer to :class:`~.AdaptiveAvgPool2d` for more information.

-    :param inp: The input tensor.
-    :param oshp: (OH, OW) size of the output shape.
+    :param inp: input tensor.
+    :param oshp: `(OH, OW)` size of the output shape.
    :return: output tensor.
    """
    assert isinstance(inp, (Tensor, megbrain_graph.VarNode)), "inp must be Tensor type"
@@ -40,10 +40,10 @@ class AdaptiveMaxPool2d(_AdaptivePoolNd):
                                           \text{stride[1]} \times w + n)
        \end{aligned}

-    Kernel_size and stride can be inferred from input shape and out shape:
-    padding: (0, 0)
-    stride: (floor(IH / OH), floor(IW / OW))
-    kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)
+    ``kernel_size`` and ``stride`` can be inferred from the input and output shapes:
+    * padding: (0, 0)
+    * stride: (floor(IH / OH), floor(IW / OW))
+    * kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)

    Examples:
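For instance, a quick worked run of this inference with a hypothetical 8x8 input pooled down to 3x3:

.. code-block::

    # hypothetical shapes: input (N, C, 8, 8), requested output (N, C, 3, 3)
    IH, IW, OH, OW = 8, 8, 3, 3
    stride = (IH // OH, IW // OW)                                          # (2, 2)
    kernel_size = (IH - (OH - 1) * stride[0], IW - (OW - 1) * stride[1])   # (4, 4)
    # a 4x4 window moved with stride 2 over an 8x8 map yields exactly 3x3 outputs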
@@ -83,10 +83,10 @@ class AdaptiveAvgPool2d(_AdaptivePoolNd):
        out(N_i, C_j, h, w) = \frac{1}{kH * kW} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1}
            input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)

-    Kernel_size and stride can be inferred from input shape and out shape:
-    padding: (0, 0)
-    stride: (floor(IH / OH), floor(IW / OW))
-    kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)
+    ``kernel_size`` and ``stride`` can be inferred from the input and output shapes:
+    * padding: (0, 0)
+    * stride: (floor(IH / OH), floor(IW / OW))
+    * kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)

    Examples:
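A minimal usage sketch, assuming the module is exposed as ``megengine.module.AdaptiveAvgPool2d`` and is fed a random NCHW tensor:

.. code-block::

    import numpy as np
    import megengine as mge
    import megengine.module as M

    pool = M.AdaptiveAvgPool2d((2, 2))                 # request a fixed 2x2 spatial output
    x = mge.tensor(np.random.rand(1, 3, 8, 8).astype("float32"))
    y = pool(x)
    print(y.shape)  # (1, 3, 2, 2) regardless of the 8x8 input resolution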
@@ -351,7 +351,7 @@ class Module(metaclass=ABCMeta):
    def replace_param(
        self, params: dict, start_pos: int, seen: Optional[Set[int]] = None
    ):
-        """Replaces module's parameters with `params`, used by :class:`~.ParamPack` to
+        """Replaces module's parameters with ``params``, used by :class:`~.ParamPack` to
        speed up multi-machine training.
        """
        offset = 0
@@ -411,7 +411,7 @@ class Module(metaclass=ABCMeta):
        If ``strict`` is ``True``, the keys of ``state_dict`` must exactly match the keys
        returned by this module's :func:`state_dict` function.

-        Users can also pass a closure: `Function[key: str, var: Tensor] -> Optional[np.ndarray]`
+        Users can also pass a closure: ``Function[key: str, var: Tensor] -> Optional[np.ndarray]``
        as a `state_dict`, in order to handle complex situations. For example, load everything
        except for the final linear classifier:
@@ -423,7 +423,7 @@ class Module(metaclass=ABCMeta):
                for k, v in state_dict.items()
            }, strict=False)

-        Here returning `None` means skipping parameter `k`.
+        Here returning ``None`` means skipping parameter ``k``.

        To prevent shape mismatch (e.g. load PyTorch weights), we can reshape before loading:
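For instance, a minimal sketch of such a reshaping closure; it is hypothetical and assumes ``state_dict`` is a plain dict mapping parameter keys to NumPy arrays:

.. code-block::

    model.load_state_dict(
        lambda k, var: state_dict[k].reshape(var.shape),  # force each array into the target shape
        strict=False,
    )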
@@ -485,9 +485,8 @@ class Module(metaclass=ABCMeta):
        )

    def _load_state_dict_with_closure(self, closure):
-        """Advance state_dict load through callable `closure` whose signature is
-        `closure(key: str, var: Tensor) -> Union[np.ndarry, None]`
+        """Advances state_dict loading through a callable ``closure`` whose signature is
+        ``closure(key: str, var: Tensor) -> Union[np.ndarray, None]``
        """
        assert callable(closure), "closure must be a function"