
docs(docstring): transfer to google style

GitOrigin-RevId: a71245c553
Branch: release-1.6
Author: Megvii Engine Team, 3 years ago
parent commit c5f8b58385
100 changed files with 4334 additions and 4806 deletions
  1. imperative/python/megengine/amp/autocast.py (+26, -27)
  2. imperative/python/megengine/amp/grad_scaler.py (+59, -57)
  3. imperative/python/megengine/autodiff/grad_manager.py (+11, -15)
  4. imperative/python/megengine/core/_trace_option.py (+2, -6)
  5. imperative/python/megengine/core/autodiff/grad.py (+39, -50)
  6. imperative/python/megengine/core/tensor/amp.py (+18, -24)
  7. imperative/python/megengine/core/tensor/array_method.py (+94, -108)
  8. imperative/python/megengine/core/tensor/dtype.py (+56, -61)
  9. imperative/python/megengine/core/tensor/megbrain_graph.py (+100, -100)
  10. imperative/python/megengine/core/tensor/utils.py (+7, -7)
  11. imperative/python/megengine/data/collator.py (+1, -6)
  12. imperative/python/megengine/data/dataloader.py (+18, -18)
  13. imperative/python/megengine/data/dataset/meta_dataset.py (+4, -7)
  14. imperative/python/megengine/data/dataset/vision/cifar.py (+2, -4)
  15. imperative/python/megengine/data/dataset/vision/cityscapes.py (+1, -3)
  16. imperative/python/megengine/data/dataset/vision/coco.py (+1, -3)
  17. imperative/python/megengine/data/dataset/vision/folder.py (+12, -13)
  18. imperative/python/megengine/data/dataset/vision/imagenet.py (+20, -23)
  19. imperative/python/megengine/data/dataset/vision/mnist.py (+1, -2)
  20. imperative/python/megengine/data/dataset/vision/objects365.py (+1, -3)
  21. imperative/python/megengine/data/dataset/vision/voc.py (+1, -3)
  22. imperative/python/megengine/data/sampler.py (+59, -71)
  23. imperative/python/megengine/data/tools/_queue.py (+3, -5)
  24. imperative/python/megengine/data/transform/meta_transform.py (+1, -3)
  25. imperative/python/megengine/data/transform/vision/functional.py (+45, -38)
  26. imperative/python/megengine/data/transform/vision/transform.py (+161, -165)
  27. imperative/python/megengine/device.py (+35, -47)
  28. imperative/python/megengine/distributed/__init__.py (+4, -6)
  29. imperative/python/megengine/distributed/functional.py (+136, -167)
  30. imperative/python/megengine/distributed/group.py (+26, -28)
  31. imperative/python/megengine/distributed/helper.py (+55, -53)
  32. imperative/python/megengine/distributed/launcher.py (+11, -10)
  33. imperative/python/megengine/distributed/server.py (+60, -60)
  34. imperative/python/megengine/dtr/dtr.py (+23, -37)
  35. imperative/python/megengine/functional/debug_param.py (+21, -21)
  36. imperative/python/megengine/functional/elemwise.py (+130, -163)
  37. imperative/python/megengine/functional/external.py (+12, -12)
  38. imperative/python/megengine/functional/loss.py (+113, -106)
  39. imperative/python/megengine/functional/math.py (+376, -347)
  40. imperative/python/megengine/functional/metric.py (+8, -25)
  41. imperative/python/megengine/functional/nn.py (+366, -411)
  42. imperative/python/megengine/functional/quantized.py (+42, -50)
  43. imperative/python/megengine/functional/tensor.py (+527, -491)
  44. imperative/python/megengine/functional/utils.py (+16, -17)
  45. imperative/python/megengine/functional/vision.py (+234, -230)
  46. imperative/python/megengine/hub/exceptions.py (+6, -6)
  47. imperative/python/megengine/hub/fetcher.py (+25, -36)
  48. imperative/python/megengine/hub/hub.py (+60, -88)
  49. imperative/python/megengine/hub/tools.py (+10, -10)
  50. imperative/python/megengine/jit/graph_opt_config.py (+4, -4)
  51. imperative/python/megengine/jit/sublinear_memory_config.py (+19, -19)
  52. imperative/python/megengine/jit/tracing.py (+83, -86)
  53. imperative/python/megengine/logger.py (+27, -49)
  54. imperative/python/megengine/module/activation.py (+98, -108)
  55. imperative/python/megengine/module/adaptive_pooling.py (+28, -28)
  56. imperative/python/megengine/module/batch_matmul_activation.py (+1, -3)
  57. imperative/python/megengine/module/batchnorm.py (+61, -77)
  58. imperative/python/megengine/module/concat.py (+1, -2)
  59. imperative/python/megengine/module/conv.py (+200, -226)
  60. imperative/python/megengine/module/conv_bn.py (+2, -4)
  61. imperative/python/megengine/module/dropout.py (+3, -3)
  62. imperative/python/megengine/module/elemwise.py (+4, -59)
  63. imperative/python/megengine/module/embedding.py (+42, -44)
  64. imperative/python/megengine/module/external.py (+3, -3)
  65. imperative/python/megengine/module/init.py (+74, -71)
  66. imperative/python/megengine/module/linear.py (+18, -18)
  67. imperative/python/megengine/module/module.py (+65, -81)
  68. imperative/python/megengine/module/normalization.py (+3, -6)
  69. imperative/python/megengine/module/pooling.py (+26, -47)
  70. imperative/python/megengine/module/qat/batch_matmul_activation.py (+1, -3)
  71. imperative/python/megengine/module/qat/concat.py (+1, -6)
  72. imperative/python/megengine/module/qat/conv.py (+3, -6)
  73. imperative/python/megengine/module/qat/conv_bn.py (+2, -8)
  74. imperative/python/megengine/module/qat/elemwise.py (+1, -4)
  75. imperative/python/megengine/module/qat/linear.py (+6, -7)
  76. imperative/python/megengine/module/qat/module.py (+12, -28)
  77. imperative/python/megengine/module/qat/quant_dequant.py (+2, -4)
  78. imperative/python/megengine/module/quant_dequant.py (+2, -4)
  79. imperative/python/megengine/module/quantized/concat.py (+1, -3)
  80. imperative/python/megengine/module/quantized/conv.py (+3, -2)
  81. imperative/python/megengine/module/quantized/conv_bn.py (+1, -4)
  82. imperative/python/megengine/module/quantized/module.py (+3, -4)
  83. imperative/python/megengine/module/quantized/quant_dequant.py (+2, -12)
  84. imperative/python/megengine/module/sequential.py (+24, -27)
  85. imperative/python/megengine/module/sliding_window.py (+40, -66)
  86. imperative/python/megengine/optimizer/adadelta.py (+12, -12)
  87. imperative/python/megengine/optimizer/adagrad.py (+11, -11)
  88. imperative/python/megengine/optimizer/adam.py (+8, -9)
  89. imperative/python/megengine/optimizer/adamw.py (+8, -9)
  90. imperative/python/megengine/optimizer/clip_grad.py (+13, -9)
  91. imperative/python/megengine/optimizer/lr_scheduler.py (+9, -12)
  92. imperative/python/megengine/optimizer/multi_step_lr.py (+10, -14)
  93. imperative/python/megengine/optimizer/optimizer.py (+16, -22)
  94. imperative/python/megengine/optimizer/sgd.py (+7, -7)
  95. imperative/python/megengine/quantization/fake_quant.py (+16, -16)
  96. imperative/python/megengine/quantization/observer.py (+41, -46)
  97. imperative/python/megengine/quantization/qconfig.py (+21, -21)
  98. imperative/python/megengine/quantization/quantize.py (+44, -45)
  99. imperative/python/megengine/quantization/utils.py (+22, -23)
  100. imperative/python/megengine/random/rng.py (+191, -181)

imperative/python/megengine/amp/autocast.py (+26, -27)

@@ -11,38 +11,37 @@ from ..core.tensor import amp


class autocast:
r"""
A class to control autocast mode for amp as a context manager or a decorator.
r"""A class to control autocast mode for amp as a context manager or a decorator.

:param enabled: Whether autocast mode is enabled.
:param low_prec_dtype: Set amp autocast mode's lower precision dtype. It will change
the target dtype in tensor casting for better speed and memory. Default: float16.
:param high_prec_dtype: Set amp autocast mode's higher precision dtype. It will
change the target dtype in tensor casting for better precision. Default: float32.
Args:
enabled: Whether autocast mode is enabled.
low_prec_dtype: Set amp autocast mode's lower precision dtype. It will change
the target dtype in tensor casting for better speed and memory. Default: float16.
high_prec_dtype: Set amp autocast mode's higher precision dtype. It will
change the target dtype in tensor casting for better precision. Default: float32.

Examples:
.. code-block::

.. code-block::
# used as decorator
@autocast()
def train_step(image, label):
with gm:
logits = model(image)
loss = F.nn.cross_entropy(logits, label)
gm.backward(loss)
opt.step().clear_grad()
return loss

# used as decorator
@autocast()
def train_step(image, label):
with gm:
logits = model(image)
loss = F.nn.cross_entropy(logits, label)
gm.backward(loss)
opt.step().clear_grad()
return loss

# used as context manager
def train_step(image, label):
with autocast():
with gm:
logits = model(image)
loss = F.nn.cross_entropy(logits, label)
gm.backward(loss)
opt.step().clear_grad()
return loss
# used as context manager
def train_step(image, label):
with autocast():
with gm:
logits = model(image)
loss = F.nn.cross_entropy(logits, label)
gm.backward(loss)
opt.step().clear_grad()
return loss
"""

def __init__(


imperative/python/megengine/amp/grad_scaler.py (+59, -57)

@@ -16,50 +16,51 @@ from ..tensor import Tensor


class GradScaler:
r"""
A helper class that performs grad scaling to prevent from data overflow in
r"""A helper class that performs grad scaling to prevent from data overflow in
:class:`~.autocast` mode.

:param init_scale: Initial scale factor.
:param growth_factor: Factor that the scale is multiplied by in actual
:meth:`update` stage. If growth_factor is 0, scale_factor will not update.
:param backoff_factor: Factor that the scale is multiplied by when encountering
overflow grad.
:param growth_interval: The interval between two scale update stages.

Example::

gm = GradManager()
opt = ...
scaler = GradScaler()

gm.attach(model.parameters())

@autocast()
def train_step(image, label):
with gm:
logits = model(image)
loss = F.nn.cross_entropy(logits, label)
scaler.backward(gm, loss)
opt.step().clear_grad()
return loss

If need more flexible usage, could split ``scaler.backward`` into three lines:

.. code-block::

@autocast()
def train_step(image, label):
with gm:
logits = model(image)
loss = F.nn.cross_entropy(logits, label)
gm.backward(loss, dy=megengine.tensor(scaler.scale_factor))
scaler.unscale(gm.attached_tensors())
scaler.update()
opt.step().clear_grad()
return loss

This is useful when need to accumulate grads for multi batches.
Args:
init_scale: Initial scale factor.
growth_factor: Factor that the scale is multiplied by in actual
:meth:`update` stage. If growth_factor is 0, scale_factor will not update.
backoff_factor: Factor that the scale is multiplied by when encountering
overflow grad.
growth_interval: The interval between two scale update stages.

Example:
.. code-block::

gm = GradManager()
opt = ...
scaler = GradScaler()

gm.attach(model.parameters())

@autocast()
def train_step(image, label):
with gm:
logits = model(image)
loss = F.nn.cross_entropy(logits, label)
scaler.backward(gm, loss)
opt.step().clear_grad()
return loss

If need more flexible usage, could split ``scaler.backward`` into three lines:

.. code-block::

@autocast()
def train_step(image, label):
with gm:
logits = model(image)
loss = F.nn.cross_entropy(logits, label)
gm.backward(loss, dy=megengine.tensor(scaler.scale_factor))
scaler.unscale(gm.attached_tensors())
scaler.update()
opt.step().clear_grad()
return loss

This is useful when need to accumulate grads for multi batches.
"""

def __init__(
@@ -86,18 +87,18 @@ class GradScaler:
unscale_grad: bool = True,
update_scale: bool = "if_unscale_grad"
):
r"""
A wrapper of GradManager's :meth:`~.GradManager.backward`, used to scale
r"""A wrapper of GradManager's :meth:`~.GradManager.backward`, used to scale
``y``'s grad and unscale parameters' grads.

:param gm: The to be wrapped GradManager.
:param y: Same as GradManager backward's ``y``.
:param dy: Same as GradManager backward's ``dy``. Will be multiplied
by ``scale_factor``.
:param unscale_grad: Whether do :meth:`unscale` at the same time. Could be
``False`` if needs to accumulate grads.
:param update_scale: Same as :meth:`unscale`'s ``update``. Will be ignored
if ``unscale_grad`` is ``False``.
Args:
gm: The to be wrapped GradManager.
y: Same as GradManager backward's ``y``.
dy: Same as GradManager backward's ``dy``. Will be multiplied
by ``scale_factor``.
unscale_grad: Whether do :meth:`unscale` at the same time. Could be
``False`` if needs to accumulate grads.
update_scale: Same as :meth:`unscale`'s ``update``. Will be ignored
if ``unscale_grad`` is ``False``.
"""
# These checks should be consistent with GradManager's
if y is None:
@@ -121,11 +122,11 @@ class GradScaler:
self.update()

def unscale(self, grad_tensors: Iterable[Tensor]):
r"""
Unscale all ``grad_tensors``'s grad.
r"""Unscale all ``grad_tensors``'s grad.

:param grad_tensors: Tensors needed to unscale grads. Should be all tensors
that are affected by ``target`` tensor in GradManager's backward.
Args:
grad_tensors: Tensors needed to unscale grads. Should be all tensors
that are affected by ``target`` tensor in GradManager's backward.
"""
# use float64 for better precision
inv_scale = Tensor(1.0 / self.scale_factor)
@@ -151,7 +152,8 @@ class GradScaler:

def update(self, new_scale: float = None):
r"""Update the scale factor according to whether encountered overflow grad.
If ``new_scale`` is provided, internal update mechanism will be ignored."""
If ``new_scale`` is provided, internal update mechanism will be ignored.
"""
if self.growth_interval == 0:
return



imperative/python/megengine/autodiff/grad_manager.py (+11, -15)

@@ -32,8 +32,7 @@ _global_priority = 0


class GradManager:
r"""
GradManager computes gradients or more generally, vector-Jacobian product, by reverse mode
r"""GradManager computes gradients or more generally, vector-Jacobian product, by reverse mode
automatic differentiation (a.k.a. back propagation).

Reverse mode autodiff normally reuses many intermediate tensors for best computation efficiency.
@@ -120,7 +119,6 @@ class GradManager:

gm = GradManager()
gm.attach(model.parameters(), callback=dist.make_allreduce_cb("MEAN"))

"""

def __init__(self):
@@ -136,8 +134,7 @@ class GradManager:
return [spec.tensor() for spec in self._attach_specs.values()]

def attach(self, tensors: Iterable[Tensor], callbacks=None):
r"""
Instruct GradManager to track operations on tensors, so that gradients with respect
r"""Instruct GradManager to track operations on tensors, so that gradients with respect
to those tensors could be evaluated later.

:meth:`attach` also accepts a list of callbacks, which will be called with the tensor and
@@ -188,8 +185,9 @@ class GradManager:
multiple uses of a GradManager, which is unrelated to whether resources is timely
released within a single use.

:param tensors: tensor or list of tensors to track
:param callbacks: callback or list of callbacks
Args:
tensors: tensor or list of tensors to track
callbacks: callback or list of callbacks
"""
if callbacks is None:
callbacks = []
@@ -234,8 +232,7 @@ class GradManager:
y: Union[Tensor, List[Tensor]] = None,
dy: Union[Tensor, List[Tensor]] = None,
):
r"""
Compute gradients (or vector-Jacobian product) for all attached tensors, accumulate to
r"""Compute gradients (or vector-Jacobian product) for all attached tensors, accumulate to
corresponding .grad attribute, and release resources along the way.

:meth:`backward` computes the vector-Jacobian product :math:`dx_j = \sum_{i} dy_i J_{ij}`
@@ -257,8 +254,9 @@ class GradManager:
process of this call. When the call successfully finishes, the GradManager will be put back
to an inactive state.

:param y: tensor or list of tensors
:param dy: tensor or list of tensors. Defaults to 1 if y is scalar
Args:
y: tensor or list of tensors
dy: tensor or list of tensors. Defaults to 1 if y is scalar
"""
push_scope("backward")
set_option("record_computing_path", 0)
@@ -310,8 +308,7 @@ class GradManager:
pop_scope("backward")

def record(self):
r"""
Start recording operations
r"""Start recording operations

After this call, you will be able to call :meth:`backward`.
"""
@@ -342,8 +339,7 @@ class GradManager:
self._grad.wrt(tensor, callback=callback)

def release(self):
r"""
Stop recording operations and release resources kept for gradient computation
r"""Stop recording operations and release resources kept for gradient computation

After this call, you will not be able to call :meth:`backward`.
"""


imperative/python/megengine/core/_trace_option.py (+2, -6)

@@ -15,16 +15,12 @@ if os.environ.get("MEGENGINE_USE_SYMBOLIC_SHAPE"):


def use_symbolic_shape() -> bool:
"""
Returns whether tensor.shape returns a tensor instead of a tuple

"""
r"""Returns whether tensor.shape returns a tensor instead of a tuple"""
return _use_symbolic_shape


def set_symbolic_shape(option: bool):
""" Sets whether tensor.shape returns a tensor instead of a tuple
"""
r"""Sets whether tensor.shape returns a tensor instead of a tuple"""
global _use_symbolic_shape
_org = _use_symbolic_shape
_use_symbolic_shape = option
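
A small sketch of what this option toggles; it assumes set_symbolic_shape returns the previous setting (the saved _org above suggests so), and the values in the comments are illustrative:

    import megengine as mge
    from megengine.core._trace_option import set_symbolic_shape, use_symbolic_shape

    x = mge.tensor([[1.0, 2.0], [3.0, 4.0]])
    print(use_symbolic_shape())      # False by default -> x.shape is a plain tuple
    print(x.shape)                   # (2, 2)

    old = set_symbolic_shape(True)   # assumed to return the previous setting
    print(x.shape)                   # now a Tensor holding [2, 2]
    set_symbolic_shape(old)          # restore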


imperative/python/megengine/core/autodiff/grad.py (+39, -50)

@@ -88,67 +88,56 @@ class Grad:


class Function(ops.PyOpBase):
"""
Defines a block of operations with customizable differentiation.

r"""Defines a block of operations with customizable differentiation.
The computation should be defined in ``forward`` method, with gradient
computation defined in ``backward`` method.
Each instance of ``Function`` should be used only once during forwardding.
Examples:

.. code-block::

class Sigmoid(Function):
def forward(self, x):
y = 1 / (1 + F.exp(-x))
self.y = y
return y

def backward(self, dy):
y = self.y
return dy * y * (1-y)

.. code-block::
class Sigmoid(Function):
def forward(self, x):
y = 1 / (1 + F.exp(-x))
self.y = y
return y

def backward(self, dy):
y = self.y
return dy * y * (1-y)
"""

def forward(self, *args, **kwargs):
"""
Applies operations to ``inputs`` and returns results. It must be overriden by all subclasses.

:param input: input tensors.
:return: a tuple of Tensor or a single Tensor.

.. note::

This method should return a tuple of Tensor or a single Tensor representing the output
of the function.

.. note::

positional arguments should all be Tensor

r"""Applies operations to ``inputs`` and returns results. It must be overriden by all subclasses.

Args:
input: input tensors.

Returns:
a tuple of Tensor or a single Tensor.
Note:
* This method should return a tuple of Tensor or a single Tensor representing the output
of the function.
* positional arguments should all be Tensor
"""
raise NotImplementedError

def backward(self, *output_grads):
"""
Compute the gradient of the forward function. It must be overriden by all subclasses.

:param output_grads: gradients of outputs that are returned by :meth:`forward`.

.. note::

In case when some tensors of outputs are not related to loss function, the corresponding
values in ``output_grads`` would be ``None``.

.. note::

This method should return a tuple which containing the gradients of all inputs, in the same order
as the ``inputs`` argument of :meth:`forward` . A ``Tensor`` could be returned
instead if there is only one input. If users want to stop the propagation of some gradients,
the corresponding returned values should be set ``None`` .

r"""Compute the gradient of the forward function. It must be overriden by all subclasses.

Args:
output_grads: gradients of outputs that are returned by :meth:`forward`.
Note:
* In case when some tensors of outputs are not related to loss function, the corresponding
values in ``output_grads`` would be ``None``.
* This method should return a tuple which containing the gradients of all inputs, in the same order
as the ``inputs`` argument of :meth:`forward` . A ``Tensor`` could be returned
instead if there is only one input. If users want to stop the propagation of some gradients,
the corresponding returned values should be set ``None`` .
"""
raise NotImplementedError
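
A usage sketch for the Sigmoid Function from the docstring above, checking the custom backward against the analytic gradient; the import path is the module touched in this diff (the class may also be re-exported elsewhere):

    import megengine as mge
    import megengine.functional as F
    from megengine.autodiff import GradManager
    from megengine.core.autodiff.grad import Function

    class Sigmoid(Function):
        def forward(self, x):
            y = 1 / (1 + F.exp(-x))
            self.y = y
            return y

        def backward(self, dy):
            y = self.y
            return dy * y * (1 - y)

    x = mge.tensor([0.0, 1.0, 2.0])
    gm = GradManager()
    gm.attach([x])
    with gm:
        y = Sigmoid()(x)             # a Function instance is used once per forward
        gm.backward(y.sum())
    print(x.grad.numpy())            # equals y * (1 - y)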



imperative/python/megengine/core/tensor/amp.py (+18, -24)

@@ -12,16 +12,14 @@ _low_prec_dtype = "float16"

@property
def enabled(mod):
r"""
Get or set amp autocast mode enabled or not.

r"""Get or set amp autocast mode enabled or not.
Examples:
.. code-block::

.. code-block::

import megengine as mge
mge.amp.enabled = True

import megengine as mge
mge.amp.enabled = True
"""
return _enabled

@@ -34,17 +32,15 @@ def enabled(mod, enabled: bool):

@property
def high_prec_dtype(mod):
r"""
Get or set amp autocast mode's higher precision dtype. It will change the
r"""Get or set amp autocast mode's higher precision dtype. It will change the
target dtype in tensor casting for better precision. Default: float32.
Examples:
.. code-block::

.. code-block::

import megengine as mge
mge.amp.high_prec_dtype = "float32"

import megengine as mge
mge.amp.high_prec_dtype = "float32"
"""
return _high_prec_dtype

@@ -57,17 +53,15 @@ def high_prec_dtype(mod, dtype: str):

@property
def low_prec_dtype(mod):
r"""
Get or set amp autocast mode's lower precision dtype. It will change the
r"""Get or set amp autocast mode's lower precision dtype. It will change the
target dtype in tensor casting for better speed and memory. Default: float16.
Examples:
.. code-block::

.. code-block::

import megengine as mge
mge.amp.low_prec_dtype = "float16"

import megengine as mge
mge.amp.low_prec_dtype = "float16"
"""
return _low_prec_dtype



imperative/python/megengine/core/tensor/array_method.py (+94, -108)

@@ -389,9 +389,7 @@ class ArrayMethodMixin(abc.ABC):

@property
def ndim(self):
r"""
Returns the number of dimensions of self :class:`~.Tensor`.
"""
r"""Returns the number of dimensions of self :class:`~.Tensor`."""
shape = self._tuple_shape
if shape is None:
raise ValueError("unkown ndim")
@@ -399,8 +397,7 @@ class ArrayMethodMixin(abc.ABC):

@property
def size(self):
r"""
Returns the size of the self :class:`~.Tensor`.
r"""Returns the size of the self :class:`~.Tensor`.
The returned value is a subclass of :class:`tuple`.
"""
shape = self.shape
@@ -410,14 +407,11 @@ class ArrayMethodMixin(abc.ABC):

@property
def T(self):
r"""
alias of :attr:`~.Tensor.transpose`.
"""
r"""alias of :attr:`~.Tensor.transpose`."""
return self.transpose()

def item(self, *args):
r"""
Returns the value of this :class:`~.Tensor` as a standard Python :class:`numbers.Number`.
r"""Returns the value of this :class:`~.Tensor` as a standard Python :class:`numbers.Number`.
This only works for tensors with one element. For other cases, see :meth:`~.tolist`.
"""
if not args:
@@ -427,8 +421,7 @@ class ArrayMethodMixin(abc.ABC):
return self[args].item()

def tolist(self):
r"""
Returns the tensor as a (nested) list.
r"""Returns the tensor as a (nested) list.
For scalars, a standard Python number is returned, just like with :meth:`~.item`.
Tensors are automatically moved to the CPU first if necessary.

@@ -437,16 +430,13 @@ class ArrayMethodMixin(abc.ABC):
return self.numpy().tolist()

def astype(self, dtype):
r"""
Returns a :class:`Tensor` with the same data and number of elements
r"""Returns a :class:`Tensor` with the same data and number of elements
with the specified :attr:`~.Tensor.dtype`.
"""
return astype(self, dtype)

def reshape(self, *args):
r"""
See :func:`~.reshape`.
"""
r"""See :func:`~.reshape`."""
return _reshape(self, _expand_args(args))

# FIXME: remove this method
@@ -454,9 +444,7 @@ class ArrayMethodMixin(abc.ABC):
return _broadcast(self, _expand_args(args))

def transpose(self, *args):
r"""
See :func:`~.transpose`.
"""
r"""See :func:`~.transpose`."""
if self.ndim == 0:
assert (
len(args) == 0
@@ -469,172 +457,170 @@ class ArrayMethodMixin(abc.ABC):
return _transpose(self, _expand_args(args))

def flatten(self):
r"""
See :func:`~.flatten`.
"""
r"""See :func:`~.flatten`."""
return self.reshape(-1)

def sum(self, axis=None, keepdims: bool = False):
r"""
Returns the sum of each row of the input tensor in the given dimension ``axis``.
r"""Returns the sum of each row of the input tensor in the given dimension ``axis``.

If ``axis`` is a list of axises, reduce over all of them.
If ``keepdims`` is ``True``, the shape of output tensor is the same as the input tensor,
except in the dimension(s) ``axis`` where it is of size 1.
Otherwise, ``axis`` is squeezed (see :func:`~.squeeze`).

:param axis: the dimension or dimensions to reduce.
:param keepdims: whether the output tensor has ndim retained or not.
:return: output tensor.
Args:
axis: the dimension or dimensions to reduce.
keepdims: whether the output tensor has ndim retained or not.

Examples:

.. testcode::
Returns:
output tensor.

from megengine import tensor
a = tensor([False, True, True, False])
b = tensor([1.0, 2.0, 3.0, 4.0])
print(a.sum().numpy())
print(b.sum().numpy())
Examples:
.. testcode::

Outputs:
from megengine import tensor
a = tensor([False, True, True, False])
b = tensor([1.0, 2.0, 3.0, 4.0])
print(a.sum().numpy())
print(b.sum().numpy())

.. testoutput::
Outputs:

2
10.0
.. testoutput::

2
10.0
"""
return _reduce("sum")(self, axis, keepdims)

def prod(self, axis=None, keepdims: bool = False):
r"""
Returns the product of each row of the input tensor in the given dimension ``axis``.
r"""Returns the product of each row of the input tensor in the given dimension ``axis``.

If ``axis`` is a list of axises, reduce over all of them.
If ``keepdims`` is ``True``, the shape of output tensor is the same as the input tensor,
except in the dimension(s) ``axis`` where it is of size 1.
Otherwise, ``axis`` is squeezed (see :func:`~.squeeze`).

:param axis: the dimension or dimensions to reduce.
:param keepdims: whether the output tensor has ndim retained or not.
:return: output tensor.

Examples:
Args:
axis: the dimension or dimensions to reduce.
keepdims: whether the output tensor has ndim retained or not.

.. testcode::
Returns:
output tensor.

from megengine import tensor
a = tensor([False, True, True, False])
b = tensor([1.0, 2.0, 3.0, 4.0])
print(a.prod().numpy())
print(b.prod().numpy())
Examples:
.. testcode::

Outputs:
from megengine import tensor
a = tensor([False, True, True, False])
b = tensor([1.0, 2.0, 3.0, 4.0])
print(a.prod().numpy())
print(b.prod().numpy())

.. testoutput::
Outputs:

0
24.0
.. testoutput::

0
24.0
"""
return _reduce("product")(self, axis, keepdims)

def min(self, axis=None, keepdims: bool = False):
r"""
Returns the min value of each row of the input tensor in the given dimension ``axis``.
r"""Returns the min value of each row of the input tensor in the given dimension ``axis``.

If ``axis`` is a list of axises, reduce over all of them.
If ``keepdims`` is ``True``, the shape of output tensor is the same as the input tensor,
except in the dimension(s) ``axis`` where it is of size 1.
Otherwise, ``axis`` is squeezed (see :func:`~.squeeze`).

:param axis: the dimension or dimensions to reduce.
:param keepdims: whether the output tensor has ndim retained or not.
:return: output tensor.

Examples:
Args:
axis: the dimension or dimensions to reduce.
keepdims: whether the output tensor has ndim retained or not.

.. testcode::
Returns:
output tensor.

from megengine import tensor
a = tensor([False, True, True, False])
b = tensor([1.0, 2.0, 3.0, 4.0])
print(a.min().numpy())
print(b.min().numpy())
Examples:
.. testcode::

Outputs:
from megengine import tensor
a = tensor([False, True, True, False])
b = tensor([1.0, 2.0, 3.0, 4.0])
print(a.min().numpy())
print(b.min().numpy())

.. testoutput::
Outputs:

False
1.0
.. testoutput::

False
1.0
"""
return _reduce("min")(self, axis, keepdims)

def max(self, axis=None, keepdims: bool = False):
r"""
Returns the max value of each row of the input tensor in the given dimension ``axis``.
r"""Returns the max value of each row of the input tensor in the given dimension ``axis``.

If ``axis`` is a list of axises, reduce over all of them.
If ``keepdims`` is ``True``, the shape of output tensor is the same as the input tensor,
except in the dimension(s) ``axis`` where it is of size 1.
Otherwise, ``axis`` is squeezed (see :func:`~.squeeze`).

:param axis: the dimension or dimensions to reduce.
:param keepdims: whether the output tensor has ndim retained or not.
:return: output tensor.

Examples:
Args:
axis: the dimension or dimensions to reduce.
keepdims: whether the output tensor has ndim retained or not.

.. testcode::
Returns:
output tensor.

from megengine import tensor
a = tensor([False, True, True, False])
b = tensor([1.0, 2.0, 3.0, 4.0])
print(a.max().numpy())
print(b.max().numpy())
Examples:
.. testcode::

Outputs:
from megengine import tensor
a = tensor([False, True, True, False])
b = tensor([1.0, 2.0, 3.0, 4.0])
print(a.max().numpy())
print(b.max().numpy())

.. testoutput::
Outputs:

True
4.0
.. testoutput::

True
4.0
"""
return _reduce("max")(self, axis, keepdims)

def mean(self, axis=None, keepdims: bool = False):
r"""
Returns the mean value of each row of the input tensor in the given dimension ``axis``.
r"""Returns the mean value of each row of the input tensor in the given dimension ``axis``.

If ``axis`` is a list of axises, reduce over all of them.
If ``keepdims`` is ``True``, the shape of output tensor is the same as the input tensor,
except in the dimension(s) ``axis`` where it is of size 1.
Otherwise, ``axis`` is squeezed (see :func:`~.squeeze`).

:param axis: the dimension or dimensions to reduce.
:param keepdims: whether the output tensor has ndim retained or not.
:return: output tensor.
Args:
axis: the dimension or dimensions to reduce.
keepdims: whether the output tensor has ndim retained or not.

Examples:
Returns:
output tensor.

.. testcode::

from megengine import tensor
a = tensor([False, True, True, False])
b = tensor([1.0, 2.0, 3.0, 4.0])
print(a.mean().numpy())
print(b.mean().numpy())
Examples:
.. testcode::

Outputs:
from megengine import tensor
a = tensor([False, True, True, False])
b = tensor([1.0, 2.0, 3.0, 4.0])
print(a.mean().numpy())
print(b.mean().numpy())

.. testoutput::
Outputs:

0.5
2.5
.. testoutput::

0.5
2.5
"""
return _reduce("mean")(self, axis, keepdims)


imperative/python/megengine/core/tensor/dtype.py (+56, -61)

@@ -47,17 +47,17 @@ class QuantDtypeMeta(
["name", "cname", "np_dtype_str", "qmin", "qmax", "is_unsigned"],
)
):
r"""
Store metadata for quantize dtype. Could be used to create custom quant dtype
r"""Store metadata for quantize dtype. Could be used to create custom quant dtype
for QAT when the network don't need to be converted for inference, but only
to export network metadata for third-party platform inference.

:param name: a unique name string.
:param cname: used in :func:`~.create_quantized_dtype` for model dump and inference.
:param np_dtype_str: used in :func:`~.create_quantized_dtype` to generate ``np.dtype``.
:param qmin: a int number indicating quant dtype's lowerbound.
:param qmax: a int number indicating quant dtype's upperbound.
:param is_unsigned: a helper value that could be inference from np_dtype_str.
Args:
name: a unique name string.
cname: used in :func:`~.create_quantized_dtype` for model dump and inference.
np_dtype_str: used in :func:`~.create_quantized_dtype` to generate ``np.dtype``.
qmin: a int number indicating quant dtype's lowerbound.
qmax: a int number indicating quant dtype's upperbound.
is_unsigned: a helper value that could be inference from np_dtype_str.
"""

def __new__(
@@ -77,7 +77,7 @@ class QuantDtypeMeta(
return self

def __deepcopy__(self, _):
"""
r"""
Ignore deepcopy so that a dtype meta can be treated as singleton, for more
strict check in :meth:`~.FakeQuantize.fake_quant_forward`.
"""
@@ -113,17 +113,17 @@ def _check_zero_point(zp: int, dtype_meta: QuantDtypeMeta):
def create_quantized_dtype(
dtype_meta: QuantDtypeMeta, scale: float, zp: Union[int, None]
):
r"""
Get quantized dtype with metadata attribute according to _metadata_dict.

r"""Get quantized dtype with metadata attribute according to _metadata_dict.
Note that unsigned dtype must have ``zero_point`` and signed dtype must
not have ``zero_point``, to be consitent with tensor generated by calling
compiled function from `CompGraph.compile(inputs, outspec)`.

:param dtype_meta: a QuantDtypeMeta indicating which dtype to return. the
``cname`` attribute cannot be ``None``.
:param scale: a number for scale to store in dtype's metadata
:param zp: a number for zero_point to store in dtype's metadata
Args:
dtype_meta: a QuantDtypeMeta indicating which dtype to return. the
``cname`` attribute cannot be ``None``.
scale: a number for scale to store in dtype's metadata
zp: a number for zero_point to store in dtype's metadata
"""
if dtype_meta.cname is None:
raise ValueError("dtype {} without cname attr is not supported.")
@@ -152,8 +152,7 @@ def create_quantized_dtype(


def quint8(scale, zero_point):
"""
Consturct a quantized unsigned int8 data type with ``scale`` (float) and
r"""Consturct a quantized unsigned int8 data type with ``scale`` (float) and
``zero_point`` (uint8). The real value represented by a quint8 data type is
float_val = scale * (uint8_val - zero_point)
"""
@@ -161,24 +160,21 @@ def quint8(scale, zero_point):


def qint8(scale):
"""
Construct a quantized int8 data type with ``scale`` (float). The real value
r"""Construct a quantized int8 data type with ``scale`` (float). The real value
represented by a qint8 data type is float_val = scale * int8_val
"""
return create_quantized_dtype(_builtin_quant_dtypes["qint8"], scale, None)


def qint32(scale):
"""
Construct a quantized int32 data type with ``scale`` (float). The real value
r"""Construct a quantized int32 data type with ``scale`` (float). The real value
represented by a qint32 data type is float_val = scale * int32_val
"""
return create_quantized_dtype(_builtin_quant_dtypes["qint32"], scale, None)


def quint4(scale, zero_point):
"""
Consturct a quantized unsigned int4 data type with ``scale`` (float) and
r"""Consturct a quantized unsigned int4 data type with ``scale`` (float) and
``zero_point`` (uint8). The real value represented by a quint4 data type is
float_val = scale * (uint4_val - zero_point)
"""
@@ -186,8 +182,7 @@ def quint4(scale, zero_point):


def qint4(scale):
"""
Construct a quantized int4 data type with ``scale`` (float). The real value
r"""Construct a quantized int4 data type with ``scale`` (float). The real value
represented by a qint4 data type is float_val = scale * int4_val
"""
return create_quantized_dtype(_builtin_quant_dtypes["qint4"], scale, None)
@@ -244,95 +239,95 @@ def _convert_from_quantized_dtype(arr: np.ndarray, dtype_meta: QuantDtypeMeta):


def convert_to_quint8(arr: np.ndarray, q: np.dtype):
"""
Quantize a float NumPy ndarray into a quint8 one with specified params.
r"""Quantize a float NumPy ndarray into a quint8 one with specified params.

:param arr: Input ndarray.
:param q: Target data type, should be a quint8.
Args:
arr: Input ndarray.
q: Target data type, should be a quint8.
"""
return _convert_to_quantized_dtype(arr, q, _builtin_quant_dtypes["quint8"])


def convert_from_quint8(arr: np.ndarray):
"""
Dequantize a quint8 NumPy ndarray into a float one.
r"""Dequantize a quint8 NumPy ndarray into a float one.

:param arr: Input ndarray.
Args:
arr: Input ndarray.
"""
return _convert_from_quantized_dtype(arr, _builtin_quant_dtypes["quint8"])


def convert_to_qint8(arr: np.ndarray, q: np.dtype):
"""
Quantize a float NumPy ndarray into a qint8 one with specified params.
r"""Quantize a float NumPy ndarray into a qint8 one with specified params.

:param arr: Input ndarray.
:param q: Target data type, should be a qint8.
Args:
arr: Input ndarray.
q: Target data type, should be a qint8.
"""
return _convert_to_quantized_dtype(arr, q, _builtin_quant_dtypes["qint8"])


def convert_from_qint8(arr: np.ndarray):
"""
Dequantize a qint8 NumPy ndarray into a float one.
r"""Dequantize a qint8 NumPy ndarray into a float one.

:param arr: Input ndarray.
Args:
arr: Input ndarray.
"""
return _convert_from_quantized_dtype(arr, _builtin_quant_dtypes["qint8"])


def convert_to_qint32(arr: np.ndarray, q: np.dtype):
"""
Quantize a float NumPy ndarray into a qint32 one with specified params.
r"""Quantize a float NumPy ndarray into a qint32 one with specified params.

:param arr: Input ndarray.
:param q: Target data type, should be a qint8.
Args:
arr: Input ndarray.
q: Target data type, should be a qint8.
"""
return _convert_to_quantized_dtype(arr, q, _builtin_quant_dtypes["qint32"])


def convert_from_qint32(arr):
"""
Dequantize a qint32 NumPy ndarray into a float one.
r"""Dequantize a qint32 NumPy ndarray into a float one.

:param arr: Input ndarray.
Args:
arr: Input ndarray.
"""
return _convert_from_quantized_dtype(arr, _builtin_quant_dtypes["qint32"])


def convert_to_quint4(arr: np.ndarray, q: np.dtype):
"""
Quantize a float NumPy ndarray into a quint4 one with specified params.
r"""Quantize a float NumPy ndarray into a quint4 one with specified params.

:param arr: Input ndarray.
:param q: Target data type, should be a quint4.
Args:
arr: Input ndarray.
q: Target data type, should be a quint4.
"""
return _convert_to_quantized_dtype(arr, q, _builtin_quant_dtypes["quint4"])


def convert_from_quint4(arr: np.ndarray):
"""
Dequantize a quint4 NumPy ndarray into a float one.
r"""Dequantize a quint4 NumPy ndarray into a float one.

:param arr: Input ndarray.
Args:
arr: Input ndarray.
"""
return _convert_from_quantized_dtype(arr, _builtin_quant_dtypes["quint4"])


def convert_to_qint4(arr: np.ndarray, q: np.dtype):
"""
Quantize a float NumPy ndarray into a qint4 one with specified params.
r"""Quantize a float NumPy ndarray into a qint4 one with specified params.

:param arr: Input ndarray.
:param q: Target data type, should be a qint4.
Args:
arr: Input ndarray.
q: Target data type, should be a qint4.
"""
return _convert_to_quantized_dtype(arr, q, _builtin_quant_dtypes["qint4"])


def convert_from_qint4(arr: np.ndarray):
"""
Dequantize a qint4 NumPy ndarray into a float one.
r"""Dequantize a qint4 NumPy ndarray into a float one.

:param arr: Input ndarray.
Args:
arr: Input ndarray.
"""
return _convert_from_quantized_dtype(arr, _builtin_quant_dtypes["qint4"])
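
A brief round-trip sketch using the helpers above; the scale and array values are illustrative only:

    import numpy as np
    from megengine.core.tensor.dtype import qint8, convert_to_qint8, convert_from_qint8

    dt = qint8(0.05)                          # qint8 dtype with scale = 0.05
    arr = np.array([-0.3, 0.0, 0.25], dtype=np.float32)
    q = convert_to_qint8(arr, dt)             # int8 values ~ round(arr / 0.05)
    back = convert_from_qint8(q)              # float values = scale * int8_val
    print(q, back)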

imperative/python/megengine/core/tensor/megbrain_graph.py (+100, -100)

@@ -24,11 +24,11 @@ from .core import TensorBase


def set_priority_to_id(dest_vars):
"""
For all oprs in the subgraph constructed by dest_vars,
r"""For all oprs in the subgraph constructed by dest_vars,
sets its priority to id if its original priority is zero.
:param dest_vars: target vars representing the graph.

Args:
dest_vars: target vars representing the graph.
"""
dest_vec = []
for i in dest_vars:
@@ -220,54 +220,50 @@ class OpNode:


def optimize_for_inference(dest_vars, **kwargs):
r"""
Applies optimize_for_inference pass for computing graph.

:param dest_vars: list of output vars in the computing graph

:Keyword Arguments:

* enable_io16xc32 --
whether to use float16 for I/O between oprs and use
float32 as internal computation precision. Note the output var would be
changed to float16.
* enable_ioc16 --
whether to use float16 for both I/O and computation
precision.

* enable_hwcd4 --
whether to use NHWCD4 data layout. This is faster on some
OpenCL backend.
* enable_nchw88 --
whether to use NCHW88 data layout, currently
used in X86 AVX backend.
* enable_nchw44 --
whether to use NCHW44 data layout, currently
used in arm backend.
* enable_nchw44_dot --
whether to use NCHW44_dot data layout, currently
used in armv8.2+dotprod backend.
* enable_nchw4 --
whether to use NCHW4 data layout, currently
used in nvidia backend(based on cudnn).
* enable_nchw32 --
whether to use NCHW32 data layout, currently
used in nvidia backend with tensorcore(based on cudnn).
* enable_chwn4 --
whether to use CHWN4 data layout, currently
used in nvidia backend with tensorcore.
* enable_nchw64 --
whether to use NCHW64 data layout, used for fast int4
support on Nvidia GPU.

* enable_fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearty
into one opr.
* enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z
input for inference on nvidia backend(this optimization pass will
result in mismatch of the precision of output of training and
inference)
* enable_fuse_preprocess: whether to fuse astype\pad channel\dimshuffle and
etc opr from h2d opr.
r"""Applies optimize_for_inference pass for computing graph.

Args:
dest_vars: list of output vars in the computing graph

Keyword Arguments:

* enable_io16xc32 --
whether to use float16 for I/O between oprs and use
float32 as internal computation precision. Note the output var would be
changed to float16.
* enable_ioc16 --
whether to use float16 for both I/O and computation
precision.
* enable_hwcd4 --
whether to use NHWCD4 data layout. This is faster on some
OpenCL backend.
* enable_nchw88 --
whether to use NCHW88 data layout, currently
used in X86 AVX backend.
* enable_nchw44 --
whether to use NCHW44 data layout, currently
used in arm backend.
* enable_nchw44_dot --
whether to use NCHW44_dot data layout, currently
used in armv8.2+dotprod backend.
* enable_nchw4 --
whether to use NCHW4 data layout, currently
used in nvidia backend(based on cudnn).
* enable_nchw32 --
whether to use NCHW32 data layout, currently
used in nvidia backend with tensorcore(based on cudnn).
* enable_chwn4 --
whether to use CHWN4 data layout, currently
used in nvidia backend with tensorcore.
* enable_nchw64 --
whether to use NCHW64 data layout, used for fast int4
support on Nvidia GPU.
* enable_fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearty
into one opr.
* enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z
input for inference on nvidia backend(this optimization pass will
result in mismatch of the precision of output of training and
inference)
"""
inference_options = GraphOptimizeOptions()
inference_optimize_layout_transform_map = {
@@ -305,11 +301,13 @@ def optimize_for_inference(dest_vars, **kwargs):


def deserialize_infer_option(x: int) -> Dict[str, bool]:
r"""
Deserailize optimize options generated by ``imperative_rt.GraphOptimizeOptions``.
r"""Deserailize optimize options generated by ``imperative_rt.GraphOptimizeOptions``.

:param x: inference options represented by int.
:return: inference options represented by dict.
Args:
x: inference options represented by int.

Returns:
inference options represented by dict.
"""

inference_options = GraphOptimizeOptions.deserialize(x)
@@ -346,13 +344,12 @@ def deserialize_infer_option(x: int) -> Dict[str, bool]:


def modify_opr_algo_strategy_inplace(dest_vars, strategy: str):
"""
C++ graph version of :func:`~.set_execution_strategy`. Used to inplacely modify
r"""C++ graph version of :func:`~.set_execution_strategy`. Used to inplacely modify
dumped graph's fast-run strategy.

:param dest_vars: list of output vars in the computing graph.
:param strategy: fast-run algorithms strategy.
Args:
dest_vars: list of output vars in the computing graph.
strategy: fast-run algorithms strategy.
"""
dest_vars = _unwrap(dest_vars)
_imperative_rt.modify_opr_algo_strategy_inplace(dest_vars, strategy)
@@ -383,39 +380,40 @@ def dump_graph(
append_json=False,
metadata=None
) -> Tuple[bytes, CompGraphDumpResult]:
"""
serialize the computing graph of `output_vars` and get byte result.

:param output_vars: output variables which are the graph's end point.

.. note::

The underlying C++ API only accepts a var list. If a dict is given,
the vars would be renamed to the given names.

:param keep_var_name: level for keeping variable names:

* 0: none of the names are kept
* 1: (default)keep names of output vars
* 2: keep names of all (output and internal) vars
:param keep_opr_name: whether to keep operator names.
:param keep_param_name: whether to keep param names, so param values can be
easily manipulated after loading model
:param keep_opr_priority: whether to keep priority setting for operators
:param strip_info_file: a string for path or a file handler. if is not None,
then the dump information for code strip would be written to ``strip_info_file``
:param append_json: will be check when `strip_info_file` is not None. if set
true, the information for code strip will be append to strip_info_file.
if set false, will rewrite strip_info_file
:return: dump result as byte string, and an instance of namedtuple
r"""serialize the computing graph of `output_vars` and get byte result.

Args:
output_vars: output variables which are the graph's end point.
keep_var_name: level for keeping variable names:

* 0: none of the names are kept
* 1: (default)keep names of output vars
* 2: keep names of all (output and internal) vars

keep_opr_name: whether to keep operator names.
keep_param_name: whether to keep param names, so param values can be
easily manipulated after loading model
keep_opr_priority: whether to keep priority setting for operators
strip_info_file: a string for path or a file handler. if is not None,
then the dump information for code strip would be written to ``strip_info_file``
append_json: will be check when `strip_info_file` is not None. if set
true, the information for code strip will be append to strip_info_file.
if set false, will rewrite strip_info_file

Note:
The underlying C++ API only accepts a var list. If a dict is given,
the vars would be renamed to the given names.

Returns:
dump result as byte string, and an instance of namedtuple
:class:`CompGraphDumpResult`, whose fields are:

* ``nr_opr`` number of operators dumped
* ``tot_bytes`` total bytes for the whole graph
* ``tensor_value_bytes`` bytes consumed for dumping tensor values
* ``inputs`` names of input tensors
* ``params`` list of names of dumped params
* ``outputs`` names of output vars
* ``nr_opr`` number of operators dumped
* ``tot_bytes`` total bytes for the whole graph
* ``tensor_value_bytes`` bytes consumed for dumping tensor values
* ``inputs`` names of input tensors
* ``params`` list of names of dumped params
* ``outputs`` names of output vars
"""
if isinstance(output_vars, dict):
used_vars = set()
@@ -483,17 +481,19 @@ CompGraphLoadResult = collections.namedtuple(


def load_graph(fpath) -> CompGraphLoadResult:
"""
Load a serialized computing graph from file.
r"""Load a serialized computing graph from file.

Args:
fpath: Path or Handle of the input file

:param fpath: Path or Handle of the input file
:return: An instance of namedtuple :class:`CompGraphLoadResult`,
Returns:
An instance of namedtuple :class:`CompGraphLoadResult`,
whose fields are:

* ``graph`` loaded CompGraph
* ``output_vars_dict`` A Python dict, mapping name to output SymbolVar
* ``output_vars_list`` A Python list, containing output vars in the
order passed to serialize_comp_graph_to_file
* ``graph`` loaded CompGraph
* ``output_vars_dict`` A Python dict, mapping name to output SymbolVar
* ``output_vars_list`` A Python list, containing output vars in the
order passed to serialize_comp_graph_to_file
"""
output_vars_map = []
output_vars_list = []
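
In practice dump_graph/load_graph are usually reached through the higher-level megengine.jit.trace API; a hedged sketch of that path (the file name, the toy function and the chosen trace flags are assumptions for illustration, not part of this diff):

    import numpy as np
    import megengine as mge
    import megengine.functional as F
    from megengine.jit import trace
    from megengine.core.tensor.megbrain_graph import load_graph

    @trace(symbolic=True, capture_as_const=True)
    def pred(x):
        return F.relu(x)

    x = mge.tensor(np.random.randn(1, 3).astype("float32"))
    pred(x)                                    # run once so the graph is captured
    pred.dump("relu.mge", arg_names=["data"])  # serialization goes through dump_graph

    graph, var_dict, var_list = load_graph("relu.mge")   # CompGraphLoadResult fields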


imperative/python/megengine/core/tensor/utils.py (+7, -7)

@@ -24,12 +24,12 @@ _enable_convert_inputs = True


def get_convert_inputs():
""" get the curerent state of `_enable_convert_inputs` """
r"""get the curerent state of `_enable_convert_inputs`"""
return _enable_convert_inputs


def set_convert_inputs(flag):
""" This function is a temporary workaround for reducing the overhead of operator
r"""This function is a temporary workaround for reducing the overhead of operator
invocations. The function `convert_inputs` is disabled if the global state
`_enable_convert_inputs` is set to `False`, otherwise enabled. This function is for
internal use only, and should be removed when the tensor-like system is refactored.
@@ -137,11 +137,11 @@ def setscalar(x):


def astensor1d(x, *reference, dtype=None, device=None):
"""
Convert something to 1D tensor. Support following types
* sequence of scalar literal / tensor
* numpy array
* tensor (returned as is, regardless of dtype and device)
"""Convert something to 1D tensor. Support following types
* sequence of scalar literal / tensor
* numpy array
* tensor (returned as is, regardless of dtype and device)
"""
try:
ndim = x.ndim
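
A few illustrative calls for astensor1d; the reference tensor supplies dtype/device when the input is a plain sequence or numpy array:

    import numpy as np
    import megengine as mge
    from megengine.core.tensor.utils import astensor1d

    ref = mge.tensor([0.0])                        # reference for dtype/device
    a = astensor1d((2, 3), ref, dtype="int32")     # sequence of scalars -> 1D tensor
    b = astensor1d(np.array([4, 5, 6]), ref)       # numpy array -> 1D tensor
    c = astensor1d(mge.tensor([7, 8]), ref)        # tensor is returned as is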


imperative/python/megengine/data/collator.py (+1, -6)

@@ -33,16 +33,11 @@ default_collate_err_msg_format = (


class Collator:
r"""
Used for merging a list of samples to form a mini-batch of Tensor(s). Used when using batched loading from a dataset.
r"""Used for merging a list of samples to form a mini-batch of Tensor(s). Used when using batched loading from a dataset.
Modified from https://github.com/pytorch/pytorch/blob/master/torch/utils/data/_utils/collate.py
"""

def apply(self, inputs):
"""
:param inputs: sequence_N(tuple(CHW, C, CK)).
:return: tuple(NCHW, NC, NCK).
"""
elem = inputs[0]
elem_type = type(elem)
if (


imperative/python/megengine/data/dataloader.py (+18, -18)

@@ -44,28 +44,28 @@ def raise_timeout_error():

class DataLoader:
r"""Provides a convenient way to iterate on a given dataset.
DataLoader combines a dataset with
:class:`~.Sampler`, :class:`~.Transform` and :class:`~.Collator`,
make it flexible to get minibatch continually from a dataset.

:param dataset: dataset from which to load the minibatch.
:param sampler: defines the strategy to sample data from the dataset.
:param transform: defined the transforming strategy for a sampled batch.
Default: None
:param collator: defined the merging strategy for a transformed batch.
Default: None
:param num_workers: the number of sub-process to load, transform and collate
the batch. ``0`` means using single-process. Default: 0
:param timeout: if positive, means the timeout value(second) for collecting a
batch from workers. Default: 0
:param timeout_event: callback function triggered by timeout, default to raise
runtime error.
:param divide: define the paralleling strategy in multi-processing mode.
``True`` means one batch is divided into :attr:`num_workers` pieces, and
the workers will process these pieces parallelly. ``False`` means
different sub-process will process different batch. Default: False
Args:
dataset: dataset from which to load the minibatch.
sampler: defines the strategy to sample data from the dataset.
transform: defined the transforming strategy for a sampled batch.
Default: None
collator: defined the merging strategy for a transformed batch.
Default: None
num_workers: the number of sub-process to load, transform and collate
the batch. ``0`` means using single-process. Default: 0
timeout: if positive, means the timeout value(second) for collecting a
batch from workers. Default: 0
timeout_event: callback function triggered by timeout, default to raise
runtime error.
divide: define the paralleling strategy in multi-processing mode.
``True`` means one batch is divided into :attr:`num_workers` pieces, and
the workers will process these pieces parallelly. ``False`` means
different sub-process will process different batch. Default: False
"""
__initialized = False
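
A minimal pipeline wiring the pieces named above (dataset, sampler, transform, with the collator left at its default); the random arrays are placeholders:

    import numpy as np
    from megengine.data import DataLoader, RandomSampler
    from megengine.data.dataset import ArrayDataset
    from megengine.data.transform import ToMode

    images = np.random.rand(100, 32, 32, 3).astype("float32")
    labels = np.random.randint(0, 10, size=(100,)).astype("int32")
    dataset = ArrayDataset(images, labels)

    dataloader = DataLoader(
        dataset,
        sampler=RandomSampler(dataset, batch_size=16, drop_last=True),
        transform=ToMode("CHW"),      # HWC -> CHW on the image field
        num_workers=0,                # single-process loading
    )
    for batch_images, batch_labels in dataloader:
        pass                          # (16, 3, 32, 32) and (16,) per iteration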



imperative/python/megengine/data/dataset/meta_dataset.py (+4, -7)

@@ -11,8 +11,7 @@ from typing import Tuple


class Dataset(ABC):
r"""
An abstract base class for all datasets.
r"""An abstract base class for all datasets.

__getitem__ and __len__ method are aditionally needed.
"""
@@ -31,8 +30,7 @@ class Dataset(ABC):


class StreamDataset(Dataset):
r"""
An abstract class for stream data.
r"""An abstract class for stream data.

__iter__ method is aditionally needed.
"""
@@ -53,10 +51,9 @@ class StreamDataset(Dataset):


class ArrayDataset(Dataset):
r"""
ArrayDataset is a dataset for numpy array data.
r"""ArrayDataset is a dataset for numpy array data.

One or more numpy arrays are needed to initiate the dataset.
One or more numpy arrays are needed to initiate the dataset.
And the dimensions represented sample number are expected to be the same.
"""



imperative/python/megengine/data/dataset/vision/cifar.py (+2, -4)

@@ -21,8 +21,7 @@ logger = get_logger(__name__)


class CIFAR10(VisionDataset):
r""" :class:`~.Dataset` for CIFAR10 meta data.
"""
r""":class:`~.Dataset` for CIFAR10 meta data."""

url_path = "http://www.cs.utoronto.ca/~kriz/"
raw_file_name = "cifar-10-python.tar.gz"
@@ -138,8 +137,7 @@ class CIFAR10(VisionDataset):


class CIFAR100(CIFAR10):
r""" :class:`~.Dataset` for CIFAR100 meta data.
"""
r""":class:`~.Dataset` for CIFAR100 meta data."""

url_path = "http://www.cs.utoronto.ca/~kriz/"
raw_file_name = "cifar-100-python.tar.gz"


imperative/python/megengine/data/dataset/vision/cityscapes.py (+1, -3)

@@ -23,9 +23,7 @@ from .meta_vision import VisionDataset


class Cityscapes(VisionDataset):
r"""
`Cityscapes <http://www.cityscapes-dataset.com/>`_ Dataset.
"""
r"""`Cityscapes <http://www.cityscapes-dataset.com/>`_ Dataset."""

supported_order = (
"image",


imperative/python/megengine/data/dataset/vision/coco.py (+1, -3)

@@ -46,9 +46,7 @@ def has_valid_annotation(anno, order):


class COCO(VisionDataset):
r"""
`MS COCO <http://cocodataset.org/#home>`_ Dataset.
"""
r"""`MS COCO <http://cocodataset.org/#home>`_ Dataset."""

supported_order = (
"image",


imperative/python/megengine/data/dataset/vision/folder.py (+12, -13)

@@ -26,22 +26,21 @@ from .utils import is_img


class ImageFolder(VisionDataset):
r"""
ImageFolder is a class for loading image data and labels from a organized folder.

r"""ImageFolder is a class for loading image data and labels from a organized folder.
The folder is expected to be organized as followed: root/cls/xxx.img_ext
Labels are indices of sorted classes in the root directory.

:param root: root directory of an image folder.
:param loader: a function used to load image from path,
if ``None``, default function that loads
images with PIL will be called.
:param check_valid_func: a function used to check if files in folder are
expected image files, if ``None``, default function
that checks file extensions will be called.
:param class_name: if ``True``, return class name instead of class index.
Args:
root: root directory of an image folder.
loader: a function used to load image from path,
if ``None``, default function that loads
images with PIL will be called.
check_valid_func: a function used to check if files in folder are
expected image files, if ``None``, default function
that checks file extensions will be called.
class_name: if ``True``, return class name instead of class index.
"""

def __init__(self, root: str, check_valid_func=None, class_name: bool = False):
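
Given the root/cls/xxx.img_ext layout described above, a hedged usage sketch (the directory path is a placeholder, and the import assumes ImageFolder is re-exported from megengine.data.dataset):

    from megengine.data.dataset import ImageFolder

    # /data/pets/
    # |- cat/0001.jpg ...
    # |- dog/0001.jpg ...
    dataset = ImageFolder("/data/pets", class_name=True)
    image, label = dataset[0]      # decoded image array and its class name, e.g. "cat"
    print(len(dataset), label)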


imperative/python/megengine/data/dataset/vision/imagenet.py (+20, -23)

@@ -30,11 +30,10 @@ logger = get_logger(__name__)


class ImageNet(ImageFolder):
r"""
Load ImageNet from raw files or folder. Expected folder looks like:

.. code-block:: bash

r"""Load ImageNet from raw files or folder. Expected folder looks like:
.. code-block:: shell
${root}/
| [REQUIRED TAR FILES]
|- ILSVRC2012_img_train.tar
@@ -45,22 +44,8 @@ class ImageNet(ImageFolder):
|- val/cls/xxx.${img_ext}
|- ILSVRC2012_devkit_t12/data/meta.mat
|- ILSVRC2012_devkit_t12/data/ILSVRC2012_validation_ground_truth.txt
If the image folders don't exist, raw tar files are required to get extracted and processed.
"""

raw_file_meta = {
"train": ("ILSVRC2012_img_train.tar", "1d675b47d978889d74fa0da5fadfb00e"),
"val": ("ILSVRC2012_img_val.tar", "29b22e2961454d5413ddabcf34fc5622"),
"devkit": ("ILSVRC2012_devkit_t12.tar.gz", "fa75699e90414af021442c21a62c3abf"),
} # ImageNet raw files
default_train_dir = "train"
default_val_dir = "val"
default_devkit_dir = "ILSVRC2012_devkit_t12"

def __init__(self, root: str = None, train: bool = True, **kwargs):
r"""
Initialization:

* if ``root`` contains ``self.target_folder`` depending on ``train``:

@@ -77,10 +62,22 @@ class ImageNet(ImageFolder):

* raise error.

:param root: root directory of imagenet data, if root is ``None``, use default_dataset_root.
:param train: if ``True``, load the train split, otherwise load the validation split.
"""
Args:
root: root directory of imagenet data, if root is ``None``, use default_dataset_root.
train: if ``True``, load the train split, otherwise load the validation split.

"""

raw_file_meta = {
"train": ("ILSVRC2012_img_train.tar", "1d675b47d978889d74fa0da5fadfb00e"),
"val": ("ILSVRC2012_img_val.tar", "29b22e2961454d5413ddabcf34fc5622"),
"devkit": ("ILSVRC2012_devkit_t12.tar.gz", "fa75699e90414af021442c21a62c3abf"),
} # ImageNet raw files
default_train_dir = "train"
default_val_dir = "val"
default_devkit_dir = "ILSVRC2012_devkit_t12"

def __init__(self, root: str = None, train: bool = True, **kwargs):
# process the root path
if root is None:
self.root = self._default_root
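A short, hedged sketch of the intended call pattern (the ``/data/imagenet`` root is hypothetical and must already contain the raw tar files or the extracted folders described above):

.. code-block:: python

    from megengine.data.dataset import ImageNet

    train_set = ImageNet("/data/imagenet", train=True)
    val_set = ImageNet("/data/imagenet", train=False)
    print(len(train_set), len(val_set))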


+ 1
- 2
imperative/python/megengine/data/dataset/vision/mnist.py View File

@@ -22,8 +22,7 @@ logger = get_logger(__name__)


class MNIST(VisionDataset):
r""" :class:`~.Dataset` for MNIST meta data.
"""
r""":class:`~.Dataset` for MNIST meta data."""

url_path = "http://yann.lecun.com/exdb/mnist/"
"""


+ 1
- 3
imperative/python/megengine/data/dataset/vision/objects365.py View File

@@ -23,9 +23,7 @@ from .meta_vision import VisionDataset


class Objects365(VisionDataset):
r"""
`Objects365 <https://www.objects365.org/overview.html>`_ Dataset.
"""
r"""`Objects365 <https://www.objects365.org/overview.html>`_ Dataset."""

supported_order = (
"image",


+ 1
- 3
imperative/python/megengine/data/dataset/vision/voc.py View File

@@ -24,9 +24,7 @@ from .meta_vision import VisionDataset


class PascalVOC(VisionDataset):
r"""
`Pascal VOC <http://host.robots.ox.ac.uk/pascal/VOC/>`_ Dataset.
"""
r"""`Pascal VOC <http://host.robots.ox.ac.uk/pascal/VOC/>`_ Dataset."""

supported_order = (
"image",


+ 59
- 71
imperative/python/megengine/data/sampler.py View File

@@ -17,9 +17,7 @@ import megengine.distributed as dist


class Sampler(ABC):
r"""
An abstract base class for all Sampler
"""
r"""An abstract base class for all Sampler"""

@abstractmethod
def __init__(self):
@@ -27,19 +25,19 @@ class Sampler(ABC):


class MapSampler(Sampler):
r"""
Sampler for map dataset.
:param dataset: dataset to sample from.
:param batch_size: batch size for batch method.
:param drop_last: set ``True`` to drop the last incomplete batch,
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch_size, then the last batch will
be smaller. Default: False
:param num_samples: number of samples assigned to one rank.
:param world_size: number of ranks.
:param rank: rank id, non-negative interger within 0 and ``world_size``.
:param seed: seed for random operators.
r"""Sampler for map dataset.
Args:
dataset: dataset to sample from.
batch_size: batch size for batch method.
drop_last: set ``True`` to drop the last incomplete batch,
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch_size, then the last batch will
be smaller. Default: False
num_samples: number of samples assigned to one rank.
world_size: number of ranks.
rank: rank id, a non-negative integer within 0 and ``world_size``.
seed: seed for random operators.
"""

def __init__(
@@ -106,14 +104,11 @@ class MapSampler(Sampler):
return int(math.ceil(self.num_samples / self.batch_size))

def sample(self):
"""
Return a list contains all sample indices.
"""
r"""Return a list contains all sample indices."""
raise NotImplementedError

def scatter(self, indices) -> List:
r"""
Scatter method is used for splitting indices into subset, each subset
r"""Scatter method is used for splitting indices into subset, each subset
will be assigned to a rank. Indices are evenly splitted by default.
If customized indices assignment method is needed, please rewrite this method.
"""
@@ -130,9 +125,7 @@ class MapSampler(Sampler):
return indices

def batch(self) -> Iterator[List[Any]]:
r"""
Batch method provides a batch indices generator.
"""
r"""Batch method provides a batch indices generator."""
indices = list(self.sample())

# user might pass the world_size parameter without dist,
@@ -150,18 +143,15 @@ class MapSampler(Sampler):


class StreamSampler(Sampler):
r"""
Sampler for stream dataset.

.. warning::
r"""Sampler for stream dataset.

Warning:
In the case of multiple machines, the sampler should ensure that each worker gets
different data. But this class cannot do that yet; please build your own
dataset and sampler to achieve this goal.

Usually, :meth:`~.StreamDataset.__iter__` can return different iterator by
``rank = dist.get_rank()``. So that they will get different data.

"""

def __init__(self, batch_size=1):
@@ -175,18 +165,18 @@ class StreamSampler(Sampler):


class SequentialSampler(MapSampler):
r"""
Sample elements sequentially.
:param dataset: dataset to sample from.
:param batch_size: batch size for batch method.
:param drop_last: set ``True`` to drop the last incomplete batch,
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch_size, then the last batch will
be smaller. Default: False
:param indices: indice of samples.
:param world_size: number of ranks.
:param rank: rank id, non-negative interger within 0 and ``world_size``.
r"""Sample elements sequentially.
Args:
dataset: dataset to sample from.
batch_size: batch size for batch method.
drop_last: set ``True`` to drop the last incomplete batch,
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch_size, then the last batch will
be smaller. Default: False
indices: indices of samples.
world_size: number of ranks.
rank: rank id, a non-negative integer within 0 and ``world_size``.
"""

def __init__(
@@ -207,9 +197,7 @@ class SequentialSampler(MapSampler):
self.indices = indices

def sample(self) -> Iterator[Any]:
r"""
Return a generator.
"""
r"""Return a generator."""
if self.indices is None:
return iter(range(len(self.dataset)))
else:
@@ -217,19 +205,19 @@ class SequentialSampler(MapSampler):


class RandomSampler(MapSampler):
r"""
Sample elements randomly without replacement.
:param dataset: dataset to sample from.
:param batch_size: batch size for batch method.
:param drop_last: set ``True`` to drop the last incomplete batch,
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch_size, then the last batch will
be smaller. Default: False
:param indices: indice of samples.
:param world_size: number of ranks.
:param rank: rank id, non-negative interger within 0 and ``world_size``.
:param seed: seed for random operators.
r"""Sample elements randomly without replacement.
Args:
dataset: dataset to sample from.
batch_size: batch size for batch method.
drop_last: set ``True`` to drop the last incomplete batch,
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch_size, then the last batch will
be smaller. Default: False
indices: indices of samples.
world_size: number of ranks.
rank: rank id, a non-negative integer within 0 and ``world_size``.
seed: seed for random operators.
"""

def __init__(
@@ -258,20 +246,20 @@ class RandomSampler(MapSampler):


class ReplacementSampler(MapSampler):
r"""
Sample elements randomly with replacement.
:param dataset: dataset to sample from.
:param batch_size: batch size for batch method.
:param drop_last: set ``True`` to drop the last incomplete batch,
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch_size, then the last batch will
be smaller. Default: False
:param num_samples: number of samples assigned to one rank.
:param weights: weights for sampling indices, it could be unnormalized weights.
:param world_size: number of ranks.
:param rank: rank id, non-negative interger within 0 and ``world_size``.
:param seed: seed for random operators.
r"""Sample elements randomly with replacement.
Args:
dataset: dataset to sample from.
batch_size: batch size for batch method.
drop_last: set ``True`` to drop the last incomplete batch,
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch_size, then the last batch will
be smaller. Default: False
num_samples: number of samples assigned to one rank.
weights: weights for sampling indices, it could be unnormalized weights.
world_size: number of ranks.
rank: rank id, a non-negative integer within 0 and ``world_size``.
seed: seed for random operators.
"""

def __init__(


+ 3
- 5
imperative/python/megengine/data/tools/_queue.py View File

@@ -59,15 +59,13 @@ class _PlasmaStoreManager:

class PlasmaShmQueue:
def __init__(self, maxsize: int = 0):
r"""
Use pyarrow in-memory plasma store to implement shared memory queue.

r"""Use pyarrow in-memory plasma store to implement shared memory queue.
Compared to the native `multiprocessing.Queue`, `PlasmaShmQueue` avoids pickle/unpickle
and communication overhead, leading to better performance in multi-process
applications.

:type maxsize: int
:param maxsize: maximum size of the queue, `None` means no limit. (default: ``None``)
Args:
maxsize: maximum size of the queue; ``0`` means no limit. (default: ``0``)
"""

# Lazy start the plasma store manager


+ 1
- 3
imperative/python/megengine/data/transform/meta_transform.py View File

@@ -11,9 +11,7 @@ from typing import Sequence, Tuple


class Transform(ABC):
"""
Rewrite apply method in subclass.
"""
r"""Rewrite apply method in subclass."""

def apply_batch(self, inputs: Sequence[Tuple]):
return tuple(self.apply(input) for input in inputs)


+ 45
- 38
imperative/python/megengine/data/transform/vision/functional.py View File

@@ -15,7 +15,7 @@ import numpy as np


def wrap_keepdims(func):
"""Wraper to keep the dimension of input images unchanged."""
r"""Wraper to keep the dimension of input images unchanged."""

@functools.wraps(func)
def wrapper(image, *args, **kwargs):
@@ -33,41 +33,47 @@ def wrap_keepdims(func):

@wrap_keepdims
def to_gray(image):
r"""
Change BGR format image's color space to gray.
r"""Change BGR format image's color space to gray.

:param image: input BGR format image, with `(H, W, C)` shape.
:return: gray format image, with `(H, W, C)` shape.
Args:
image: input BGR format image, with `(H, W, C)` shape.

Returns:
gray format image, with `(H, W, C)` shape.
"""
return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)


@wrap_keepdims
def to_bgr(image):
r"""
Change gray format image's color space to BGR.
r"""Change gray format image's color space to BGR.

Args:
image: input Gray format image, with `(H, W, C)` shape.

:param image: input Gray format image, with `(H, W, C)` shape.
:return: BGR format image, with `(H, W, C)` shape.
Returns:
BGR format image, with `(H, W, C)` shape.
"""
return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)


@wrap_keepdims
def pad(input, size, value):
r"""
Pad input data with *value* and given *size*.

:param input: input data, with `(H, W, C)` shape.
:param size: padding size of input data, it could be integer or sequence.
If it is an integer, the input data will be padded in four directions.
If it is a sequence contains two integer, the bottom and right side
of input data will be padded.
If it is a sequence contains four integer, the top, bottom, left, right
side of input data will be padded with given size.
:param value: padding value of data, could be a sequence of int or float.
If it is float value, the dtype of image will be casted to float32 also.
:return: padded image.
r"""Pad input data with *value* and given *size*.

Args:
input: input data, with `(H, W, C)` shape.
size: padding size of input data, it could be an integer or a sequence.
If it is an integer, the input data will be padded in four directions.
If it is a sequence containing two integers, the bottom and right sides
of the input data will be padded.
If it is a sequence containing four integers, the top, bottom, left and right
sides of the input data will be padded with the given sizes.
value: padding value of data, could be a sequence of int or float.
If it is a float value, the dtype of the image will also be cast to float32.

Returns:
padded image.
"""
if isinstance(size, int):
size = (size, size, size, size)
@@ -80,32 +86,33 @@ def pad(input, size, value):

@wrap_keepdims
def flip(image, flipCode):
r"""
Accordding to the flipCode (the type of flip), flip the input image.

:param image: input image, with `(H, W, C)` shape.
:param flipCode: code that indicates the type of flip.
r"""Accordding to the flipCode (the type of flip), flip the input image.

* 1 : Flip horizontally
Args:
image: input image, with `(H, W, C)` shape.
flipCode: code that indicates the type of flip.

* 0 : Flip vertically
* 1 : Flip horizontally
* 0 : Flip vertically
* -1: Flip horizontally and vertically

* -1: Flip horizontally and vertically

:return: BGR format image, with `(H, W, C)` shape.
Returns:
flipped image, with `(H, W, C)` shape.
"""
return cv2.flip(image, flipCode=flipCode)


@wrap_keepdims
def resize(input, size, interpolation=cv2.INTER_LINEAR):
r"""
Resize the input data to given size.
r"""Resize the input data to given size.

Args:
input: input data, could be image or masks, with `(H, W, C)` shape.
size: target size of input data, with (height, width) shape.
interpolation: interpolation method.

:param input: input data, could be image or masks, with `(H, W, C)` shape.
:param size: target size of input data, with (height, width) shape.
:param interpolation: interpolation method.
:return: resized data, with `(H, W, C)` shape.
Returns:
resized data, with `(H, W, C)` shape.
"""
if len(size) != 2:
raise ValueError("resize needs (h, w), but got {}".format(size))


+ 161
- 165
imperative/python/megengine/data/transform/vision/transform.py View File

@@ -42,36 +42,36 @@ __all__ = [


class VisionTransform(Transform):
r"""
Base class of all transforms used in computer vision.
r"""Base class of all transforms used in computer vision.
Calling logic: apply_batch() -> apply() -> _apply_image() and other _apply_*()
methods. If you want to implement a self-defined transform method for images,
override the _apply_image method in a subclass.

:param order: input type order. Input is a tuple containing different structures,
order is used to specify the order of structures. For example, if your input
is (image, boxes) type, then the ``order`` should be ("image", "boxes").
Current available strings and data type are describe below:

* "image": input image, with shape of `(H, W, C)`.
* "coords": coordinates, with shape of `(N, 2)`.
* "boxes": bounding boxes, with shape of `(N, 4)`, "xyxy" format,
the 1st "xy" represents top left point of a box,
the 2nd "xy" represents right bottom point.
* "mask": map used for segmentation, with shape of `(H, W, 1)`.
* "keypoints": keypoints with shape of `(N, K, 3)`, N for number of instances,
and K for number of keypoints in one instance. The first two dimensions
of last axis is coordinate of keypoints and the the 3rd dimension is
the label of keypoints.
* "polygons": a sequence containing numpy arrays, its length is the number of instances.
Each numpy array represents polygon coordinate of one instance.
* "category": categories for some data type. For example, "image_category"
means category of the input image and "boxes_category" means categories of
bounding boxes.
* "info": information for images such as image shapes and image path.

You can also customize your data types only if you implement the corresponding
_apply_*() methods, otherwise ``NotImplementedError`` will be raised.
Args:
order: input type order. Input is a tuple containing different structures,
order is used to specify the order of structures. For example, if your input
is (image, boxes) type, then the ``order`` should be ("image", "boxes").
Currently available strings and data types are described below:
* "image": input image, with shape of `(H, W, C)`.
* "coords": coordinates, with shape of `(N, 2)`.
* "boxes": bounding boxes, with shape of `(N, 4)`, "xyxy" format,
the 1st "xy" represents top left point of a box,
the 2nd "xy" represents right bottom point.
* "mask": map used for segmentation, with shape of `(H, W, 1)`.
* "keypoints": keypoints with shape of `(N, K, 3)`, N for number of instances,
and K for number of keypoints in one instance. The first two dimensions
of last axis is coordinate of keypoints and the the 3rd dimension is
the label of keypoints.
* "polygons": a sequence containing numpy arrays, its length is the number of instances.
Each numpy array represents the polygon coordinates of one instance.
* "category": categories for some data type. For example, "image_category"
means category of the input image and "boxes_category" means categories of
bounding boxes.
* "info": information for images such as image shapes and image path.
You can also customize your data types only if you implement the corresponding
_apply_*() methods, otherwise ``NotImplementedError`` will be raised.
"""

def __init__(self, order=None):
@@ -154,13 +154,13 @@ class VisionTransform(Transform):


class ToMode(VisionTransform):
r"""
Change input data to a target mode.
r"""Change input data to a target mode.
For example, most transforms use HWC mode image,
while the neural network might use CHW mode input tensor.

:param mode: output mode of input. Default: "CHW"
:param order: the same with :class:`VisionTransform`
Args:
mode: output mode of input. Default: "CHW"
order: the same with :class:`VisionTransform`
"""

def __init__(self, mode="CHW", *, order=None):
@@ -183,32 +183,31 @@ class ToMode(VisionTransform):


class Compose(VisionTransform):
r"""
Composes several transforms together.
:param transforms: list of :class:`VisionTransform` to compose.
:param batch_compose: whether use shuffle_indices for batch data or not.
If True, use original input sequence.
Otherwise, the shuffle_indices will be used for transforms.
:param shuffle_indices: indices used for random shuffle, start at 1.
For example, if shuffle_indices is [(1, 3), (2, 4)], then the 1st and 3rd transform
will be random shuffled, the 2nd and 4th transform will also be shuffled.
:param order: the same with :class:`VisionTransform`
r"""Composes several transforms together.
Args:
transforms: list of :class:`VisionTransform` to compose.
batch_compose: whether to use shuffle_indices for batch data or not.
If True, use original input sequence.
Otherwise, the shuffle_indices will be used for transforms.
shuffle_indices: indices used for random shuffle, start at 1.
For example, if shuffle_indices is [(1, 3), (2, 4)], then the 1st and 3rd transforms
will be randomly shuffled, and the 2nd and 4th transforms will also be shuffled.
order: the same with :class:`VisionTransform`
Examples:

.. testcode::

from megengine.data.transform import RandomHorizontalFlip, RandomVerticalFlip, CenterCrop, ToMode, Compose

transform_func = Compose([
RandomHorizontalFlip(),
RandomVerticalFlip(),
CenterCrop(100),
ToMode("CHW"),
],
shuffle_indices=[(1, 2, 3)]
)
.. testcode::
from megengine.data.transform import RandomHorizontalFlip, RandomVerticalFlip, CenterCrop, ToMode, Compose
transform_func = Compose([
RandomHorizontalFlip(),
RandomVerticalFlip(),
CenterCrop(100),
ToMode("CHW"),
],
shuffle_indices=[(1, 2, 3)]
)
"""

def __init__(
@@ -260,13 +259,13 @@ class Compose(VisionTransform):


class TorchTransformCompose(VisionTransform):
r"""
Compose class used for transforms in torchvision, only support PIL image,
r"""Compose class used for transforms in torchvision, only support PIL image,
some transforms with tensor in torchvision are not supported,
such as Normalize and ToTensor in torchvision.

:param transforms: the same with ``Compose``.
:param order: the same with :class:`VisionTransform`.
Args:
transforms: the same with ``Compose``.
order: the same with :class:`VisionTransform`.
"""

def __init__(self, transforms, *, order=None):
@@ -302,19 +301,19 @@ class TorchTransformCompose(VisionTransform):


class Pad(VisionTransform):
r"""
Pad the input data.
:param size: padding size of input image, it could be integer or sequence.
If it is an integer, the input image will be padded in four directions.
If it is a sequence containing two integers, the bottom and right side
of image will be padded.
If it is a sequence containing four integers, the top, bottom, left, right
side of image will be padded with given size.
:param value: padding value of image, could be a sequence of int or float.
if it is float value, the dtype of image will be casted to float32 also.
:param mask_value: padding value of segmentation map.
:param order: the same with :class:`VisionTransform`.
r"""Pad the input data.
Args:
size: padding size of input image, it could be an integer or a sequence.
If it is an integer, the input image will be padded in four directions.
If it is a sequence containing two integers, the bottom and right side
of image will be padded.
If it is a sequence containing four integers, the top, bottom, left, right
side of image will be padded with given size.
value: padding value of image, could be a sequence of int or float.
If it is a float value, the dtype of the image will also be cast to float32.
mask_value: padding value of segmentation map.
order: the same with :class:`VisionTransform`.
"""

def __init__(self, size=0, value=0, mask_value=0, *, order=None):
@@ -350,18 +349,18 @@ class Pad(VisionTransform):


class Resize(VisionTransform):
r"""
Resize the input data.
:param output_size: target size of image, with (height, width) shape.
:param interpolation: interpolation method. All methods are listed below:
* cv2.INTER_NEAREST – a nearest-neighbor interpolation.
* cv2.INTER_LINEAR – a bilinear interpolation (used by default).
* cv2.INTER_AREA – resampling using pixel area relation.
* cv2.INTER_CUBIC – a bicubic interpolation over 4×4 pixel neighborhood.
* cv2.INTER_LANCZOS4 – a Lanczos interpolation over 8×8 pixel neighborhood.
:param order: the same with :class:`VisionTransform`.
r"""Resize the input data.
Args:
output_size: target size of image, with (height, width) shape.
interpolation: interpolation method. All methods are listed below:
* cv2.INTER_NEAREST – a nearest-neighbor interpolation.
* cv2.INTER_LINEAR – a bilinear interpolation (used by default).
* cv2.INTER_AREA – resampling using pixel area relation.
* cv2.INTER_CUBIC – a bicubic interpolation over 4×4 pixel neighborhood.
* cv2.INTER_LANCZOS4 – a Lanczos interpolation over 8×8 pixel neighborhood.
order: the same with :class:`VisionTransform`.
"""

def __init__(self, output_size, interpolation=cv2.INTER_LINEAR, *, order=None):
@@ -410,9 +409,7 @@ class Resize(VisionTransform):


class ShortestEdgeResize(VisionTransform):
r"""
Resize the input data with specified shortset edge.
"""
r"""Resize the input data with specified shortset edge."""

def __init__(
self,
@@ -481,11 +478,11 @@ class ShortestEdgeResize(VisionTransform):


class RandomResize(VisionTransform):
r"""
Resize the input data randomly.
r"""Resize the input data randomly.

:param scale_range: range of scaling.
:param order: the same with :class:`VisionTransform`.
Args:
scale_range: range of scaling.
order: the same with :class:`VisionTransform`.
"""

def __init__(self, scale_range, interpolation=cv2.INTER_LINEAR, *, order=None):
@@ -526,15 +523,15 @@ class RandomResize(VisionTransform):


class RandomCrop(VisionTransform):
r"""
Crop the input data randomly. Before applying the crop transform,
r"""Crop the input data randomly. Before applying the crop transform,
pad the image first. If the target size is still bigger than the size of the
padded image, pad the image to the target size.

:param output_size: target size of output image, with (height, width) shape.
:param padding_size: the same with `size` in ``Pad``.
:param padding_value: the same with `value` in ``Pad``.
:param order: the same with :class:`VisionTransform`.
Args:
output_size: target size of output image, with (height, width) shape.
padding_size: the same with `size` in ``Pad``.
padding_value: the same with `value` in ``Pad``.
order: the same with :class:`VisionTransform`.
"""

def __init__(
@@ -584,16 +581,16 @@ class RandomCrop(VisionTransform):


class RandomResizedCrop(VisionTransform):
r"""
Crop the input data to random size and aspect ratio.
r"""Crop the input data to random size and aspect ratio.
A crop of random size (default: of 0.08 to 1.0) of the original size and a random
aspect ratio (default: of 3/4 to 1.33) of the original aspect ratio is made.
After applying the crop transform, the input data will be resized to the given size.

:param output_size: target size of output image, with (height, width) shape.
:param scale_range: range of size of the origin size cropped. Default: (0.08, 1.0)
:param ratio_range: range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33)
:param order: the same with :class:`VisionTransform`.
Args:
output_size: target size of output image, with (height, width) shape.
scale_range: range of size of the origin size cropped. Default: (0.08, 1.0)
ratio_range: range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33)
order: the same with :class:`VisionTransform`.
"""

def __init__(
@@ -674,11 +671,11 @@ class RandomResizedCrop(VisionTransform):


class CenterCrop(VisionTransform):
r"""
Crops the given the input data at the center.
r"""Crops the given the input data at the center.

:param output_size: target size of output image, with (height, width) shape.
:param order: the same with :class:`VisionTransform`.
Args:
output_size: target size of output image, with (height, width) shape.
order: the same with :class:`VisionTransform`.
"""

def __init__(self, output_size, *, order=None):
@@ -718,11 +715,11 @@ class CenterCrop(VisionTransform):


class RandomHorizontalFlip(VisionTransform):
r"""
Horizontally flip the input data randomly with a given probability.
r"""Horizontally flip the input data randomly with a given probability.

:param p: probability of the input data being flipped. Default: 0.5
:param order: the same with :class:`VisionTransform`.
Args:
prob: probability of the input data being flipped. Default: 0.5
order: the same with :class:`VisionTransform`.
"""

def __init__(self, prob: float = 0.5, *, order=None):
@@ -751,11 +748,11 @@ class RandomHorizontalFlip(VisionTransform):


class RandomVerticalFlip(VisionTransform):
r"""
Vertically flip the input data randomly with a given probability.
r"""Vertically flip the input data randomly with a given probability.

:param p: probability of the input data being flipped. Default: 0.5
:param order: the same with :class:`VisionTransform`.
Args:
prob: probability of the input data being flipped. Default: 0.5
order: the same with :class:`VisionTransform`.
"""

def __init__(self, prob: float = 0.5, *, order=None):
@@ -784,15 +781,15 @@ class RandomVerticalFlip(VisionTransform):


class Normalize(VisionTransform):
r"""
Normalize the input data with mean and standard deviation.
r"""Normalize the input data with mean and standard deviation.
Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels,
this transform will normalize each channel of the input data.
``output[channel] = (input[channel] - mean[channel]) / std[channel]``

:param mean: sequence of means for each channel.
:param std: sequence of standard deviations for each channel.
:param order: the same with :class:`VisionTransform`.
Args:
mean: sequence of means for each channel.
std: sequence of standard deviations for each channel.
order: the same with :class:`VisionTransform`.
"""

def __init__(self, mean=0.0, std=1.0, *, order=None):
@@ -811,13 +808,13 @@ class Normalize(VisionTransform):


class GaussianNoise(VisionTransform):
r"""
Add random gaussian noise to the input data.
r"""Add random gaussian noise to the input data.
Gaussian noise is generated with given mean and std.

:param mean: Gaussian mean used to generate noise.
:param std: Gaussian standard deviation used to generate noise.
:param order: the same with :class:`VisionTransform`
Args:
mean: Gaussian mean used to generate noise.
std: Gaussian standard deviation used to generate noise.
order: the same with :class:`VisionTransform`
"""

def __init__(self, mean=0.0, std=1.0, *, order=None):
@@ -839,12 +836,12 @@ class GaussianNoise(VisionTransform):


class BrightnessTransform(VisionTransform):
r"""
Adjust brightness of the input data.
r"""Adjust brightness of the input data.

:param value: how much to adjust the brightness. Can be any
non negative number. 0 gives the original image.
:param order: the same with :class:`VisionTransform`.
Args:
value: how much to adjust the brightness. Can be any
non negative number. 0 gives the original image.
order: the same with :class:`VisionTransform`.
"""

def __init__(self, value, *, order=None):
@@ -871,12 +868,12 @@ class BrightnessTransform(VisionTransform):


class ContrastTransform(VisionTransform):
r"""
Adjust contrast of the input data.
r"""Adjust contrast of the input data.

:param value: how much to adjust the contrast. Can be any
non negative number. 0 gives the original image.
:param order: the same with :class:`VisionTransform`.
Args:
value: how much to adjust the contrast. Can be any
non negative number. 0 gives the original image.
order: the same with :class:`VisionTransform`.
"""

def __init__(self, value, *, order=None):
@@ -903,12 +900,12 @@ class ContrastTransform(VisionTransform):


class SaturationTransform(VisionTransform):
r"""
Adjust saturation of the input data.
r"""Adjust saturation of the input data.

:param value: how much to adjust the saturation. Can be any
non negative number. 0 gives the original image.
:param order: the same with :class:`VisionTransform`.
Args:
value: how much to adjust the saturation. Can be any
non negative number. 0 gives the original image.
order: the same with :class:`VisionTransform`.
"""

def __init__(self, value, *, order=None):
@@ -935,12 +932,12 @@ class SaturationTransform(VisionTransform):


class HueTransform(VisionTransform):
r"""
Adjust hue of the input data.
r"""Adjust hue of the input data.

:param value: how much to adjust the hue. Can be any number
between 0 and 0.5, 0 gives the original image.
:param order: the same with :class:`VisionTransform`.
Args:
value: how much to adjust the hue. Can be any number
between 0 and 0.5, 0 gives the original image.
order: the same with :class:`VisionTransform`.
"""

def __init__(self, value, *, order=None):
@@ -974,22 +971,22 @@ class HueTransform(VisionTransform):


class ColorJitter(VisionTransform):
r"""
Randomly change the brightness, contrast, saturation and hue of an image.
:param brightness: how much to jitter brightness.
Chosen uniformly from [max(0, 1 - brightness), 1 + brightness]
or the given [min, max]. Should be non negative numbers.
:param contrast: how much to jitter contrast.
Chosen uniformly from [max(0, 1 - contrast), 1 + contrast]
or the given [min, max]. Should be non negative numbers.
:param saturation: how much to jitter saturation.
Chosen uniformly from [max(0, 1 - saturation), 1 + saturation]
or the given [min, max]. Should be non negative numbers.
:param hue: how much to jitter hue.
Chosen uniformly from [-hue, hue] or the given [min, max].
Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5.
:param order: the same with :class:`VisionTransform`.
r"""Randomly change the brightness, contrast, saturation and hue of an image.
Args:
brightness: how much to jitter brightness.
Chosen uniformly from [max(0, 1 - brightness), 1 + brightness]
or the given [min, max]. Should be non negative numbers.
contrast: how much to jitter contrast.
Chosen uniformly from [max(0, 1 - contrast), 1 + contrast]
or the given [min, max]. Should be non negative numbers.
saturation: how much to jitter saturation.
Chosen uniformly from [max(0, 1 - saturation), 1 + saturation]
or the given [min, max]. Should be non negative numbers.
hue: how much to jitter hue.
Chosen uniformly from [-hue, hue] or the given [min, max].
Should have 0 <= hue <= 0.5 or -0.5 <= min <= max <= 0.5.
order: the same with :class:`VisionTransform`.
"""

def __init__(self, brightness=0, contrast=0, saturation=0, hue=0, *, order=None):
@@ -1014,11 +1011,10 @@ class ColorJitter(VisionTransform):


class Lighting(VisionTransform):
r"""
Apply AlexNet-Style "lighting" augmentation to input data.

r"""Apply AlexNet-Style "lighting" augmentation to input data.
Input images are assumed to have 'RGB' channel order.
The degree of color jittering is randomly sampled via a normal distribution,
with standard deviation given by the scale parameter.
"""


+ 35
- 47
imperative/python/megengine/device.py View File

@@ -54,10 +54,10 @@ _device_type_set = {"cpu", "gpu", "xpu", "rocm"}


def get_device_count(device_type: str) -> int:
"""
Gets number of devices installed on this system.
r"""Gets number of devices installed on this system.

:param device_type: device type, one of 'gpu' or 'cpu'
Args:
device_type: device type, one of 'gpu' or 'cpu'
"""
assert device_type in _device_type_set, "device must be one of {}".format(
_device_type_set
@@ -67,73 +67,59 @@ def get_device_count(device_type: str) -> int:


def is_cuda_available() -> bool:
"""
Returns whether cuda device is available on this system.

"""
r"""Returns whether cuda device is available on this system."""
t = _str2device_type("gpu")
return CompNode._get_device_count(t, False) > 0


def is_cambricon_available() -> bool:
"""
Returns whether cambricon device is available on this system.

"""
r"""Returns whether cambricon device is available on this system."""
t = _str2device_type("cambricon")
return CompNode._get_device_count(t, False) > 0


def is_atlas_available() -> bool:
"""
Returns whether atlas device is available on this system.

"""
r"""Returns whether atlas device is available on this system."""
t = _str2device_type("atlas")
return CompNode._get_device_count(t, False) > 0


def is_rocm_available() -> bool:
"""Returns whether rocm device is available on this system.

"""
r"""Returns whether rocm device is available on this system."""
t = _str2device_type("rocm")
return CompNode._get_device_count(t, False) > 0


def set_default_device(device: str = "xpux"):
r"""
Sets default computing node.

:param device: default device type. The type can be 'cpu0', 'cpu1', etc.,
or 'gpu0', 'gpu1', etc., to specify the particular cpu or gpu to use.
'cpux' and 'gpux' can also be used to specify any number of cpu or gpu devices.

'multithread' device type is avaliable when inference, which implements
multi-threading parallelism at the operator level. For example,
'multithread4' will compute with 4 threads.

The default value is 'xpux' to specify any device available. The priority of using gpu is higher when both gpu and cpu are available.

It can also be set by environment variable `MGE_DEFAULT_DEVICE`.
r"""Sets default computing node.

Args:
device: default device type.

Note:
* The type can be 'cpu0', 'cpu1', etc., or 'gpu0', 'gpu1', etc.,
to specify the particular CPU or GPU to use.
* 'cpux' and 'gpux' can also be used to specify any number of CPU or GPU devices.
* The default value is 'xpux' to specify any device available.
* The priority of using GPU is higher when both GPU and CPU are available.
* 'multithread' device type is available for inference,
which implements multi-threading parallelism at the operator level.
For example, 'multithread4' will compute with 4 threads.
* It can also be set by environment variable ``MGE_DEFAULT_DEVICE``.
"""
assert _valid_device(device), "Invalid device name {}".format(device)
CompNode._set_default_device(device)
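A small sketch of combining these helpers (device names follow the conventions in the note above):

.. code-block:: python

    import megengine as mge
    from megengine.device import get_default_device, get_device_count, set_default_device

    # fall back to CPU when no GPU is installed
    set_default_device("gpu0" if get_device_count("gpu") > 0 else "cpu0")

    x = mge.tensor([1.0, 2.0, 3.0])   # created on the default computing node
    print(get_default_device(), x.device)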


def get_default_device() -> str:
r"""
Gets default computing node.

r"""Gets default computing node.
It returns the value set by :func:`~.set_default_device`.
"""
return CompNode._get_default_device()


def get_mem_status_bytes(device: Optional[str] = None):
r"""
Get total and free memory on the computing device in bytes.
"""
r"""Get total and free memory on the computing device in bytes."""
if device is None:
device = get_default_device()
tot, free = CompNode(device).get_mem_status_bytes
@@ -150,15 +136,17 @@ def set_prealloc_config(
growth_factor=2.0,
device_type=DeviceType.CUDA,
):
"""
Specifies how to pre-allocate from raw device allocator.

:param alignment: specifies the alignment in bytes.
:param min_req: min request size in bytes.
:param max_overhead: max overhead above required size in bytes.
:param growth_factor: `request size / cur allocated`
:param device_type: the device type

r"""Specifies how to pre-allocate from raw device allocator.

Args:
alignment: specifies the alignment in bytes.
min_req: min request size in bytes.
max_overhead: max overhead above required size in bytes.
growth_factor: `request size / cur allocated`
device_type: the device type
"""
assert alignment > 0
assert min_req > 0
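For illustration, a hedged sketch of calling the pre-allocation hook; the byte counts are arbitrary examples, not recommended values:

.. code-block:: python

    from megengine.device import set_prealloc_config

    set_prealloc_config(
        alignment=2 ** 20,           # 1 MiB alignment
        min_req=64 * 2 ** 20,        # never request less than 64 MiB
        max_overhead=32 * 2 ** 20,   # tolerate up to 32 MiB above the required size
        growth_factor=2.0,           # request size / currently allocated
    )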


+ 4
- 6
imperative/python/megengine/distributed/__init__.py View File

@@ -31,17 +31,15 @@ from .server import Client, Server

@mproperty
def backend(mod):
r"""
Get or set backend of collective communication.
r"""Get or set backend of collective communication.
Available backends are ['nccl', 'shm', 'rccl']

Examples:

.. code-block::

import megengine.distributed as dist
dist.backend = "nccl"
.. code-block::

import megengine.distributed as dist
dist.backend = "nccl"
"""
assert group._sd, "please call init_process_group first"
return group._sd.backend


+ 136
- 167
imperative/python/megengine/distributed/functional.py View File

@@ -50,7 +50,7 @@ def _backend():


def collective_comm(inp, mode, group, device):
"""Helper function for applying collective communication functions."""
r"""Helper function for applying collective communication functions."""
assert isinstance(group, Group)
if group is None:
return inp
@@ -158,8 +158,7 @@ class _ReduceSum(Function):
def reduce_sum(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None,
) -> Tensor:
r"""
Reduce tensor data across the specified group by sum.
r"""Reduce tensor data across the specified group by sum.
Only root process will receive the final result.

Args:
@@ -176,22 +175,20 @@ def reduce_sum(
Reduced tensor if in root process, None in other processes.

Examples:

.. code-block::

input = Tensor([rank])
# Rank 0 # input: Tensor([0])
# Rank 1 # input: Tensor([1])
output = reduce_sum(input)
# Rank 0 # output: Tensor([1])
# Rank 1 # output: None

input = Tensor([rank])
group = Group([1, 0]) # first rank is root
output = reduce_sum(input, group)
# Rank 0 # output: None
# Rank 1 # output: Tensor([1])

.. code-block::

input = Tensor([rank])
# Rank 0 # input: Tensor([0])
# Rank 1 # input: Tensor([1])
output = reduce_sum(input)
# Rank 0 # output: Tensor([1])
# Rank 1 # output: None

input = Tensor([rank])
group = Group([1, 0]) # first rank is root
output = reduce_sum(input, group)
# Rank 0 # output: None
# Rank 1 # output: Tensor([1])
"""
op = _ReduceSum(group, device)
(out,) = apply(op, inp)
@@ -222,8 +219,7 @@ class _Broadcast(Function):
def broadcast(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None,
) -> Tensor:
r"""
Broadcast tensor data from root process to others.
r"""Broadcast tensor data from root process to others.

Args:
inp: Input tensor.
@@ -240,21 +236,20 @@ def broadcast(

Examples:

.. code-block::

input = Tensor([rank])
# Rank 0 # input: Tensor([0])
# Rank 1 # input: Tensor([1])
output = broadcast(input)
# Rank 0 # output: Tensor([0])
# Rank 1 # output: Tensor([0])
.. code-block::

input = Tensor([rank])
group = Group([1, 0]) # first rank is root
output = broadcast(input, group)
# Rank 0 # output: Tensor([1])
# Rank 1 # output: Tensor([1])
input = Tensor([rank])
# Rank 0 # input: Tensor([0])
# Rank 1 # input: Tensor([1])
output = broadcast(input)
# Rank 0 # output: Tensor([0])
# Rank 1 # output: Tensor([0])

input = Tensor([rank])
group = Group([1, 0]) # first rank is root
output = broadcast(input, group)
# Rank 0 # output: Tensor([1])
# Rank 1 # output: Tensor([1])
"""
shape, dtype = _bcast_shape_dtype(group, inp)
if group.rank != 0:
@@ -278,8 +273,7 @@ def _bcast_param(
def all_gather(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None, axis=0,
) -> Tensor:
r"""
Gather tensors across the specified group and concat them at first dimension.
r"""Gather tensors across the specified group and concat them at first dimension.

Args:
inp: Input tensor.
@@ -298,21 +292,20 @@ def all_gather(

Examples:

.. code-block::

input = Tensor([rank])
# Rank 0 # input: Tensor([0])
# Rank 1 # input: Tensor([1])
output = all_gather(input)
# Rank 0 # output: Tensor([0 1])
# Rank 1 # output: Tensor([0 1])
.. code-block::

input = Tensor([rank])
group = Group([1, 0])
output = all_gather(input, group)
# Rank 0 # output: Tensor([1 0])
# Rank 1 # output: Tensor([1 0])
input = Tensor([rank])
# Rank 0 # input: Tensor([0])
# Rank 1 # input: Tensor([1])
output = all_gather(input)
# Rank 0 # output: Tensor([0 1])
# Rank 1 # output: Tensor([0 1])

input = Tensor([rank])
group = Group([1, 0])
output = all_gather(input, group)
# Rank 0 # output: Tensor([1 0])
# Rank 1 # output: Tensor([1 0])
"""
mode = CollectiveComm.Mode.ALL_GATHER
out = collective_comm(inp, mode, group, device)
@@ -338,8 +331,7 @@ def all_gather(
def reduce_scatter_sum(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None, axis=0
) -> Tensor:
r"""
Reduce tensors across the specified group by sum and split them at first dimension.
r"""Reduce tensors across the specified group by sum and split them at first dimension.

Args:
inp: Input tensor.
@@ -358,21 +350,20 @@ def reduce_scatter_sum(

Examples:

.. code-block::

input = Tensor([0 1])
# Rank 0 # input: Tensor([0 1])
# Rank 1 # input: Tensor([0 1])
output = reduce_scatter_sum(input)
# Rank 0 # output: Tensor([0])
# Rank 1 # output: Tensor([2])
.. code-block::

input = Tensor([0 1])
group = Group([1, 0])
output = reduce_scatter_sum(input, group)
# Rank 0 # output: Tensor([2])
# Rank 1 # output: Tensor([0])
input = Tensor([0 1])
# Rank 0 # input: Tensor([0 1])
# Rank 1 # input: Tensor([0 1])
output = reduce_scatter_sum(input)
# Rank 0 # output: Tensor([0])
# Rank 1 # output: Tensor([2])

input = Tensor([0 1])
group = Group([1, 0])
output = reduce_scatter_sum(input, group)
# Rank 0 # output: Tensor([2])
# Rank 1 # output: Tensor([0])
"""
group_size = group.size if group is not None else 1
assert (
@@ -398,8 +389,7 @@ def reduce_scatter_sum(
def all_reduce_sum(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None,
) -> Tensor:
r"""
Reduce tensors across the specified group by sum.
r"""Reduce tensors across the specified group by sum.

Args:
inp: Input tensor.
@@ -416,15 +406,14 @@ def all_reduce_sum(

Examples:

.. code-block::

input = Tensor(rank)
# Rank 0 # input: Tensor(0)
# Rank 1 # input: Tensor(1)
output = all_reduce_sum(input)
# Rank 0 # output: Tensor(1)
# Rank 1 # output: Tensor(1)
.. code-block::

input = Tensor(rank)
# Rank 0 # input: Tensor(0)
# Rank 1 # input: Tensor(1)
output = all_reduce_sum(input)
# Rank 0 # output: Tensor(1)
# Rank 1 # output: Tensor(1)
"""
mode = CollectiveComm.Mode.ALL_REDUCE_SUM
return collective_comm(inp, mode, group, device)
@@ -433,8 +422,7 @@ def all_reduce_sum(
def all_reduce_max(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None,
) -> Tensor:
r"""
Reduce tensors across the specified group by max.
r"""Reduce tensors across the specified group by max.

Args:
inp: Input tensor.
@@ -451,15 +439,14 @@ def all_reduce_max(

Examples:

.. code-block::

input = Tensor(rank)
# Rank 0 # input: Tensor(0)
# Rank 1 # input: Tensor(1)
output = all_reduce_max(input)
# Rank 0 # output: Tensor(1)
# Rank 1 # output: Tensor(1)
.. code-block::

input = Tensor(rank)
# Rank 0 # input: Tensor(0)
# Rank 1 # input: Tensor(1)
output = all_reduce_max(input)
# Rank 0 # output: Tensor(1)
# Rank 1 # output: Tensor(1)
"""
mode = CollectiveComm.Mode.ALL_REDUCE_MAX
return collective_comm(inp, mode, group, device)
@@ -468,8 +455,7 @@ def all_reduce_max(
def all_reduce_min(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None,
) -> Tensor:
r"""
Reduce tensors across the specified group by min.
r"""Reduce tensors across the specified group by min.

Args:
inp: Input tensor.
@@ -486,15 +472,14 @@ def all_reduce_min(

Examples:

.. code-block::

input = Tensor(rank)
# Rank 0 # input: Tensor(0)
# Rank 1 # input: Tensor(1)
output = all_reduce_min(input)
# Rank 0 # output: Tensor(0)
# Rank 1 # output: Tensor(0)
.. code-block::

input = Tensor(rank)
# Rank 0 # input: Tensor(0)
# Rank 1 # input: Tensor(1)
output = all_reduce_min(input)
# Rank 0 # output: Tensor(0)
# Rank 1 # output: Tensor(0)
"""
mode = CollectiveComm.Mode.ALL_REDUCE_MIN
return collective_comm(inp, mode, group, device)
@@ -520,8 +505,7 @@ class _Gather(Function):
def gather(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None, axis=0,
) -> Tensor:
r"""
Gather tensors across the specified group.
r"""Gather tensors across the specified group.
Only root process will receive the final result.

Args:
@@ -534,27 +518,23 @@ def gather(
Specify "gpu0:1" to execute this operator on diffrent cuda stream,
1 is stream id, and default stream id is 0.
axis: The concat axis for collective_comm result.
The default axis is 0.
Returns:
Result tensor if in root process, None if in other process

Examples:

.. code-block::

input = Tensor([rank])
# Rank 0 # input: Tensor([0])
# Rank 1 # input: Tensor([1])
output = gather(input)
# Rank 0 # output: Tensor([0 1])
# Rank 1 # output: None
.. code-block::

input = Tensor([rank])
group = Group([1, 0]) # first rank is root
output = gather(input, group)
# Rank 0 # output: None
# Rank 1 # output: Tensor([1 0])
input = Tensor([rank])
# Rank 0 # input: Tensor([0])
# Rank 1 # input: Tensor([1])
output = gather(input)
# Rank 0 # output: Tensor([0 1])
# Rank 1 # output: None

input = Tensor([rank])
group = Group([1, 0]) # first rank is root
output = gather(input, group)
# Rank 0 # output: None
# Rank 1 # output: Tensor([1 0])
"""
assert (
axis < inp.ndim
@@ -607,8 +587,7 @@ class _Scatter(Function):
def scatter(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None, axis=0,
) -> Tensor:
r"""
Split tensor in root process at first dimension.
r"""Split tensor in root process at first dimension.

Args:
inp: Input tensor.
@@ -627,21 +606,20 @@ def scatter(

Examples:

.. code-block::

input = Tensor([0 1]) + rank*2
# Rank 0 # input: Tensor([0 1])
# Rank 1 # input: Tensor([2 3])
output = scatter(input)
# Rank 0 # output: Tensor([0])
# Rank 1 # output: Tensor([1])
.. code-block::

input = Tensor([0 1]) + rank*2
group = Group([1, 0]) # first rank is root
output = scatter(input, group)
# Rank 0 # output: Tensor([3])
# Rank 1 # output: Tensor([2])
input = Tensor([0 1]) + rank*2
# Rank 0 # input: Tensor([0 1])
# Rank 1 # input: Tensor([2 3])
output = scatter(input)
# Rank 0 # output: Tensor([0])
# Rank 1 # output: Tensor([1])

input = Tensor([0 1]) + rank*2
group = Group([1, 0]) # first rank is root
output = scatter(input, group)
# Rank 0 # output: Tensor([3])
# Rank 1 # output: Tensor([2])
"""
shape, dtype = _bcast_shape_dtype(group, inp)
if group.rank != 0:
@@ -680,8 +658,7 @@ def all_to_all(
split_axis: int = 0,
concat_axis: int = 0,
) -> Tensor:
r"""
Each process scatter input tensor to all processes and return gathered tensor.
r"""Each process scatter input tensor to all processes and return gathered tensor.

Args:
inp: Input tensor.
@@ -694,29 +671,26 @@ def all_to_all(
1 is stream id, and default stream id is 0.
split_axis: The axis that collective_comm will split data,
the default axis is 0.
concat_axis: The axis that collective_comm will concat data,
the default axis is 0.

Returns:
Result tensor.

Examples:

.. code-block::

input = Tensor([0 1]) + rank*2
# Rank 0 # input: Tensor([0 1])
# Rank 1 # input: Tensor([2 3])
output = all_to_all(input)
# Rank 0 # output: Tensor([0 2])
# Rank 1 # output: Tensor([1 3])
.. code-block::

input = Tensor([0 1]) + rank*2
group = Group([1, 0])
output = all_to_all(input, group)
# Rank 0 # output: Tensor([0 3])
# Rank 1 # output: Tensor([2 1])
input = Tensor([0 1]) + rank*2
# Rank 0 # input: Tensor([0 1])
# Rank 1 # input: Tensor([2 3])
output = all_to_all(input)
# Rank 0 # output: Tensor([0 2])
# Rank 1 # output: Tensor([1 3])

input = Tensor([0 1]) + rank*2
group = Group([1, 0])
output = all_to_all(input, group)
# Rank 0 # output: Tensor([0 3])
# Rank 1 # output: Tensor([2 1])
"""
group_size = group.size if group is not None else 1
assert (
@@ -805,8 +779,7 @@ class _RemoteRecv(Function):


def remote_send(inp: Tensor, dest_rank: int):
r"""
Send tensor to another process.
r"""Send tensor to another process.

Args:
inp: Tensor to send.
@@ -816,17 +789,15 @@ def remote_send(inp: Tensor, dest_rank: int):
None.

Examples:

.. code-block::

if rank == 0:
data = mge.tensor(1)
# Tensor(1)
F.distributed.remote_send(data, 1) # return None
else:
data = F.distributed.remote_recv(0)
# Tensor(1)

.. code-block::

if rank == 0:
data = mge.tensor(1)
# Tensor(1)
F.distributed.remote_send(data, 1) # return None
else:
data = F.distributed.remote_recv(0)
# Tensor(1)
"""
group = _SendRecvGroup(get_rank(), dest_rank)
_bcast_shape_dtype(group, inp)
@@ -844,8 +815,7 @@ def remote_send(inp: Tensor, dest_rank: int):


def remote_recv(src_rank: int, device: Optional[str] = None, inp=None) -> Tensor:
r"""
Receive a tensor from another process.
r"""Receive a tensor from another process.

Args:
src_rank: Rank of source process.
@@ -862,14 +832,13 @@ def remote_recv(src_rank: int, device: Optional[str] = None, inp=None) -> Tensor

.. code-block::

if rank == 0:
data = mge.tensor(1)
# Tensor(1)
F.distributed.remote_send(data, 1) # return None
else:
data = F.distributed.remote_recv(0)
# Tensor(1)

if rank == 0:
data = mge.tensor(1)
# Tensor(1)
F.distributed.remote_send(data, 1) # return None
else:
data = F.distributed.remote_recv(0)
# Tensor(1)
"""
group = _SendRecvGroup(src_rank, get_rank())
shape, dtype = _bcast_shape_dtype(group, None)


+ 26
- 28
imperative/python/megengine/distributed/group.py View File

@@ -36,15 +36,13 @@ _sd = None


class Group:
r"""
Include ranked nodes running collective communication (See :mod:`~.functional.distributed`).
r"""Include ranked nodes running collective communication (See :mod:`~.functional.distributed`).

By default collectives operate on the default group (also called ``WORLD``)
and require all processes to enter the distributed function call.
By default collectives operate on the default group (also called ``WORLD``)
and require all processes to enter the distributed function call.

:param proc_ranks: rank list of the group, the first one is root rank.

Args:
proc_ranks: rank list of the group, the first one is root rank.
"""

def __init__(self, proc_ranks):
@@ -116,15 +114,15 @@ def init_process_group(
backend: Optional[str] = "auto",
device_type: str = "xpu",
) -> None:
"""
Initialize the distributed process group and specify the device used in the current process
:param master_ip: ip address of the master node.
:param port: port available for all processes to communicate.
:param world_size: total number of processes participating in the job.
:param rank: rank of the current process.
:param device: the GPU device id to bind this process to.
:param backend: communicator backend, currently support 'nccl' and 'shm'.
r"""Initialize the distributed process group and specify the device used in the current process
Args:
master_ip: ip address of the master node.
port: port available for all processes to communicate.
world_size: total number of processes participating in the job.
rank: rank of the current process.
device: the GPU device id to bind this process to.
backend: communicator backend, currently support 'nccl' and 'shm'.
"""
physical_device_type = what_is_xpu() if device_type == "xpu" else device_type
if not isinstance(master_ip, str):
@@ -180,10 +178,10 @@ def _set_machine_ranks(ranks) -> None:

@contextmanager
def override_backend(new_backend: str):
"""
Override distributed backend
r"""Override distributed backend

:param new_backend: communicator backend set in this context.
Args:
new_backend: communicator backend set in this context.
"""
global _sd
assert _sd, "please call init_process_group first"
@@ -196,51 +194,51 @@ def override_backend(new_backend: str):


def is_distributed() -> bool:
"""Return True if the distributed process group has been initialized."""
r"""Return True if the distributed process group has been initialized."""
return _sd is not None


def get_rank() -> int:
"""Get the rank of the current process."""
r"""Get the rank of the current process."""
return _sd.proc_rank if _sd is not None else 0


def get_world_size() -> int:
"""Get the total number of processes participating in the job."""
r"""Get the total number of processes participating in the job."""
return _sd.world_size if _sd is not None else 1


def get_backend() -> str:
"""Get the backend str."""
r"""Get the backend str."""
assert _sd is not None, "please call init_process_group first"
return _sd.backend if _sd is not None else None


def get_py_server_addr() -> Tuple[str, int]:
"""Get master_ip and port of python XML RPC server."""
r"""Get master_ip and port of python XML RPC server."""
assert _sd is not None, "please call init_process_group first"
return _sd.master_ip, _sd.py_server_port


def get_mm_server_addr() -> Tuple[str, int]:
"""Get master_ip and port of C++ mm_server."""
r"""Get master_ip and port of C++ mm_server."""
assert _sd is not None, "please call init_process_group first"
return _sd.master_ip, _sd.mm_server_port


def get_client() -> Client:
"""Get client of python XML RPC server."""
r"""Get client of python XML RPC server."""
assert _sd is not None, "please call init_process_group first"
return _sd.client


def new_group(proc_ranks: List[int]) -> Group:
"""Build a subgroup containing certain ranks."""
r"""Build a subgroup containing certain ranks."""
return Group(proc_ranks)


def group_barrier(group: Group = WORLD) -> None:
"""Block until all ranks in the group reach this barrier."""
r"""Block until all ranks in the group reach this barrier."""
# if running with single node, skip it
if _sd is None:
return
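A short sketch tying these helpers together inside an already-initialized process group (rank numbers are illustrative):

.. code-block:: python

    import megengine.distributed as dist

    if dist.is_distributed() and dist.get_rank() in (0, 1):
        pair = dist.new_group([0, 1])   # first rank in the list is the root
        dist.group_barrier(pair)        # wait until both members reach this point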


+ 55
- 53
imperative/python/megengine/distributed/helper.py View File

@@ -28,39 +28,40 @@ from .group import WORLD, Group, group_barrier, is_distributed, override_backend


def param_pack_split(inp: Tensor, offsets: list, shapes: list):
r"""
Returns split tensor to tensor list as offsets and shapes described,
only used for ``parampack``.
r"""Returns split tensor to tensor list as offsets and shapes described,
only used for ``parampack``.

:param inp: input tensor.
:param offsets: offsets of outputs, length of `2 * n`,
Args:
inp: input tensor.
offsets: offsets of outputs, length of `2 * n`,
where n is the number of tensors you want to split,
format `[begin0, end0, begin1, end1]`.
:param shapes: tensor shapes of outputs.
:return: splitted tensors.
shapes: tensor shapes of outputs.

Examples:
Returns:
split tensors.

.. testcode::
Examples:

import numpy as np
from megengine import tensor
from megengine.distributed.helper import param_pack_split
.. testcode::

a = tensor(np.ones((10,), np.int32))
b, c = param_pack_split(a, [0, 1, 1, 10], [(1,), (3, 3)])
print(b.numpy())
print(c.numpy())
import numpy as np
from megengine import tensor
from megengine.distributed.helper import param_pack_split

Outputs:
a = tensor(np.ones((10,), np.int32))
b, c = param_pack_split(a, [0, 1, 1, 10], [(1,), (3, 3)])
print(b.numpy())
print(c.numpy())

.. testoutput::
Outputs:

[1]
[[1 1 1]
[1 1 1]
[1 1 1]]
.. testoutput::

[1]
[[1 1 1]
[1 1 1]
[1 1 1]]
"""
op = ParamPackSplit()
op.offsets = offsets
@@ -73,36 +74,37 @@ def param_pack_split(inp: Tensor, offsets: list, shapes: list):


def param_pack_concat(inps: list, offsets: Tensor, offsets_val: list):
r"""
Returns concated tensor, only used for ``parampack``.
r"""Returns concated tensor, only used for ``parampack``.

:param inps: input tensors.
:param offsets: device value of offsets.
:param offsets_val: offsets of inputs, length of `2 * n`,
Args:
inps: input tensors.
offsets: device value of offsets.
offsets_val: offsets of inputs, length of `2 * n`,
format `[begin0, end0, begin1, end1]`.
:return: concated tensor.

Examples:
Returns:
concatenated tensor.

.. testcode::
Examples:

import numpy as np
from megengine import tensor
from megengine.distributed.helper import param_pack_concat
.. testcode::

a = tensor(np.ones((1,), np.int32))
b = tensor(np.ones((3, 3), np.int32))
offsets_val = [0, 1, 1, 10]
offsets = tensor(offsets_val, np.int32)
c = param_pack_concat([a, b], offsets, offsets_val)
print(c.numpy())
import numpy as np
from megengine import tensor
from megengine.distributed.helper import param_pack_concat

Outputs:
a = tensor(np.ones((1,), np.int32))
b = tensor(np.ones((3, 3), np.int32))
offsets_val = [0, 1, 1, 10]
offsets = tensor(offsets_val, np.int32)
c = param_pack_concat([a, b], offsets, offsets_val)
print(c.numpy())

.. testoutput::
Outputs:

[1 1 1 1 1 1 1 1 1 1]
.. testoutput::

[1 1 1 1 1 1 1 1 1 1]
"""
op = ParamPackConcat()
op.offsets = offsets_val
@@ -165,9 +167,9 @@ class TensorFuture(Future):


def synchronized(func: Callable):
r"""Decorator. Decorated function will synchronize when finished.
Specifically, we use this to prevent data race during hub.load
"""
Decorator. Decorated function will synchronize when finished.
Specifically, we use this to prevent data race during hub.load"""

@functools.wraps(func)
def wrapper(*args, **kwargs):
@@ -199,23 +201,23 @@ get_device_count_by_fork = deprecated_func(


def bcast_list_(inps: list, group: Group = WORLD):
"""
Broadcast tensors between given group.
r"""Broadcast tensors between given group.

:param inps: input tensors.
:param group: communication group.
Args:
inps: input tensors.
group: communication group.
"""
for inp in inps:
inp._reset(_bcast_param(inp, group))
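
A hedged sketch of the typical use of ``bcast_list_``: synchronizing parameters from rank 0 to the other ranks right after model construction. The ``model`` variable is an assumed ``megengine.module.Module`` instance, not something defined in this diff:

.. code-block:: python

    import megengine.distributed as dist
    from megengine.distributed.helper import bcast_list_

    # inside each worker process, after building the model
    params = list(model.parameters())   # `model` is assumed to exist in the caller's scope
    bcast_list_(params, dist.WORLD)     # every rank now holds rank 0's parameter values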


class AllreduceCallback:
"""
Allreduce Callback with tensor fusion optimization.
r"""Allreduce Callback with tensor fusion optimization.

:param reduce_method: the method to reduce gradiants.
:param group: communication group.
:param backend: override distributed backend in allreduce
Args:
reduce_method: the method used to reduce gradients.
group: communication group.
backend: override the distributed backend used in allreduce.
"""

def __init__(self, reduce_method: str, group: Group = WORLD, backend: str = None):
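
As a rough sketch of how this callback is typically wired up, it can be registered on a ``GradManager`` so gradients are fused and all-reduced during backward. The ``"mean"`` reduce method and the ``model`` variable are assumptions for illustration:

.. code-block:: python

    import megengine.distributed as dist
    from megengine.autodiff import GradManager
    from megengine.distributed.helper import AllreduceCallback

    gm = GradManager()
    cb = AllreduceCallback("mean", dist.WORLD)        # "mean" assumed to be an accepted reduce_method
    gm.attach(model.parameters(), callbacks=[cb])     # gradients get all-reduced across ranks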


+ 11
- 10
imperative/python/megengine/distributed/launcher.py View File

@@ -39,7 +39,7 @@ def _run_wrapped(
queue: mp.Queue,
machine_ranks: list,
):
"""Init distributed process group and run wrapped function."""
r"""Init distributed process group and run wrapped function."""
_check_device_initialized(device_type, dev)
init_process_group(
master_ip=master_ip,
@@ -64,15 +64,16 @@ def _run_wrapped(


class launcher:
"""Decorator for launching multiple processes in single-machine multi-gpu training.

:param func: the function you want to launch in distributed mode.
:param n_gpus: how many devices each node.
:param world_size: how many devices totally.
:param rank_start: start number for rank.
:param master_ip: ip address for master node (where the rank 0 is).
:param port: server port for distributed server.
:param backend: set default collective communication backend.
r"""Decorator for launching multiple processes in single-machine multi-gpu training.

Args:
func: the function you want to launch in distributed mode.
n_gpus: number of devices on each node.
world_size: total number of devices.
rank_start: starting rank number of this node.
master_ip: IP address of the master node (where rank 0 runs).
port: server port of the distributed server.
backend: the default collective communication backend.
"""

def __new__(cls, *args, **kwargs):
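
A minimal sketch of the decorator form; the GPU count and the function body are illustrative:

.. code-block:: python

    import megengine.distributed as dist

    @dist.launcher(n_gpus=2)      # illustrative; other arguments follow the Args list above
    def train():
        print("hello from rank", dist.get_rank())

    train()   # spawns one process per device and runs `train` in each of them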


+ 60
- 60
imperative/python/megengine/distributed/server.py View File

@@ -20,11 +20,11 @@ from ..utils.future import Future


class Methods:
"""
Distributed Server Method.
r"""Distributed Server Method.
Used for exchanging information between distributed nodes.

:param mm_server_port: multiple machine rpc server port.
Args:
mm_server_port: multiple machine rpc server port.
"""

def __init__(self, mm_server_port):
@@ -39,19 +39,19 @@ class Methods:
self.bcast_dict = {}

def connect(self):
"""Method for checking connection success."""
r"""Method for checking connection success."""
return True

def get_mm_server_port(self):
"""Get multiple machine rpc server port."""
r"""Get multiple machine rpc server port."""
return self.mm_server_port

def set_is_grad(self, key, is_grad):
"""
Mark send/recv need gradiants by key.
r"""Mark send/recv need gradiants by key.

:param key: key to match send/recv op.
:param is_grad: whether this op need grad.
Args:
key: key to match send/recv op.
is_grad: whether this op needs grad.
"""
with self.lock:
future = self.dict_is_grad[key]
@@ -59,10 +59,10 @@ class Methods:
return True

def check_is_grad(self, key):
"""
Check whether send/recv need gradiants.
r"""Check whether send/recv need gradiants.

:param key: key to match send/recv op.
Args:
key: key to match send/recv op.
"""
with self.lock:
future = self.dict_is_grad[key]
@@ -72,11 +72,11 @@ class Methods:
return ret

def set_remote_tracer(self, key, tracer_set):
"""
Set tracer dict for tracing send/recv op.
r"""Set tracer dict for tracing send/recv op.

:param key: key to match send/recv op.
:param tracer_set: valid tracer set.
Args:
key: key to match send/recv op.
tracer_set: valid tracer set.
"""
with self.lock:
future = self.dict_remote_tracer[key]
@@ -84,10 +84,10 @@ class Methods:
return True

def check_remote_tracer(self, key):
"""
Get tracer dict for send/recv op.
r"""Get tracer dict for send/recv op.

:param key: key to match send/recv op.
Args:
key: key to match send/recv op.
"""
with self.lock:
future = self.dict_remote_tracer[key]
@@ -97,11 +97,11 @@ class Methods:
return ret

def group_barrier(self, key, size):
"""
A barrier wait for all group member.
r"""A barrier wait for all group member.

:param key: group key to match each other.
:param size: group size.
Args:
key: group key to match each other.
size: group size.
"""
with self.lock:
self.dict_barrier_counter[key] += 1
@@ -116,14 +116,14 @@ class Methods:
return True

def user_set(self, key, val):
"""Set user defined key-value pairs across processes."""
r"""Set user defined key-value pairs across processes."""
with self.lock:
future = self.user_dict[key]
future.set(val)
return True

def user_get(self, key):
"""Get user defined key-value pairs across processes."""
r"""Get user defined key-value pairs across processes."""
with self.lock:
future = self.user_dict[key]
return future.get()
@@ -161,12 +161,12 @@ class ThreadXMLRPCServer(ThreadingMixIn, SimpleXMLRPCServer):


def _start_server(py_server_port, queue):
"""
Start python distributed server and multiple machine server.
r"""Start python distributed server and multiple machine server.

:param py_server_port: python server port.
:param mm_server_port: multiple machine server port.
:param queue: server port will put in this queue, puts exception when process fails.
Args:
py_server_port: python server port.
queue: the server port will be put into this queue; an exception is put instead when the process fails.
"""
try:
mm_server_port = create_mm_server("0.0.0.0", 0)
@@ -182,11 +182,11 @@ def _start_server(py_server_port, queue):


class Server:
"""
Distributed Server for distributed training.
r"""Distributed Server for distributed training.
It should be running on the master node.

:param port: python server port.
Args:
port: python server port.
"""

def __init__(self, port=0):
@@ -204,11 +204,11 @@ class Server:


class Client:
"""
Distributed Client for distributed training.
r"""Distributed Client for distributed training.

:param master_ip: ip address of master node.
:param port: port of server at master node.
Args:
master_ip: ip address of master node.
port: port of server at master node.
"""

def __init__(self, master_ip, port):
@@ -218,7 +218,7 @@ class Client:
self.bcast_dict = defaultdict(lambda: 0)

def connect(self):
"""Check connection success."""
r"""Check connection success."""
while True:
try:
self.proxy = ServerProxy(
@@ -230,62 +230,62 @@ class Client:
time.sleep(1)

def get_mm_server_port(self):
"""Get multiple machine server port."""
r"""Get multiple machine server port."""
return self.proxy.get_mm_server_port()

def set_is_grad(self, key, is_grad):
"""
Mark send/recv need gradiants by key.
r"""Mark send/recv need gradiants by key.

:param key: key to match send/recv op.
:param is_grad: whether this op need grad.
Args:
key: key to match send/recv op.
is_grad: whether this op needs grad.
"""
self.proxy.set_is_grad(key, is_grad)

def check_is_grad(self, key):
"""
Check whether send/recv need gradiants.
r"""Check whether send/recv need gradiants.

:param key: key to match send/recv op.
Args:
key: key to match send/recv op.
"""
return self.proxy.check_is_grad(key)

def set_remote_tracer(self, key, tracer_set):
"""
Set tracer dict for tracing send/recv op.
r"""Set tracer dict for tracing send/recv op.

:param key: key to match send/recv op.
:param tracer_set: valid tracer set.
Args:
key: key to match send/recv op.
tracer_set: valid tracer set.
"""
self.proxy.set_remote_tracer(key, tracer_set)

def check_remote_tracer(self, key):
"""
Get tracer dict for send/recv op.
r"""Get tracer dict for send/recv op.

:param key: key to match send/recv op.
Args:
key: key to match send/recv op.
"""
return self.proxy.check_remote_tracer(key)

def group_barrier(self, key, size):
"""
A barrier wait for all group member.
r"""A barrier wait for all group member.

:param key: group key to match each other.
:param size: group size.
Args:
key: group key to match each other.
size: group size.
"""
self.proxy.group_barrier(key, size)

def user_set(self, key, val):
"""Set user defined key-value pairs across processes."""
r"""Set user defined key-value pairs across processes."""
return self.proxy.user_set(key, val)

def user_get(self, key):
"""Get user defined key-value pairs across processes."""
r"""Get user defined key-value pairs across processes."""
return self.proxy.user_get(key)

def user_pop(self, key):
"""Get user defined key-value pairs and delete the resources when the get is done"""
r"""Get user defined key-value pairs and delete the resources when the get is done"""
return self.proxy.user_pop(key)

def bcast_val(self, val, key, size):
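
To make the Server/Client pairing above concrete, here is a hedged sketch of sharing a value between processes via ``user_set``/``user_get``. The host and port number are illustrative; in real training the server runs on the master node and each worker creates its own client:

.. code-block:: python

    from megengine.distributed.server import Server, Client

    server = Server(port=23333)            # port value is illustrative
    client = Client("localhost", 23333)
    client.user_set("epoch", 10)           # store a user-defined key-value pair
    print(client.user_get("epoch"))        # another process can read it back: 10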


+ 23
- 37
imperative/python/megengine/dtr/dtr.py View File

@@ -30,24 +30,20 @@ def _str2bytes(text: str) -> int:

@property
def eviction_threshold(mod):
r"""
Get or set the eviction threshold in bytes. It can also be set to a string,
r"""Get or set the eviction threshold in bytes. It can also be set to a string,
whose format supports byte (B), kilobyte (KB), megabyte (MB) and
gigabyte(GB) units.

.. note::

Note:
When GPU memory usage exceeds this value, DTR will heuristically select
and evict resident tensors until the amount of used memory falls below
this threshold.
Examples:
.. code-block::

.. code-block::

import megengine as mge
mge.dtr.eviction_threshold = "2GB"

import megengine as mge
mge.dtr.eviction_threshold = "2GB"
"""
return _eviction_threshold

@@ -66,24 +62,21 @@ def eviction_threshold(mod, value: Union[int, str]):

@property
def evictee_minimum_size(mod):
r"""
Get or set the memory threshold of tensors in bytes. It can also be set to a
r"""Get or set the memory threshold of tensors in bytes. It can also be set to a
string, whose format supports byte (B), kilobyte (KB), megabyte (MB) and
gigabyte(GB) units.

.. note::

Note:
Only tensors whose size exceeds this threshold will be added to the
candidate set. A tensor that is not added to the candidate set will
never be evicted during its lifetime.
Examples:
.. code-block::

.. code-block::

import megengine as mge
mge.dtr.evictee_minimum_size = "2MB"

import megengine as mge
mge.dtr.evictee_minimum_size = "2MB"
"""
return _evictee_minimum_size

@@ -102,19 +95,16 @@ def evictee_minimum_size(mod, value: Union[int, str]):

@property
def enable_sqrt_sampling(mod):
r"""
Get or set whether sqrt sampling is allowed. Sqrt sampling means that given
r"""Get or set whether sqrt sampling is allowed. Sqrt sampling means that given
a candidate set of size N, only sqrt(N) tensors are enumerated. When
the number of tensors is very high, enabling this optimization will speed
up the training.
Examples:
.. code-block::

Examples:

.. code-block::

import megengine as mge
mge.dtr.enable_sqrt_sampling = True

import megengine as mge
mge.dtr.enable_sqrt_sampling = True
"""
return _enable_sqrt_sampling

@@ -127,9 +117,7 @@ def enable_sqrt_sampling(mod, value: bool):


def enable():
r"""
Enable to record computing path of tensors and to perform DTR policy.
"""
r"""Enable to record computing path of tensors and to perform DTR policy."""
_set_defrag(True)
_set_option("enable_dtr_auto_drop", 1)
_set_option("enable_drop", 1)
@@ -138,9 +126,7 @@ def enable():


def disable():
r"""
Stop recording computing path of tensors and performing DTR policy.
"""
r"""Stop recording computing path of tensors and performing DTR policy."""
_set_defrag(False)
_set_option("enable_dtr_auto_drop", 0)
_set_option("enable_drop", 0)


+ 21
- 21
imperative/python/megengine/functional/debug_param.py View File

@@ -23,8 +23,7 @@ if os.getenv("MEGENGINE_CONV_EXECUTION_STRATEGY") != None:


def get_execution_strategy() -> Strategy:
"""
Returns the execution strategy of :class:`~module..Conv2d` and :func:`~.matmul`
r"""Returns the execution strategy of :class:`~module..Conv2d` and :func:`~.matmul`

See :func:`~.set_execution_strategy` for possible return values.
"""
@@ -32,31 +31,32 @@ def get_execution_strategy() -> Strategy:


def set_execution_strategy(option):
"""
Sets the execution strategy of :class:`~module.Conv2d` and :func:`~.matmul`
r"""Sets the execution strategy of :class:`~module.Conv2d` and :func:`~.matmul`

Args:
option: Decides how :class:`~.module.Conv2d` and :func:`~.matmul` algorithms are chosen.
Available ``Strategy`` values:

:param option: Decides how :class:`~module.Conv2d`and :func:`~.matmul` algorithms are chosen.
Available value Strategy
* HEURISTIC uses heuristic to choose the fastest algorithm.
* PROFILE runs possible algorithms on real device to find the best one.
* REPRODUCIBLE uses algorithms that are reproducible.
* OPTIMIZED uses algorithms that are optimized.
* HEURISTIC uses heuristic to choose the fastest algorithm.
* PROFILE runs possible algorithms on real device to find the best one.
* REPRODUCIBLE uses algorithms that are reproducible.
* OPTIMIZED uses algorithms that are optimized.

The default strategy is HEURISTIC, this options can be combined to
form a combination option, e.g. PROFILE | REPRODUCIBLE
can combined a option that uses the fastest of profiling result that is also reproducible.
The default strategy is HEURISTIC. These options can be combined, e.g. PROFILE | REPRODUCIBLE
selects the fastest algorithm from profiling results that is also reproducible.

Available values string:
Available values string:

* 'HEURISTIC' uses heuristic to choose the fastest algorithm.
* 'PROFILE' runs possible algorithms on real device to find the best one.
* 'PROFILE_HEURISTIC' uses profiling result and heuristic to choose the fastest algorithm.
* 'PROFILE_REPRODUCIBLE' uses the fastest of profiling result that is also reproducible.
* 'HEURISTIC_REPRODUCIBLE' uses heuristic to choose the fastest algorithm that is also reproducible.
* 'HEURISTIC' uses heuristic to choose the fastest algorithm.
* 'PROFILE' runs possible algorithms on real device to find the best one.
* 'PROFILE_HEURISTIC' uses profiling result and heuristic to choose the fastest algorithm.
* 'PROFILE_REPRODUCIBLE' uses the fastest of profiling result that is also reproducible.
* 'HEURISTIC_REPRODUCIBLE' uses heuristic to choose the fastest algorithm that is also reproducible.

The default strategy is 'HEURISTIC'.
The default strategy is 'HEURISTIC'.

It can also be set through the environment variable 'MEGENGINE_EXECUTION_STRATEGY'.
It can also be set through the environment variable 'MEGENGINE_EXECUTION_STRATEGY'.
"""
valid_string_option = {
"REPRODUCIBLE": Strategy.REPRODUCIBLE,


+ 130
- 163
imperative/python/megengine/functional/elemwise.py View File

@@ -78,182 +78,163 @@ def _elemwise_multi_type(*args, mode, **kwargs):


def add(x, y):
"""
Element-wise `addition`.
At least one operand should be tensor.

Same for sub/mul/div/floor_div/pow/mod/atan2/equal/not_equal/less/less_equal/greater/greater_equal/maximum/minmium.

:param x: input tensor.
:return: computed tensor.
r"""Element-wise `addition`.

Examples:

.. testcode::
.. testcode::

import numpy as np
from megengine import tensor
import megengine.functional as F
import numpy as np
from megengine import tensor
import megengine.functional as F

x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
y = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.add(x, y)
print(out.numpy())
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
y = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.add(x, y)
print(out.numpy())

Outputs:
Outputs:

.. testoutput::

[[ 0. 2. 4.]
[ 6. 8. 10.]]
.. testoutput::

[[ 0. 2. 4.]
[ 6. 8. 10.]]
"""
return _elwise(x, y, mode=Elemwise.Mode.ADD)


def sub(x, y):
"""Element-wise `subtraction`."""
r"""Element-wise `subtraction`."""
return _elwise(x, y, mode=Elemwise.Mode.SUB)


def mul(x, y):
"""Element-wise `multiplication`."""
r"""Element-wise `multiplication`."""
return _elwise(x, y, mode=Elemwise.Mode.MUL)


def div(x, y):
"""Element-wise `(x / y)`."""
r"""Element-wise `(x / y)`."""
return _elwise(x, y, mode=Elemwise.Mode.TRUE_DIV)


def floor_div(x, y):
"""Element-wise `floor(x / y)`."""
r"""Element-wise `floor(x / y)`."""
return _elwise(x, y, mode=Elemwise.Mode.FLOOR_DIV)


def neg(x):
"""Element-wise `negation`."""
r"""Element-wise `negation`."""
return _elwise(x, mode=Elemwise.Mode.NEGATE)


def pow(x, y):
"""Element-wise `power`."""
r"""Element-wise `power`."""
return _elwise(x, y, mode=Elemwise.Mode.POW)


def mod(x, y):
"""Element-wise `remainder of division`."""
r"""Element-wise `remainder of division`."""
return _elwise(x, y, mode=Elemwise.Mode.MOD)


def abs(x):
"""Element-wise `absolute value`."""
r"""Element-wise `absolute value`."""
return _elwise(x, mode=Elemwise.Mode.ABS)


def exp(x):
"""Element-wise `exponential`."""
r"""Element-wise `exponential`."""
return _elwise(x, mode=Elemwise.Mode.EXP)


def expm1(x):
"""Element-wise `exp(x)-1`."""
r"""Element-wise `exp(x)-1`."""
return _elwise(x, mode=Elemwise.Mode.EXPM1)


def log(x):
"""Element-wise `logarithm (base e)`."""
r"""Element-wise `logarithm (base e)`."""
return _elwise(x, mode=Elemwise.Mode.LOG)


def log1p(x):
"""Element-wise `log(x+1) (base e)`."""
r"""Element-wise `log(x+1) (base e)`."""
return _elwise(x, mode=Elemwise.Mode.LOG1P)


def sqrt(x: Tensor) -> Tensor:
"""
Element-wise `sqrt`.
Returns ``NaN`` for negative input value.

:param x: input tensor.
:return: computed tensor.
r"""Element-wise `sqrt`.

Examples:

.. testcode::
.. testcode::

import numpy as np
from megengine import tensor
import megengine.functional as F
import numpy as np
from megengine import tensor
import megengine.functional as F

x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.sqrt(x)
print(out.numpy().round(decimals=4))
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.sqrt(x)
print(out.numpy().round(decimals=4))

Outputs:
Outputs:

.. testoutput::

[[0. 1. 1.4142]
[1.7321 2. 2.2361]]
.. testoutput::

[[0. 1. 1.4142]
[1.7321 2. 2.2361]]
"""
return x ** 0.5


def square(x: Tensor) -> Tensor:
"""
Returns a new tensor with the square of the elements of input tensor.

:param inp: input tensor.
:return: computed tensor.
r"""Element-wise `square`.

Examples:

.. testcode::
.. testcode::

import numpy as np
import megengine as mge
import megengine.functional as F
import numpy as np
import megengine as mge
import megengine.functional as F

data = mge.tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.square(data)
print(out.numpy().round(decimals=4))
data = mge.tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.square(data)
print(out.numpy().round(decimals=4))

Outputs:
Outputs:

.. testoutput::

[[ 0. 1. 4.]
[ 9. 16. 25.]]
.. testoutput::

[[ 0. 1. 4.]
[ 9. 16. 25.]]
"""
return x ** 2


def round(x):
"""Element-wise `rounding to int`."""
r"""Element-wise `rounding to int`."""
return _elwise(x, mode=Elemwise.Mode.ROUND)


def ceil(x):
"""Element-wise `ceiling`."""
r"""Element-wise `ceiling`."""
return _elwise(x, mode=Elemwise.Mode.CEIL)


def floor(x):
"""Element-wise `floor`."""
r"""Element-wise `floor`."""
return _elwise(x, mode=Elemwise.Mode.FLOOR)


def maximum(x, y):
"""Element-wise `maximum of array elements`."""
r"""Element-wise `maximum of array elements`."""
return _elwise(x, y, mode=Elemwise.Mode.MAX)


def minimum(x, y):
"""Element-wise `minimum of array elements`."""
r"""Element-wise `minimum of array elements`."""
return _elwise(x, y, mode=Elemwise.Mode.MIN)


@@ -261,62 +242,57 @@ def minimum(x, y):


def cos(x):
"""
Element-wise `cosine`.

:param x: input tensor.
:return: computed tensor.
r"""Element-wise `cosine`.

Examples:

.. testcode::

import numpy as np
from megengine import tensor
import megengine.functional as F
.. testcode::

x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.cos(x)
print(out.numpy().round(decimals=4))
import numpy as np
from megengine import tensor
import megengine.functional as F

Outputs:
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.cos(x)
print(out.numpy().round(decimals=4))

.. testoutput::
Outputs:

[[ 1. 0.5403 -0.4161]
[-0.99 -0.6536 0.2837]]
.. testoutput::

[[ 1. 0.5403 -0.4161]
[-0.99 -0.6536 0.2837]]
"""
return _elwise(x, mode=Elemwise.Mode.COS)


def sin(x):
"""Element-wise `sine`."""
r"""Element-wise `sine`."""
return _elwise(x, mode=Elemwise.Mode.SIN)


def tan(x):
"""Element-wise `tangent`."""
r"""Element-wise `tangent`."""
return sin(x) / cos(x)


def acos(x):
"""Element-wise `inverse cosine`."""
r"""Element-wise `inverse cosine`."""
return _elwise(x, mode=Elemwise.Mode.ACOS)


def asin(x):
"""Element-wise `inverse sine`."""
r"""Element-wise `inverse sine`."""
return _elwise(x, mode=Elemwise.Mode.ASIN)


def atan(x):
"""Element-wise `inverse tangent`."""
r"""Element-wise `inverse tangent`."""
return _elwise(x, 1, mode=Elemwise.Mode.ATAN2)


def atan2(y, x):
"""Element-wise `2-argument arctangent`."""
r"""Element-wise `2-argument arctangent`."""
return _elwise(y, x, mode=Elemwise.Mode.ATAN2)


@@ -355,38 +331,33 @@ def atanh(x):


def left_shift(x, y):
"""
Element-wise `bitwise binary: x << y`.
r"""Element-wise `bitwise binary: x << y`.

:param x: input tensor, should be int.
:param y: how many bits to be left-shifted.
:return: computed tensor.
Examples:

Examples:

.. testcode::
.. testcode::

import numpy as np
from megengine import tensor
import megengine.functional as F
import numpy as np
from megengine import tensor
import megengine.functional as F

x = tensor(np.arange(0, 6, dtype=np.int32).reshape(2, 3))
out = F.left_shift(x, 2)
print(out.numpy())
x = tensor(np.arange(0, 6, dtype=np.int32).reshape(2, 3))
out = F.left_shift(x, 2)
print(out.numpy())

Outputs:
Outputs:

.. testoutput::
.. testoutput::

[[ 0 4 8]
[12 16 20]]
[[ 0 4 8]
[12 16 20]]

"""
return _elwise(x, y, mode=Elemwise.Mode.SHL)


def right_shift(x, y):
"""Element-wise `bitwise binary: x >> y`."""
r"""Element-wise `bitwise binary: x >> y`."""
return _elwise(x, y, mode=Elemwise.Mode.SHR)


@@ -394,22 +365,22 @@ def right_shift(x, y):


def logical_and(x, y):
"""Element-wise `logical and: x && y`."""
r"""Element-wise `logical and: x && y`."""
return _elwise(x, y, mode=Elemwise.Mode.AND)


def logical_not(x):
"""Element-wise `logical not: ~x`."""
r"""Element-wise `logical not: ~x`."""
return _elwise(x, mode=Elemwise.Mode.NOT)


def logical_or(x, y):
"""Element-wise `logical or: x || y`."""
r"""Element-wise `logical or: x || y`."""
return _elwise(x, y, mode=Elemwise.Mode.OR)


def logical_xor(x, y):
"""Element-wise `logical xor: x ^ y`."""
r"""Element-wise `logical xor: x ^ y`."""
return _elwise(x, y, mode=Elemwise.Mode.XOR)


@@ -417,59 +388,53 @@ def logical_xor(x, y):


def equal(x, y):
"""
Element-wise `(x == y)`.

:param x: input tensor 1.
:param y: input tensor 2.
:return: computed tensor.
r"""Element-wise `(x == y)`.

Examples:

.. testcode::

import numpy as np
from megengine import tensor
import megengine.functional as F
.. testcode::

x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
y = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.equal(x, y)
print(out.numpy())
import numpy as np
from megengine import tensor
import megengine.functional as F

Outputs:
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
y = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.equal(x, y)
print(out.numpy())

.. testoutput::
Outputs:

[[1. 1. 1.]
[1. 1. 1.]]
.. testoutput::

[[1. 1. 1.]
[1. 1. 1.]]
"""
return _elwise(x, y, mode=Elemwise.Mode.EQ)


def not_equal(x, y):
"""Element-wise `(x != y)`."""
r"""Element-wise `(x != y)`."""
return x != y


def less(x, y):
"""Element-wise `(x < y)`."""
r"""Element-wise `(x < y)`."""
return _elwise(x, y, mode=Elemwise.Mode.LT)


def less_equal(x, y):
"""Element-wise `(x <= y)`."""
r"""Element-wise `(x <= y)`."""
return _elwise(x, y, mode=Elemwise.Mode.LEQ)


def greater(x, y):
"""Element-wise `(x > y)`."""
r"""Element-wise `(x > y)`."""
return _elwise(y, x, mode=Elemwise.Mode.LT)


def greater_equal(x, y):
"""Element-wise `(x >= y)`."""
r"""Element-wise `(x >= y)`."""
return _elwise(y, x, mode=Elemwise.Mode.LEQ)


@@ -477,43 +442,45 @@ def greater_equal(x, y):


def clip(x: Tensor, lower=None, upper=None) -> Tensor:
r"""
Clamps all elements in input tensor into the range `[` :attr:`lower`, :attr:`upper` `]` and returns
r"""Clamps all elements in input tensor into the range ``[ lower, upper ]`` and returns
a resulting tensor:

.. math::

y_i = \begin{cases}
\text{lower} & \text{if } x_i < \text{lower} \\
x_i & \text{if } \text{lower} \leq x_i \leq \text{upper} \\
\text{upper} & \text{if } x_i > \text{upper}
\end{cases}

:param x: input tensor.
:param lower: lower-bound of the range to be clamped to.
:param upper: upper-bound of the range to be clamped to.
:return: output clamped tensor.
Args:
x: input tensor.
lower: lower-bound of the range to be clamped to.
upper: upper-bound of the range to be clamped to.

Examples:
Returns:
output clamped tensor.

.. testcode::
Examples:

import numpy as np
from megengine import tensor
import megengine.functional as F
.. testcode::

a = tensor(np.arange(5).astype(np.int32))
print(F.clip(a, 2, 4).numpy())
print(F.clip(a, lower=3).numpy())
print(F.clip(a, upper=3).numpy())
import numpy as np
from megengine import tensor
import megengine.functional as F

Outputs:
a = tensor(np.arange(5).astype(np.int32))
print(F.clip(a, 2, 4).numpy())
print(F.clip(a, lower=3).numpy())
print(F.clip(a, upper=3).numpy())

.. testoutput::
Outputs:

[2 2 2 3 4]
[3 3 3 3 4]
[0 1 2 3 3]
.. testoutput::

[2 2 2 3 4]
[3 3 3 3 4]
[0 1 2 3 3]
"""
assert (
lower is not None or upper is not None


+ 12
- 12
imperative/python/megengine/functional/external.py View File

@@ -23,14 +23,14 @@ def tensorrt_runtime_opr(inputs, *, data: bytes = None):


def cambricon_runtime_opr(inputs, data, symbol, tensor_dim_mutable):
r"""
Load a serialized Cambricon model as a runtime operator in MegEngine.
:param inputs: list of input tensors.
:param data: the serialized Cambricon model.
:param symbol: name of the function in Cambricon model.
:param tensor_dim_mutable: whether the input tensors' shapes are mutable
in ``cnrtModel_t``.
r"""Load a serialized Cambricon model as a runtime operator in MegEngine.
Args:
inputs: list of input tensors.
data: the serialized Cambricon model.
symbol: name of the function in Cambricon model.
tensor_dim_mutable: whether the input tensors' shapes are mutable
in ``cnrtModel_t``.
"""

op = builtin.CambriconRuntime(data, len(data), symbol, tensor_dim_mutable)
@@ -38,11 +38,11 @@ def cambricon_runtime_opr(inputs, data, symbol, tensor_dim_mutable):


def atlas_runtime_opr(inputs, data):
r"""
Load a serialized Atlas model as a runtime operator in MegEngine.
r"""Load a serialized Atlas model as a runtime operator in MegEngine.

:param inputs: list of input tensors.
:param data: the serialized Atlas model.
Args:
inputs: list of input tensors.
data: the serialized Atlas model.
"""

op = builtin.AtlasRuntime(data, len(data))


+ 113
- 106
imperative/python/megengine/functional/loss.py View File

@@ -26,9 +26,7 @@ __all__ = [


def _reduce_output(loss_fn):
r"""
Wrapper to apply canonical reductions to loss outputs.
"""
r"""Wrapper to apply canonical reductions to loss outputs."""

@functools.wraps(loss_fn)
def reduced_loss_fn(*args, reduction="mean", **kwargs):
@@ -45,13 +43,14 @@ def _reduce_output(loss_fn):

@_reduce_output
def l1_loss(pred: Tensor, label: Tensor, reduction: str = "mean") -> Tensor:
r"""
Calculates the mean absolute error (MAE) between
r"""Calculates the mean absolute error (MAE) between
each element in the pred :math:`x` and label :math:`y`.

The mean absolute error can be described as:

.. math:: \ell(x,y) = mean\left(L \right)
.. math::

\ell(x,y) = mean\left(L \right)

where

@@ -63,30 +62,32 @@ def l1_loss(pred: Tensor, label: Tensor, reduction: str = "mean") -> Tensor:
:math:`x` and :math:`y` are tensors of arbitrary shapes with a total
of :math:`N` elements each. :math:`N` is the batch size.

:param pred: predicted result from model.
:param label: ground truth to compare.
:param reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean'
:return: loss value.
Args:
pred: predicted result from model.
label: ground truth to compare.
reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean'

Examples:
Returns:
loss value.

.. testcode::
Examples:

import numpy as np
import megengine as mge
import megengine.functional as F
.. testcode::

ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32))
tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32))
loss = F.nn.l1_loss(ipt, tgt)
print(loss.numpy())
import numpy as np
import megengine as mge
import megengine.functional as F

Outputs:
ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32))
tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32))
loss = F.nn.l1_loss(ipt, tgt)
print(loss.numpy())

.. testoutput::
Outputs:

2.75
.. testoutput::

2.75
"""
diff = pred - label
return abs(diff)
@@ -94,53 +95,56 @@ def l1_loss(pred: Tensor, label: Tensor, reduction: str = "mean") -> Tensor:

@_reduce_output
def square_loss(pred: Tensor, label: Tensor, reduction: str = "mean") -> Tensor:
r"""
Calculates the mean squared error (squared L2 norm) between
r"""Calculates the mean squared error (squared L2 norm) between
each element in the pred :math:`x` and label :math:`y`.

The mean squared error can be described as:

.. math:: \ell(x, y) = mean\left( L \right)
.. math::

\ell(x, y) = mean\left( L \right)

where

.. math::

L = \{l_1,\dots,l_N\}, \quad
l_n = \left( x_n - y_n \right)^2,
L = \{l_1,\dots,l_N\}, \quad
l_n = \left( x_n - y_n \right)^2,

:math:`x` and :math:`y` are tensors of arbitrary shapes with a total
of :math:`N` elements each. :math:`N` is the batch size.

:param pred: predicted result from model.
:param label: ground truth to compare.
:param reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean'
:return: loss value.
Args:
pred: predicted result from model.
label: ground truth to compare.
reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean'

Returns:
loss value.

Shape:
- pred: :math:`(N, *)` where :math:`*` means any number of additional
dimensions.
- label: :math:`(N, *)`. Same shape as ``pred``.
* pred: :math:`(N, *)` where :math:`*` means any number of additional
dimensions.
* label: :math:`(N, *)`. Same shape as ``pred``.

Examples:

.. testcode::

import numpy as np
import megengine as mge
import megengine.functional as F
.. testcode::

ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32))
tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32))
loss = F.nn.square_loss(ipt, tgt)
print(loss.numpy())
import numpy as np
import megengine as mge
import megengine.functional as F

Outputs:
ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32))
tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32))
loss = F.nn.square_loss(ipt, tgt)
print(loss.numpy())

.. testoutput::
Outputs:

9.75
.. testoutput::

9.75
"""
diff = pred - label
return diff ** 2
@@ -155,8 +159,7 @@ def cross_entropy(
label_smooth: float = 0,
reduction: str = "mean",
) -> Tensor:
r"""
Computes the multi-class cross entropy loss (using logits by default).
r"""Computes the multi-class cross entropy loss (using logits by default).

By default (``with_logits`` is True), ``pred`` is assumed to be logits,
class probabilities are given by softmax.
@@ -170,35 +173,37 @@ def cross_entropy(
where :math:`y^{LS}` and :math:`y` are the new and original label distributions, respectively.
:math:`k` is the index of the label distribution, :math:`\alpha` is ``label_smooth`` and :math:`K` is the number of classes.

:param pred: input tensor representing the predicted probability.
:param label: input tensor representing the classification label.
:param axis: an axis along which softmax will be applied. Default: 1
:param with_logits: whether to apply softmax first. Default: True
:param label_smooth: a label smoothing of parameter that can re-distribute target distribution. Default: 0
:param reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean'
:return: loss value.
Args:
pred: input tensor representing the predicted probability.
label: input tensor representing the classification label.
axis: an axis along which softmax will be applied. Default: 1
with_logits: whether to apply softmax first. Default: True
label_smooth: a label smoothing parameter that can re-distribute the target distribution. Default: 0
reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean'

Examples:
Returns:
loss value.

.. testcode::
Examples:

import numpy as np
from megengine import tensor
import megengine.functional as F
.. testcode::

data_shape = (1, 2)
label_shape = (1, )
pred = tensor(np.array([0, 0], dtype=np.float32).reshape(data_shape))
label = tensor(np.ones(label_shape, dtype=np.int32))
loss = F.nn.cross_entropy(pred, label)
print(loss.numpy().round(decimals=4))
import numpy as np
from megengine import tensor
import megengine.functional as F

Outputs:
data_shape = (1, 2)
label_shape = (1, )
pred = tensor(np.array([0, 0], dtype=np.float32).reshape(data_shape))
label = tensor(np.ones(label_shape, dtype=np.int32))
loss = F.nn.cross_entropy(pred, label)
print(loss.numpy().round(decimals=4))

.. testoutput::
Outputs:

0.6931
.. testoutput::

0.6931
"""
n0 = pred.ndim
n1 = label.ndim
@@ -226,37 +231,38 @@ def cross_entropy(
def binary_cross_entropy(
pred: Tensor, label: Tensor, with_logits: bool = True, reduction: str = "mean",
) -> Tensor:
r"""
Computes the binary cross entropy loss (using logits by default).
r"""Computes the binary cross entropy loss (using logits by default).

By default (``with_logits`` is True), ``pred`` is assumed to be logits,
class probabilities are given by sigmoid.

:param pred: `(N, *)`, where `*` means any number of additional dimensions.
:param label: `(N, *)`, same shape as the input.
:param with_logits: bool, whether to apply sigmoid first. Default: True
:param reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean'
:return: loss value.
Args:
pred: `(N, *)`, where `*` means any number of additional dimensions.
label: `(N, *)`, same shape as the input.
with_logits: bool, whether to apply sigmoid first. Default: True
reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean'

Examples:
Returns:
loss value.

.. testcode::
Examples:

import numpy as np
from megengine import tensor
import megengine.functional as F
.. testcode::

pred = tensor(np.array([0, 0], dtype=np.float32).reshape(1, 2))
label = tensor(np.ones((1, 2), dtype=np.float32))
loss = F.nn.binary_cross_entropy(pred, label)
print(loss.numpy().round(decimals=4))
import numpy as np
from megengine import tensor
import megengine.functional as F

Outputs:
pred = tensor(np.array([0, 0], dtype=np.float32).reshape(1, 2))
label = tensor(np.ones((1, 2), dtype=np.float32))
loss = F.nn.binary_cross_entropy(pred, label)
print(loss.numpy().round(decimals=4))

.. testoutput::
Outputs:

0.6931
.. testoutput::

0.6931
"""
if not with_logits:
return -(label * log(pred) + (1 - label) * log(1 - pred))
@@ -269,37 +275,38 @@ def binary_cross_entropy(
def hinge_loss(
pred: Tensor, label: Tensor, norm: str = "L1", reduction: str = "mean"
) -> Tensor:
r"""
Caculates the hinge loss which is often used in SVM.
r"""Caculates the hinge loss which is often used in SVM.

The hinge loss can be described as:

.. math:: loss(x, y) = \frac{1}{N}\sum_i\sum_j(max(0, 1 - x_{ij}*y_{ij}))

:param pred: input tensor representing the predicted probability, shape is `(N, C)`.
:param label: input tensor representing the binary classification label, shape is `(N, C)`.
:param norm: specify the norm to caculate the loss, should be "L1" or "L2".
:param reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean'
:return: loss value.
Args:
pred: input tensor representing the predicted probability, shape is `(N, C)`.
label: input tensor representing the binary classification label, shape is `(N, C)`.
norm: the norm used to calculate the loss; should be "L1" or "L2".
reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean'

Examples:
Returns:
loss value.

.. testcode::
Examples:

from megengine import tensor
import megengine.functional as F
.. testcode::

pred = tensor([[0.5, -0.5, 0.1], [-0.6, 0.7, 0.8]], dtype="float32")
label = tensor([[1, -1, -1], [-1, 1, 1]], dtype="float32")
loss = F.nn.hinge_loss(pred, label)
print(loss.numpy())
from megengine import tensor
import megengine.functional as F

Outputs:
pred = tensor([[0.5, -0.5, 0.1], [-0.6, 0.7, 0.8]], dtype="float32")
label = tensor([[1, -1, -1], [-1, 1, 1]], dtype="float32")
loss = F.nn.hinge_loss(pred, label)
print(loss.numpy())

.. testoutput::
Outputs:

1.5
.. testoutput::

1.5
"""
norm = norm.upper()
assert norm in ["L1", "L2"], "norm must be L1 or L2"


+ 376
- 347
imperative/python/megengine/functional/math.py
File diff suppressed because it is too large
View File


+ 8
- 25
imperative/python/megengine/functional/metric.py View File

@@ -19,33 +19,16 @@ from .tensor import broadcast_to, transpose
def topk_accuracy(
logits: Tensor, target: Tensor, topk: Union[int, Iterable[int]] = 1
) -> Union[Tensor, Iterable[Tensor]]:
r"""
Calculates the classification accuracy given predicted logits and ground-truth labels.
r"""Calculates the classification accuracy given predicted logits and ground-truth labels.

:param logits: model predictions of shape `[batch_size, num_classes]`,
representing the probability (likelyhood) of each class.
:param target: ground-truth labels, 1d tensor of int32.
:param topk: specifies the topk values, could be an int or tuple of ints. Default: 1
:return: tensor(s) of classification accuracy between 0.0 and 1.0.
Args:
logits: model predictions of shape `[batch_size, num_classes]`,
representing the probability (likelihood) of each class.
target: ground-truth labels, 1d tensor of int32.
topk: specifies the topk values, could be an int or tuple of ints. Default: 1

Examples:

.. testcode::

import numpy as np
from megengine import tensor
import megengine.functional as F

logits = tensor(np.arange(80, dtype=np.int32).reshape(8,10))
target = tensor(np.arange(8, dtype=np.int32))
top1, top5 = F.metric.topk_accuracy(logits, target, (1, 5))
print(top1.numpy(), top5.numpy())

Outputs:

.. testoutput::

0.0 0.375
Returns:
tensor(s) of classification accuracy between 0.0 and 1.0.
"""
if isinstance(topk, int):
topk = (topk,)
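
The example the old docstring carried (visible in the removed lines above) still illustrates the call; reproduced here for reference:

.. code-block:: python

    import numpy as np
    from megengine import tensor
    import megengine.functional as F

    logits = tensor(np.arange(80, dtype=np.int32).reshape(8, 10))
    target = tensor(np.arange(8, dtype=np.int32))
    top1, top5 = F.metric.topk_accuracy(logits, target, (1, 5))
    print(top1.numpy(), top5.numpy())   # 0.0 0.375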


+ 366
- 411
imperative/python/megengine/functional/nn.py
File diff suppressed because it is too large
View File


+ 42
- 50
imperative/python/megengine/functional/quantized.py View File

@@ -28,32 +28,28 @@ def conv_bias_activation(
conv_mode="cross_correlation",
compute_mode="default",
) -> Tensor:
"""
Convolution bias with activation operation, only for inference.

:param inp: feature map of the convolution operation.
:param weight: convolution kernel.
:param bias: bias added to the result of convolution
:param stride: stride of the 2D convolution operation. Default: 1
:param padding: size of the paddings added to the input on both sides
of its spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 2D convolution operation. Default: 1
:param groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and the shape of weight should be `(groups, out_channel // groups,
in_channels // groups, height, width)`.
:type conv_mode: string or :class:`Convolution.Mode`.
:param conv_mode: supports 'cross_correlation' or 'convolution'. Default:
'cross_correlation'
:param dtype: support for ``np.dtype``, Default: np.int8
:type compute_mode: string or
:class:`Convolution.ComputeMode`.
:param compute_mode: when set to "default", no special requirements will be
placed on the precision of intermediate results. When set to "float32",
"float32" would be used for accumulator and intermediate result,
but only effective when input and output are of float16 dtype.
r"""Convolution bias with activation operation, only for inference.

Args:
inp: feature map of the convolution operation.
weight: convolution kernel.
bias: bias added to the result of convolution
stride: stride of the 2D convolution operation. Default: 1
padding: size of the paddings added to the input on both sides
of its spatial dimensions. Only zero-padding is supported. Default: 0
dilation: dilation of the 2D convolution operation. Default: 1
groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and the shape of weight should be `(groups, out_channel // groups,
in_channels // groups, height, width)`.
conv_mode: supports 'cross_correlation' or 'convolution'. Default:
'cross_correlation'
dtype: support for ``np.dtype``, Default: np.int8
compute_mode: when set to "default", no special requirements will be
placed on the precision of intermediate results. When set to "float32",
"float32" would be used for accumulator and intermediate result,
but only effective when input and output are of float16 dtype.
"""
ph, pw = _pair(padding)
sh, sw = _pair_nonzero(stride)
@@ -91,32 +87,28 @@ def batch_conv_bias_activation(
conv_mode="cross_correlation",
compute_mode="default",
) -> Tensor:
"""
Batch convolution bias with activation operation, only for inference.

:param inp: feature map of the convolution operation.
:param weight: convolution kernel in batched way.
:param bias: bias added to the result of convolution
:param stride: stride of the 2D convolution operation. Default: 1
:param padding: size of the paddings added to the input on both sides
of its spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 2D convolution operation. Default: 1
:param groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and the shape of weight should be `(groups, out_channel // groups,
in_channels // groups, height, width)`.
:type conv_mode: string or :class:`Convolution.Mode`.
:param conv_mode: supports 'cross_correlation' or 'convolution'. Default:
'cross_correlation'
:param dtype: support for ``np.dtype``, Default: np.int8
:type compute_mode: string or
:class:`Convolution.ComputeMode`.
:param compute_mode: when set to "default", no special requirements will be
placed on the precision of intermediate results. When set to "float32",
"float32" would be used for accumulator and intermediate result,
but only effective when input and output are of float16 dtype.
r"""Batch convolution bias with activation operation, only for inference.

Args:
inp: feature map of the convolution operation.
weight: convolution kernel in batched way.
bias: bias added to the result of convolution
stride: stride of the 2D convolution operation. Default: 1
padding: size of the paddings added to the input on both sides
of its spatial dimensions. Only zero-padding is supported. Default: 0
dilation: dilation of the 2D convolution operation. Default: 1
groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and the shape of weight should be `(groups, out_channel // groups,
in_channels // groups, height, width)`.
conv_mode: supports 'cross_correlation' or 'convolution'. Default:
'cross_correlation'
dtype: support for ``np.dtype``, Default: np.int8
compute_mode: when set to "default", no special requirements will be
placed on the precision of intermediate results. When set to "float32",
"float32" would be used for accumulator and intermediate result,
but only effective when input and output are of float16 dtype.
"""
ph, pw = _pair(padding)
sh, sw = _pair_nonzero(stride)


+ 527
- 491
imperative/python/megengine/functional/tensor.py
File diff suppressed because it is too large
View File


+ 16
- 17
imperative/python/megengine/functional/utils.py View File

@@ -19,37 +19,36 @@ __all__ = ["topk_accuracy"]
def _assert_equal(
expect: Tensor, actual: Tensor, *, maxerr: float = 0.0001, verbose: bool = False
):
r"""
Asserts two tensors equal and returns expected value (first input).
r"""Asserts two tensors equal and returns expected value (first input).
It is a variant of python assert which is symbolically traceable (similar to ``numpy.testing.assert_equal``).
If we want to verify the correctness of a model, we can simply ``assert`` its states and outputs.
But sometimes we need to verify the correctness on different backends for a *dumped* model
(or in a :class:`~jit.trace` context), where no python code can be executed.
Thus we have to use :func:`~functional.utils._assert_equal` instead.

:param expect: expected tensor value
:param actual: tensor to check value
:param maxerr: max allowed error; error is defined as the minimal of absolute and relative error
:param verbose: whether to print maxerr to stdout during opr exec
:return: expected tensor
Args:
expect: expected tensor value
actual: tensor to check value
maxerr: max allowed error; error is defined as the minimum of the absolute and relative error
verbose: whether to print maxerr to stdout during opr exec

Examples:

.. testcode::
.. testcode::

import numpy as np
from megengine import tensor
import megengine.functional as F
import numpy as np
from megengine import tensor
import megengine.functional as F

x = tensor([1, 2, 3], np.float32)
y = tensor([1, 2, 3], np.float32)
print(F.utils._assert_equal(x, y, maxerr=0).numpy())
x = tensor([1, 2, 3], np.float32)
y = tensor([1, 2, 3], np.float32)
print(F.utils._assert_equal(x, y, maxerr=0).numpy())

Outputs:
Outputs:

.. testoutput::
.. testoutput::

[1. 2. 3.]
[1. 2. 3.]
"""
err = (
abs(expect - actual)


+ 234
- 230
imperative/python/megengine/functional/vision.py View File

@@ -21,31 +21,32 @@ from .tensor import broadcast_to, concat, expand_dims, reshape, transpose


def cvt_color(inp: Tensor, mode: str = ""):
r"""
Convert images from one format to another
r"""Convert images from one format to another

:param inp: input images.
:param mode: format mode.
:return: convert result.
Args:
inp: input images.
mode: format mode.

Examples:
Returns:
convert result.

.. testcode::
Examples:

import numpy as np
import megengine as mge
import megengine.functional as F
.. testcode::

x = mge.tensor(np.array([[[[-0.58675045, 1.7526233, 0.10702174]]]]).astype(np.float32))
y = F.vision.cvt_color(x, mode="RGB2GRAY")
print(y.numpy())
import numpy as np
import megengine as mge
import megengine.functional as F

Outputs:
x = mge.tensor(np.array([[[[-0.58675045, 1.7526233, 0.10702174]]]]).astype(np.float32))
y = F.vision.cvt_color(x, mode="RGB2GRAY")
print(y.numpy())

.. testoutput::
Outputs:

[[[[0.86555195]]]]
.. testoutput::

[[[[0.86555195]]]]
"""
mode = mode.upper()
assert mode in builtin.CvtColor.Mode.__dict__, "unspport mode for cvt_color"
@@ -63,37 +64,38 @@ def roi_pooling(
mode: str = "max",
scale: float = 1.0,
) -> Tensor:
"""
Applies roi pooling on input feature.
r"""Applies roi pooling on input feature.

:param inp: tensor that represents the input feature, `(N, C, H, W)` images.
:param rois: `(K, 5)` boxes. First column is the index into N. The other 4 columns are xyxy.
:param output_shape: `(height, width)` of output rois feature.
:param mode: "max" or "average", use max/average align just like max/average pooling. Default: "max"
:param scale: scale the input boxes by this number. Default: 1.0
:return: `(K, C, output_shape[0], output_shape[1])` feature of rois.
Args:
inp: tensor that represents the input feature, `(N, C, H, W)` images.
rois: `(K, 5)` boxes. First column is the index into N. The other 4 columns are xyxy.
output_shape: `(height, width)` of output rois feature.
mode: "max" or "average", use max/average align just like max/average pooling. Default: "max"
scale: scale the input boxes by this number. Default: 1.0

Examples:
Returns:
`(K, C, output_shape[0], output_shape[1])` feature of rois.

.. testcode::
Examples:

import numpy as np
from megengine import tensor
import megengine.functional as F
.. testcode::

np.random.seed(42)
inp = tensor(np.random.randn(1, 1, 128, 128))
rois = tensor(np.random.random((4, 5)))
y = F.vision.roi_pooling(inp, rois, (2, 2))
print(y.numpy()[0].round(decimals=4))
import numpy as np
from megengine import tensor
import megengine.functional as F

Outputs:
np.random.seed(42)
inp = tensor(np.random.randn(1, 1, 128, 128))
rois = tensor(np.random.random((4, 5)))
y = F.vision.roi_pooling(inp, rois, (2, 2))
print(y.numpy()[0].round(decimals=4))

.. testoutput::
Outputs:

[[[-0.1383 -0.1383]
[-0.5035 -0.5035]]]
.. testoutput::

[[[-0.1383 -0.1383]
[-0.5035 -0.5035]]]
"""
assert mode.lower() in ["max", "average"], "only max/average mode is supported"
if isinstance(output_shape, int):
@@ -116,17 +118,17 @@ def correlation(
pad_size: int = 0,
is_multiply: bool = True,
) -> Tensor:
""" Applies correlation to inputs.
:param data1: Input data1 to the correlation. format must be nchw
:param data2: Input data2 to the correlation. format must be nchw
:param kernel_size: (int (non-negative), optional, default=1) – kernel size for Correlation must be an odd number
:param max_displacement: (int (non-negative), optional, default=1) – Max displacement of Correlation
:param stride1: (int (non-negative), optional, default=1) – stride1 quantize data1 globally
:param stride2: (int (non-negative), optional, default=1) – stride2 quantize data2 within the neighborhood centered around data1
:param pad_size: (int (non-negative), optional, default=0) – pad for Correlation
:param is_multiply: (boolean, optional, default=True) – operation type is either multiplication or absolute difference
r"""Applies correlation to inputs.
Args:
data1: Input data1 to the correlation. format must be nchw
data2: Input data2 to the correlation. format must be nchw
kernel_size: int (non-negative), optional, default=1) – kernel size for Correlation must be an odd number
max_displacement: int (non-negative), optional, default=1) – Max displacement of Correlation
stride1: int (non-negative), optional, default=1) – stride1 quantize data1 globally
stride2: int (non-negative), optional, default=1) – stride2 quantize data2 within the neighborhood centered around data1
pad_size: int (non-negative), optional, default=0) – pad for Correlation
is_multiply: boolean, optional, default=True) – operation type is either multiplication or absolute difference
"""

op = builtin.Correlation(
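
Since this docstring carries no example, here is a hedged sketch of a call with small NCHW inputs; shapes and argument values are illustrative:

.. code-block:: python

    import numpy as np
    from megengine import tensor
    import megengine.functional as F

    data1 = tensor(np.random.randn(1, 1, 8, 8).astype(np.float32))   # NCHW
    data2 = tensor(np.random.randn(1, 1, 8, 8).astype(np.float32))
    out = F.vision.correlation(data1, data2, kernel_size=1, max_displacement=1,
                               stride1=1, stride2=1, pad_size=0, is_multiply=True)
    print(out.shape)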
@@ -152,41 +154,42 @@ def roi_align(
sample_points: Union[int, tuple, list] = 2,
aligned: bool = True,
) -> Tensor:
"""
Applies roi align on input feature.

:param inp: tensor that represents the input feature, shape is `(N, C, H, W)`.
:param rois: `(N, 5)` boxes. First column is the box index. The other 4 columns are ``xyxy``.
:param output_shape: `(height, width)` shape of output rois feature.
:param mode: "max" or "average", use max/average align just like max/average pooling. Default: "average"
:param spatial_scale: scale the input boxes by this number. Default: 1.0
:param sample_points: number of inputs samples to take for each output sample.
0 to take samples densely. Default: 2
:param aligned: wheather to align the input feature, with `aligned=True`,
we first appropriately scale the ROI and then shift it by -0.5. Default: True
:return: output tensor.
r"""Applies roi align on input feature.

Args:
inp: tensor that represents the input feature, shape is `(N, C, H, W)`.
rois: `(N, 5)` boxes. First column is the box index. The other 4 columns are ``xyxy``.
output_shape: `(height, width)` shape of output rois feature.
mode: "max" or "average", use max/average align just like max/average pooling. Default: "average"
spatial_scale: scale the input boxes by this number. Default: 1.0
sample_points: number of inputs samples to take for each output sample.
0 to take samples densely. Default: 2
aligned: whether to align the input feature, with `aligned=True`,
we first appropriately scale the ROI and then shift it by -0.5. Default: True

Returns:
output tensor.

Examples:

.. testcode::
.. testcode::

import numpy as np
from megengine import tensor
import megengine.functional as F
import numpy as np
from megengine import tensor
import megengine.functional as F

np.random.seed(42)
inp = tensor(np.random.randn(1, 1, 128, 128))
rois = tensor(np.random.random((4, 5)))
y = F.vision.roi_align(inp, rois, (2, 2))
print(y.numpy()[0].round(decimals=4))

Outputs:
np.random.seed(42)
inp = tensor(np.random.randn(1, 1, 128, 128))
rois = tensor(np.random.random((4, 5)))
y = F.vision.roi_align(inp, rois, (2, 2))
print(y.numpy()[0].round(decimals=4))

.. testoutput::
Outputs:

[[[0.175 0.175 ]
[0.1359 0.1359]]]
.. testoutput::

[[[0.175 0.175 ]
[0.1359 0.1359]]]
"""
if inp.dtype != np.float32:
inp = inp.astype(np.float32)
@@ -217,43 +220,43 @@ def roi_align(
def nms(
boxes: Tensor, scores: Tensor, iou_thresh: float, max_output: Optional[int] = None
) -> Tensor:
r"""
Performs non-maximum suppression (NMS) on the boxes according to their intersection-over-union(IoU).
r"""Performs non-maximum suppression (NMS) on the boxes according to their intersection-over-union(IoU).

:param boxes: tensor of shape `(N, 4)`; the boxes to perform nms on; each box is expected to be in `(x1, y1, x2, y2)` format.
:param iou_thresh: IoU threshold for overlapping.
:param scores: tensor of shape `(N,)`, the score of boxes.
:param max_output: the maximum number of boxes to keep; it is optional if this operator is not traced
otherwise it required to be specified; if it is not specified, all boxes are kept.
:return: indices of the elements that have been kept by NMS, sorted by scores.
Args:
boxes: tensor of shape `(N, 4)`; the boxes to perform nms on; each box is expected to be in `(x1, y1, x2, y2)` format.
iou_thresh: IoU threshold for overlapping.
scores: tensor of shape `(N,)`, the score of boxes.
max_output: the maximum number of boxes to keep; it is optional if this operator is not traced
otherwise it is required to be specified; if it is not specified, all boxes are kept.

.. note::
Returns:
indices of the elements that have been kept by NMS, sorted by scores.

max_output should be specified and should have valid positive value under tracing
Note:
max_output should be specified and should have a valid positive value under tracing.

Examples:

.. testcode::

import numpy as np
from megengine import tensor
import megengine.functional as F
.. testcode::

x = np.zeros((100,4))
np.random.seed(42)
x[:,:2] = np.random.rand(100,2)*20
x[:,2:] = np.random.rand(100,2)*20 + 100
scores = tensor(np.random.rand(100))
inp = tensor(x)
result = F.vision.nms(inp, scores, iou_thresh=0.7)
print(result.numpy())
import numpy as np
from megengine import tensor
import megengine.functional as F

Outputs:
x = np.zeros((100,4))
np.random.seed(42)
x[:,:2] = np.random.rand(100,2)*20
x[:,2:] = np.random.rand(100,2)*20 + 100
scores = tensor(np.random.rand(100))
inp = tensor(x)
result = F.vision.nms(inp, scores, iou_thresh=0.7)
print(result.numpy())

.. testoutput::
Outputs:

[75 69]
.. testoutput::

[75 69]
"""
assert (
boxes.ndim == 2 and boxes.shape[1] == 4
@@ -286,45 +289,46 @@ def remap(
scalar: float = 0.0,
interp_mode: str = "linear",
) -> Tensor:
r"""
Applies remap transformation to batched 2D images.
r"""Applies remap transformation to batched 2D images.

The input images are transformed to the output images by the tensor map_xy.
The output's H and W are the same as map_xy's H and W.

:param inp: input image
:param map_xy: (batch, oh, ow, 2) transformation matrix
:param border_mode: pixel extrapolation method.
Default: "replicate". Currently also support "constant", "reflect",
"reflect_101", "wrap".
:param scalar: value used in case of a constant border. Default: 0
:param interp_mode: interpolation methods.
Default: "linear". Currently only support "linear" mode.
:return: output tensor.
Args:
inp: input image
map_xy: `(batch, oh, ow, 2)` transformation matrix
border_mode: pixel extrapolation method.
Default: "replicate". Currently also support "constant", "reflect",
"reflect_101", "wrap".
scalar: value used in case of a constant border. Default: 0
interp_mode: interpolation methods.
Default: "linear". Currently only support "linear" mode.

Returns:
output tensor.

Examples:

.. testcode::
.. testcode::

import numpy as np
from megengine import tensor
import megengine.functional as F
inp_shape = (1, 1, 4, 4)
inp = tensor(np.arange(16, dtype=np.float32).reshape(inp_shape))
map_xy_shape = (1, 2, 2, 2)
map_xy = tensor(np.array([[[1., 0.],[0., 1.]],
[[0., 1.],[0., 1.]]],
dtype=np.float32).reshape(map_xy_shape))
out = F.vision.remap(inp, map_xy)
print(out.numpy())

Outputs:
import numpy as np
from megengine import tensor
import megengine.functional as F
inp_shape = (1, 1, 4, 4)
inp = tensor(np.arange(16, dtype=np.float32).reshape(inp_shape))
map_xy_shape = (1, 2, 2, 2)
map_xy = tensor(np.array([[[1., 0.],[0., 1.]],
[[0., 1.],[0., 1.]]],
dtype=np.float32).reshape(map_xy_shape))
out = F.vision.remap(inp, map_xy)
print(out.numpy())

.. testoutput::
Outputs:

[[[[1. 4.]
[4. 4.]]]]
.. testoutput::

[[[[1. 4.]
[4. 4.]]]]
"""

op = builtin.Remap(
@@ -344,27 +348,28 @@ def warp_affine(
format: str = "NHWC",
interp_mode: str = "linear",
) -> Tensor:
"""
Batched affine transform on 2D images.

:param inp: input image.
:param mat: `(batch, 2, 3)` transformation matrix.
:param out_shape: output tensor shape.
:param border_mode: pixel extrapolation method.
Default: "wrap". Currently "constant", "reflect",
"reflect_101", "isolated", "wrap", "replicate", "transparent" are supported.
:param border_val: value used in case of a constant border. Default: 0
:param format: "NHWC" as default based on historical concerns,
"NCHW" is also supported. Default: "NHWC".
:param interp_mode: interpolation methods. Could be "linear", "nearest", "cubic", "area".
Default: "linear".
:return: output tensor.

.. note::

Here all available options for params are listed,
however it does not mean that you can use all the combinations.
On different platforms, different combinations are supported.
r"""Batched affine transform on 2D images.

Args:
inp: input image.
mat: `(batch, 2, 3)` transformation matrix.
out_shape: output tensor shape.
border_mode: pixel extrapolation method.
Default: "wrap". Currently "constant", "reflect",
"reflect_101", "isolated", "wrap", "replicate", "transparent" are supported.
border_val: value used in case of a constant border. Default: 0
format: NHWC" as default based on historical concerns,
"NCHW" is also supported. Default: "NHWC".
interp_mode: interpolation methods. Could be "linear", "nearest", "cubic", "area".
Default: "linear".

Returns:
output tensor.

Note:
Here all available options for params are listed,
however it does not mean that you can use all the combinations.
On different platforms, different combinations are supported.
"""
op = builtin.WarpAffine(
border_mode=border_mode,
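Since the ``warp_affine`` docstring above carries no example, here is a minimal, hedged usage sketch; the translation matrix and the NHWC test input are illustrative and not taken from the original docs.

.. code-block:: python

    import numpy as np
    from megengine import tensor
    import megengine.functional as F

    # one 4x4 single-channel image in the default NHWC layout
    inp = tensor(np.arange(16, dtype=np.float32).reshape(1, 4, 4, 1))
    # (batch, 2, 3) affine matrix: identity with a +1 pixel shift on both axes
    mat = tensor(np.array([[[1.0, 0.0, 1.0],
                            [0.0, 1.0, 1.0]]], dtype=np.float32))
    out = F.vision.warp_affine(inp, mat, (2, 2))
    print(out.numpy().shape)  # expected: (1, 2, 2, 1)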
@@ -387,8 +392,7 @@ def warp_perspective(
format: str = "NCHW",
interp_mode: str = "linear",
) -> Tensor:
r"""
Applies perspective transformation to batched 2D images.
r"""Applies perspective transformation to batched 2D images.

The input images are transformed to the output images by the transformation matrix:

@@ -401,48 +405,49 @@ def warp_perspective(
Optionally, we can set `mat_idx` to assign different transformations to the same image,
otherwise the input images and transformations should be in one-to-one correspondence.

:param inp: input image.
:param mat: `(batch, 3, 3)` transformation matrix.
:param out_shape: `(h, w)` size of the output image.
:param mat_idx: `(batch, )` image batch idx assigned to each matrix. Default: None
:param border_mode: pixel extrapolation method.
Default: "replicate". Currently also support "constant", "reflect",
"reflect_101", "wrap".
:param border_val: value used in case of a constant border. Default: 0
:param format: "NHWC" is also supported. Default: "NCHW".
:param interp_mode: interpolation methods.
Default: "linear". Currently only support "linear" mode.
:return: output tensor.

.. note::

The transformation matrix is the inverse of that used by `cv2.warpPerspective`.
Args:
inp: input image.
mat: `(batch, 3, 3)` transformation matrix.
out_shape: `(h, w)` size of the output image.
mat_idx: `(batch, )` image batch idx assigned to each matrix. Default: None
border_mode: pixel extrapolation method.
Default: "replicate". Currently also support "constant", "reflect",
"reflect_101", "wrap".
border_val: value used in case of a constant border. Default: 0
format: NHWC" is also supported. Default: "NCHW".
interp_mode: interpolation methods.
Default: "linear". Currently only support "linear" mode.

Returns:
output tensor.

Note:
The transformation matrix is the inverse of that used by `cv2.warpPerspective`.

Examples:

.. testcode::

import numpy as np
from megengine import tensor
import megengine.functional as F
.. testcode::

inp_shape = (1, 1, 4, 4)
x = tensor(np.arange(16, dtype=np.float32).reshape(inp_shape))
M_shape = (1, 3, 3)
# M defines a translation: dst(1, 1, h, w) = rst(1, 1, h+1, w+1)
M = tensor(np.array([[1., 0., 1.],
[0., 1., 1.],
[0., 0., 1.]], dtype=np.float32).reshape(M_shape))
out = F.vision.warp_perspective(x, M, (2, 2))
print(out.numpy())
import numpy as np
from megengine import tensor
import megengine.functional as F

Outputs:
inp_shape = (1, 1, 4, 4)
x = tensor(np.arange(16, dtype=np.float32).reshape(inp_shape))
M_shape = (1, 3, 3)
# M defines a translation: dst(1, 1, h, w) = rst(1, 1, h+1, w+1)
M = tensor(np.array([[1., 0., 1.],
[0., 1., 1.],
[0., 0., 1.]], dtype=np.float32).reshape(M_shape))
out = F.vision.warp_perspective(x, M, (2, 2))
print(out.numpy())

.. testoutput::
Outputs:

[[[[ 5. 6.]
[ 9. 10.]]]]
.. testoutput::

[[[[ 5. 6.]
[ 9. 10.]]]]
"""
if inp.dtype == np.float32:
mat = mat.astype("float32")
@@ -467,48 +472,48 @@ def interpolate(
mode: str = "bilinear",
align_corners: Optional[bool] = None,
) -> Tensor:
r"""
Down/up samples the input tensor to either the given size or with the given scale_factor. ``size`` can not coexist with ``scale_factor``.

:param inp: input tensor.
:param size: size of the output tensor. Default: None
:param scale_factor: scaling factor of the output tensor. Default: None
:param mode: interpolation methods, acceptable values are:
"bilinear", "linear", "bicubic" and "nearest". Default: "bilinear"
:param align_corners: This only has an effect when `mode`
is "bilinear" or "linear". Geometrically, we consider the pixels of the input
and output as squares rather than points. If set to ``True``, the input
and output tensors are aligned by the center points of their corner
pixels, preserving the values at the corner pixels. If set to ``False``,
the input and output tensors are aligned by the corner points of their
corner pixels, and the interpolation uses edge value padding for
out-of-boundary values, making this operation *independent* of input size

:return: output tensor.
r"""Down/up samples the input tensor to either the given size or with the given scale_factor. ``size`` can not coexist with ``scale_factor``.

Args:
inp: input tensor.
size: size of the output tensor. Default: None
scale_factor: scaling factor of the output tensor. Default: None
mode: interpolation methods, acceptable values are:
"bilinear", "linear", "bicubic" and "nearest". Default: "bilinear"
align_corners: This only has an effect when `mode`
is "bilinear" or "linear". Geometrically, we consider the pixels of the input
and output as squares rather than points. If set to ``True``, the input
and output tensors are aligned by the center points of their corner
pixels, preserving the values at the corner pixels. If set to ``False``,
the input and output tensors are aligned by the corner points of their
corner pixels, and the interpolation uses edge value padding for
out-of-boundary values, making this operation *independent* of input size.

Returns:
output tensor.

Examples:

.. testcode::

import numpy as np
from megengine import tensor
import megengine.functional as F
.. testcode::

x = tensor(np.arange(1, 5, dtype=np.float32).reshape(1, 1, 2, 2))
out = F.vision.interpolate(x, [4, 4], align_corners=False)
print(out.numpy())
out2 = F.vision.interpolate(x, scale_factor=2.)
np.testing.assert_allclose(out.numpy(), out2.numpy())
import numpy as np
from megengine import tensor
import megengine.functional as F

Outputs:
x = tensor(np.arange(1, 5, dtype=np.float32).reshape(1, 1, 2, 2))
out = F.vision.interpolate(x, [4, 4], align_corners=False)
print(out.numpy())
out2 = F.vision.interpolate(x, scale_factor=2.)
np.testing.assert_allclose(out.numpy(), out2.numpy())

.. testoutput::
Outputs:

[[[[1. 1.25 1.75 2. ]
[1.5 1.75 2.25 2.5 ]
[2.5 2.75 3.25 3.5 ]
[3. 3.25 3.75 4. ]]]]
.. testoutput::

[[[[1. 1.25 1.75 2. ]
[1.5 1.75 2.25 2.5 ]
[2.5 2.75 3.25 3.5 ]
[3. 3.25 3.75 4. ]]]]
"""
mode = mode.lower()
if mode not in ["bilinear", "linear", "bicubic", "nearest"]:
@@ -623,15 +628,15 @@ def interpolate(


def nvof(src: Tensor, precision: int = 1) -> Tensor:
r"""
Implements NVIDIA Optical Flow SDK.
r"""Implements NVIDIA Optical Flow SDK.

Args:
src: input tensor with shape (n, t, h, w, c4) and uint8 dtype.
precision: 0:NV_OF_PERF_LEVEL_SLOW 1:NV_OF_PERF_LEVEL_MEDIUM 2:NV_OF_PERF_LEVEL_FAST.

:src shape: input tensor with shape (n, t, h, w, c4).
:src dtype: uint8.
:param precision: 0:NV_OF_PERF_LEVEL_SLOW 1:NV_OF_PERF_LEVEL_MEDIUM 2:NV_OF_PERF_LEVEL_FAST.
:output shape: ``(n, t-1, (h+out_grid_size-1)//out_grid_size, (w+out_grid_size-1)//out_grid_size, c2)``.
By default, out_grid_size = 4.
:output dtype: int16.
Returns:
output tensor with shape: ``(n, t-1, (h+out_grid_size-1)//out_grid_size, (w+out_grid_size-1)//out_grid_size, c2)``.
By default, out_grid_size = 4. The output dtype is int16.

.. code-block:: python

@@ -643,7 +648,6 @@ def nvof(src: Tensor, precision: int = 1) -> Tensor:
src = tensor(x)
result = F.nn.nvof(src, precision=1)
print(result.numpy())

"""
assert src.ndim == 5 and src.shape[4] == 4



+ 6
- 6
imperative/python/megengine/hub/exceptions.py View File

@@ -7,24 +7,24 @@
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
class FetcherError(Exception):
"""Base class for fetch related error."""
r"""Base class for fetch related error."""


class InvalidRepo(FetcherError):
"""The repo provided was somehow invalid."""
r"""The repo provided was somehow invalid."""


class InvalidGitHost(FetcherError):
"""The git host provided was somehow invalid."""
r"""The git host provided was somehow invalid."""


class GitPullError(FetcherError):
"""A git pull error occurred."""
r"""A git pull error occurred."""


class GitCheckoutError(FetcherError):
"""A git checkout error occurred."""
r"""A git checkout error occurred."""


class InvalidProtocol(FetcherError):
"""The protocol provided was somehow invalid."""
r"""The protocol provided was somehow invalid."""

+ 25
- 36
imperative/python/megengine/hub/fetcher.py View File

@@ -102,24 +102,18 @@ class GitSSHFetcher(RepoFetcherBase):
commit: str = None,
silent: bool = True,
) -> str:
"""
Fetches git repo by SSH protocol

:param git_host:
host address of git repo.
Example: github.com
:param repo_info:
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified.
Example: ``"brain_sdk/MegBrain[:hub]"``
:param use_cache:
whether to use locally fetched code or completely re-fetch.
:param commit:
commit id on github or gitlab.
:param silent:
whether to accept the stdout and stderr of the subprocess with PIPE, instead of
displaying on the screen.
:return:
"""Fetches git repo by SSH protocol

Args:
git_host: host address of git repo. Eg: github.com
repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified. Eg: ``"brain_sdk/MegBrain[:hub]"``
use_cache: whether to use locally fetched code or completely re-fetch.
commit: commit id on github or gitlab.
silent: whether to accept the stdout and stderr of the subprocess with PIPE, instead of
displaying on the screen.

Returns:
directory where the repo code is stored.
"""
if not cls._check_git_host(git_host):
@@ -217,24 +211,19 @@ class GitHTTPSFetcher(RepoFetcherBase):
commit: str = None,
silent: bool = True,
) -> str:
"""
Fetches git repo by HTTPS protocol.

:param git_host:
host address of git repo.
Example: github.com
:param repo_info:
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified.
Example: ``"brain_sdk/MegBrain[:hub]"``
:param use_cache:
whether to use locally cached code or completely re-fetch.
:param commit:
commit id on github or gitlab.
:param silent:
whether to accept the stdout and stderr of the subprocess with PIPE, instead of
displaying on the screen.
:return:
"""Fetches git repo by HTTPS protocol.

Args:
git_host: host address of git repo. Eg: github.com
repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified. Eg: ``"brain_sdk/MegBrain[:hub]"``
use_cache: whether to use locally cached code or completely re-fetch.
commit: commit id on github or gitlab.
silent: whether to accept the stdout and stderr of the subprocess with PIPE, instead of
displaying on the screen.

Returns:
directory where the repo code is stored.
"""
if not cls._check_git_host(git_host):


+ 60
- 88
imperative/python/megengine/hub/hub.py View File

@@ -43,9 +43,7 @@ PROTOCOLS = {


def _get_megengine_home() -> str:
"""
MGE_HOME setting complies with the XDG Base Directory Specification
"""
r"""MGE_HOME setting complies with the XDG Base Directory Specification"""
megengine_home = os.path.expanduser(
os.getenv(
ENV_MGE_HOME,
@@ -95,24 +93,18 @@ def _init_hub(
commit: str = None,
protocol: str = DEFAULT_PROTOCOL,
):
"""
Imports hubmodule like python import.

:param repo_info:
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified.
Example: ``"brain_sdk/MegBrain[:hub]"``
:param git_host:
host address of git repo.
Example: github.com
:param use_cache:
whether to use locally cached code or completely re-fetch.
:param commit:
commit id on github or gitlab.
:param protocol:
which protocol to use to get the repo, and HTTPS protocol only supports public repo on github.
The value should be one of HTTPS, SSH.
:return:
r"""Imports hubmodule like python import.

Args:
repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified. Eg: ``"brain_sdk/MegBrain[:hub]"``
git_host: host address of git repo. Eg: github.com
use_cache: whether to use locally cached code or completely re-fetch.
commit: commit id on github or gitlab.
protocol: which protocol to use to get the repo, and HTTPS protocol only supports public repo on github.
The value should be one of HTTPS, SSH.

Returns:
a python module.
"""
cache_dir = os.path.expanduser(os.path.join(_get_megengine_home(), "hub"))
@@ -139,24 +131,18 @@ def list(
commit: str = None,
protocol: str = DEFAULT_PROTOCOL,
) -> List[str]:
"""
Lists all entrypoints available in repo hubconf.

:param repo_info:
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified.
Example: ``"brain_sdk/MegBrain[:hub]"``
:param git_host:
host address of git repo.
Example: github.com
:param use_cache:
whether to use locally cached code or completely re-fetch.
:param commit:
commit id on github or gitlab.
:param protocol:
which protocol to use to get the repo, and HTTPS protocol only supports public repo on github.
The value should be one of HTTPS, SSH.
:return:
r"""Lists all entrypoints available in repo hubconf.

Args:
repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified. Eg: ``"brain_sdk/MegBrain[:hub]"``
git_host: host address of git repo. Eg: github.com
use_cache: whether to use locally cached code or completely re-fetch.
commit: commit id on github or gitlab.
protocol: which protocol to use to get the repo, and HTTPS protocol only supports public repo on github.
The value should be one of HTTPS, SSH.

Returns:
all entrypoint names of the model.
"""
hubmodule = _init_hub(repo_info, git_host, use_cache, commit, protocol)
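A hedged sketch of calling ``list``; the repo name below is an assumed example of a public model-zoo repo, and fetching it requires network access.

.. code-block:: python

    import megengine.hub as hub

    # enumerate every entrypoint defined in the repo's hubconf.py
    # ("megengine/models" is an illustrative repo_info value)
    entrypoints = hub.list("megengine/models", git_host="github.com")
    print(entrypoints)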
@@ -178,26 +164,19 @@ def load(
protocol: str = DEFAULT_PROTOCOL,
**kwargs
) -> Any:
"""
Loads model from github or gitlab repo, with pretrained weights.

:param repo_info:
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified.
Example: ``"brain_sdk/MegBrain[:hub]"``
:param entry:
an entrypoint defined in hubconf.
:param git_host:
host address of git repo.
Example: github.com
:param use_cache:
whether to use locally cached code or completely re-fetch.
:param commit:
commit id on github or gitlab.
:param protocol:
which protocol to use to get the repo, and HTTPS protocol only supports public repo on github.
The value should be one of HTTPS, SSH.
:return:
r"""Loads model from github or gitlab repo, with pretrained weights.

Args:
repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified. Eg: ``"brain_sdk/MegBrain[:hub]"``
entry: an entrypoint defined in hubconf.
git_host: host address of git repo. Eg: github.com
use_cache: whether to use locally cached code or completely re-fetch.
commit: commit id on github or gitlab.
protocol: which protocol to use to get the repo, and HTTPS protocol only supports public repo on github.
The value should be one of HTTPS, SSH.

Returns:
a single model with corresponding pretrained weights.
"""
hubmodule = _init_hub(repo_info, git_host, use_cache, commit, protocol)
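A hedged sketch of ``load``; the repo and entrypoint names are assumptions for illustration, and the call downloads code and weights over the network.

.. code-block:: python

    import megengine.hub as hub

    # fetch the repo, call its "resnet18" entrypoint and fill in pretrained weights
    model = hub.load("megengine/models", "resnet18", pretrained=True)
    model.eval()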
@@ -219,30 +198,23 @@ def help(
commit: str = None,
protocol: str = DEFAULT_PROTOCOL,
) -> str:
"""
This function returns docstring of entrypoint ``entry`` by following steps:
r"""This function returns docstring of entrypoint ``entry`` by following steps:

1. Pull the repo code specified by git and repo_info.
2. Load the entry defined in repo's hubconf.py
3. Return docstring of function entry.

:param repo_info:
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified.
Example: ``"brain_sdk/MegBrain[:hub]"``
:param entry:
an entrypoint defined in hubconf.py
:param git_host:
host address of git repo.
Example: github.com
:param use_cache:
whether to use locally cached code or completely re-fetch.
:param commit:
commit id on github or gitlab.
:param protocol:
which protocol to use to get the repo, and HTTPS protocol only supports public repo on github.
The value should be one of HTTPS, SSH.
:return:
Args:
repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified. Eg: ``"brain_sdk/MegBrain[:hub]"``
entry: an entrypoint defined in hubconf.py
git_host: host address of git repo. Eg: github.com
use_cache: whether to use locally cached code or completely re-fetch.
commit: commit id on github or gitlab.
protocol: which protocol to use to get the repo, and HTTPS protocol only supports public repo on github.
The value should be one of HTTPS, SSH.

Returns:
docstring of entrypoint ``entry``.
"""
hubmodule = _init_hub(repo_info, git_host, use_cache, commit, protocol)
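A hedged sketch of ``help``, again with an assumed repo and entrypoint; it only pulls the repo code and returns the entrypoint's docstring.

.. code-block:: python

    import megengine.hub as hub

    # fetch the repo and print the docstring of one entrypoint
    print(hub.help("megengine/models", "resnet18"))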
@@ -255,16 +227,17 @@ def help(


def load_serialized_obj_from_url(url: str, model_dir=None) -> Any:
"""
Loads MegEngine serialized object from the given URL.
"""Loads MegEngine serialized object from the given URL.

If the object is already present in ``model_dir``, it's deserialized and
returned. If no ``model_dir`` is specified, it will be ``MGE_HOME/serialized``.

:param url: url to serialized object.
:param model_dir: dir to cache target serialized file.
Args:
url: url to serialized object.
model_dir: dir to cache target serialized file.

:return: loaded object.
Returns:
loaded object.
"""
if model_dir is None:
model_dir = os.path.join(_get_megengine_home(), "serialized")
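A hedged sketch of ``load_serialized_obj_from_url``; the URL is a placeholder and the import assumes the function is re-exported from ``megengine.hub``. The downloaded file is cached under ``MGE_HOME/serialized`` unless ``model_dir`` is given.

.. code-block:: python

    from megengine.hub import load_serialized_obj_from_url

    # URL is a placeholder; replace it with a real serialized-object URL
    state_dict = load_serialized_obj_from_url(
        "https://example.com/pretrained_resnet18.pkl"
    )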
@@ -297,8 +270,7 @@ def load_serialized_obj_from_url(url: str, model_dir=None) -> Any:


class pretrained:
r"""
Decorator which helps to download pretrained weights from the given url.
r"""Decorator which helps to download pretrained weights from the given url.

For example, we can decorate a resnet18 function as follows

@@ -306,10 +278,10 @@ class pretrained:

@hub.pretrained("https://url/to/pretrained_resnet18.pkl")
def resnet18(**kwargs):
return ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)

When decorated function is called with ``pretrained=True``, MegEngine will automatically
download and fill the returned model with pretrained weights.
Returns:
When the decorated function is called with ``pretrained=True``, MegEngine will automatically
download and fill the returned model with pretrained weights.
"""

def __init__(self, url):


+ 10
- 10
imperative/python/megengine/hub/tools.py View File

@@ -14,11 +14,11 @@ from typing import Iterator


def load_module(name: str, path: str) -> types.ModuleType:
"""
Loads module specified by name and path.
r"""Loads module specified by name and path.

:param name: module name.
:param path: module path.
Args:
name: module name.
path: module path.
"""
spec = importlib.util.spec_from_file_location(name, path)
module = importlib.util.module_from_spec(spec)
@@ -27,20 +27,20 @@ def load_module(name: str, path: str) -> types.ModuleType:


def check_module_exists(module: str) -> bool:
"""
Checks whether python module exists or not.
r"""Checks whether python module exists or not.

:param module: name of module.
Args:
module: name of module.
"""
return importlib.util.find_spec(module) is not None


@contextmanager
def cd(target: str) -> Iterator[None]:
"""
Changes current directory to target.
"""Changes current directory to target.

:param target: target directory.
Args:
target: target directory.
"""
prev = os.getcwd()
os.chdir(os.path.expanduser(target))
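A hedged sketch covering the three helpers above (``load_module``, ``check_module_exists``, ``cd``); the file path is illustrative only.

.. code-block:: python

    from megengine.hub.tools import cd, check_module_exists, load_module

    # test whether an optional dependency can be imported
    if check_module_exists("numpy"):
        print("numpy is importable")

    # import a module object straight from a file path (path is a placeholder)
    hubconf = load_module("hubconf", "/path/to/repo/hubconf.py")

    # temporarily change the working directory inside the with-block
    with cd("/tmp"):
        print("working in /tmp")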


+ 4
- 4
imperative/python/megengine/jit/graph_opt_config.py View File

@@ -9,12 +9,12 @@


class GraphOptimizationConfig:
r"""
Configuration for graph optimization: False for OFF, True for ON. The default value
r"""Configuration for graph optimization: False for OFF, True for ON. The default value
None means that opt_level will decide whether this optimization will be applied or not.

:param jit_fuse_dimshuffle: whether to fuse dimshuffle in JIT optimization
:param jit_fuse_reduce: whether to fuse reduce in JIT optimization
Args:
jit_fuse_dimshuffle: whether to fuse dimshuffle in JIT optimization
jit_fuse_reduce: whether to fuse reduce in JIT optimization
"""

def __init__(self):
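A hedged sketch of using the config: the import path follows this file's location, the attribute names come from the docstring above, and passing it through ``trace(graph_opt_config=...)`` follows the ``trace`` signature shown later in this commit.

.. code-block:: python

    from megengine.jit import trace
    from megengine.jit.graph_opt_config import GraphOptimizationConfig

    # switch both JIT fusion passes on explicitly; leaving them as None
    # lets opt_level decide
    config = GraphOptimizationConfig()
    config.jit_fuse_dimshuffle = True
    config.jit_fuse_reduce = True

    @trace(symbolic=True, graph_opt_config=config)
    def step(x):
        return x * 2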


+ 19
- 19
imperative/python/megengine/jit/sublinear_memory_config.py View File

@@ -10,26 +10,26 @@ from ..device import get_device_count


class SublinearMemoryConfig:
r"""
Configuration for sublinear memory optimization.

:param thresh_nr_try: number of samples both for searching in linear space
and around current thresh in sublinear memory optimization. Default: 10.
It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_THRESH_NR_TRY'.
:param genetic_nr_iter: number of iterations to find the best checkpoints in genetic algorithm.
Default: 0.
It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_GENETIC_NR_ITER'.
:param genetic_pool_size: number of samples for the crossover random selection
during genetic optimization. Default: 20.
It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_GENETIC_POOL_SIZE'.
:param lb_memory_mb: memory lower bound of bottleneck size in MB for sublinear memory optimization.
It can be used to perform manual tradeoff between memory and speed. Default: 0.
It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_LOWER_BOUND_MB'.
:param num_worker: number of thread workers to search the optimum checkpoints
in sublinear memory optimization. Default: half of cpu number in the system.
Note: the value must be greater or equal to one.
It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_WORKERS'.
r"""Configuration for sublinear memory optimization.

Args:
thresh_nr_try: number of samples both for searching in linear space
and around current thresh in sublinear memory optimization. Default: 10.
It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_THRESH_NR_TRY'.
genetic_nr_iter: number of iterations to find the best checkpoints in genetic algorithm.
Default: 0.
It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_GENETIC_NR_ITER'.
genetic_pool_size: number of samples for the crossover random selection
during genetic optimization. Default: 20.
It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_GENETIC_POOL_SIZE'.
lb_memory_mb: memory lower bound of bottleneck size in MB for sublinear memory optimization.
It can be used to perform manual tradeoff between memory and speed. Default: 0.
It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_LOWER_BOUND_MB'.
num_worker: number of thread workers to search the optimum checkpoints
in sublinear memory optimization. Default: half of cpu number in the system.
Note: the value must be greater than or equal to one.
It can also be set through the environmental variable 'MGB_SUBLINEAR_MEMORY_WORKERS'.
Note that the environmental variable MGB_COMP_GRAPH_OPT must be set to 'enable_sublinear_memory_opt=1'
in order for the above environmental variable to be effective.
"""


+ 83
- 86
imperative/python/megengine/jit/tracing.py View File

@@ -120,21 +120,21 @@ _io_op_types = {AssertEqual, CollectiveComm, RemoteSend, RemoteRecv}


class trace:
"""
Wraps a callable and provide:
"""Wraps a callable and provide:

* tracing via :meth:`.trace` and :meth:`.dump`
* accelerated evalutaion via :meth:`.__call__`

:param function: the function will be traced.
:param symbolic: whether to apply symbolic execution for tracing. Default: False
:param capture_as_const: capture global vars or closures as const value. Default: False
:param sublinear_memory_config: configuration for sublinear memory optimization.
If not None, it enables sublinear memory optimization with given setting.
:param profiling: whether to profile compiled trace. Default: False
:param opt_level: optimization level for compiling trace. Default: 2
:param graph_opt_config: configuration for graph optimization. Default: None
:param symbolic_shape: whether to use symbolic shape for tracing. Default: True
Args:
function: the function to be traced.
symbolic: whether to apply symbolic execution for tracing. Default: False
capture_as_const: capture global vars or closures as const value. Default: False
sublinear_memory_config: configuration for sublinear memory optimization.
If not None, it enables sublinear memory optimization with given setting.
profiling: whether to profile compiled trace. Default: False
opt_level: optimization level for compiling trace. Default: 2
graph_opt_config: configuration for graph optimization. Default: None
symbolic_shape: whether to use symbolic shape for tracing. Default: True
"""

def __new__(cls, *args, **kwargs):
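The class docstring above lists the parameters but no example, so here is a minimal hedged sketch of the decorator form:

.. code-block:: python

    import numpy as np
    import megengine
    from megengine.jit import trace

    @trace(symbolic=True)  # compile into a static graph on the first call
    def mul2(x):
        return x * 2

    x = megengine.tensor(np.arange(4, dtype="float32"))
    print(mul2(x).numpy())  # later calls reuse the compiled graph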
@@ -696,75 +696,74 @@ class trace:
enable_metadata: bool = True,
**kwargs
):
r"""
Serializes trace to file system.

:param file: output file, could be file object or filename.
:param arg_names: names of the input tensors in the traced function.
:param output_names: names of the output tensors in the traced function,
use the default name if not specified.
:param append: whether output is appended to ``file``.
Only works when ``file`` is str.
:param keep_var_name: level for keeping variable names:

* 0: none of the names are kept
* 1: (default)keep names of output vars
* 2: keep names of all (output and internal) vars
:param keep_opr_name: whether to keep operator names.
:param keep_param_name: whether to keep param names, so param values can be
easily manipulated after loading model
:param keep_opr_priority: whether to keep priority setting for operators
:param strip_info_file: a string for path or a file handler. if is not None,
then the dump information for code strip would be written to ``strip_info_file``
:param append_json: will be check when `strip_info_file` is not None. if set
true, the information for code strip will be append to strip_info_file.
if set false, will rewrite strip_info_file
:param optimize_for_inference: enbale optmizations,
will skip all optimize options if this is False. Default: True
:param user_info: any type object, which will be pickled to bytes.
:param enable_metadata: whether to save metadata into output file.

:Keyword Arguments:

* enable_io16xc32 --
whether to use float16 for I/O between oprs and use
float32 as internal computation precision. Note the output var would be
changed to float16.
* enable_ioc16 --
whether to use float16 for both I/O and computation
precision.

* enable_hwcd4 --
whether to use NHWCD4 data layout. This is faster on some
OpenCL backend.
* enable_nchw88 --
whether to use NCHW88 data layout, currently
used in X86 AVX backend.
* enable_nchw44 --
whether to use NCHW44 data layout, currently
used in arm backend.
* enable_nchw44_dot --
whether to use NCHW44_dot data layout, currently
used in armv8.2+dotprod backend.
* enable_nchw4 --
whether to use NCHW4 data layout, currently
used in nvidia backend(based on cudnn).
* enable_nchw32 --
whether to use NCHW32 data layout, currently
used in nvidia backend with tensorcore(based on cudnn).
* enable_chwn4 --
whether to use CHWN4 data layout, currently
used in nvidia backend with tensorcore.
* enable_nchw64 --
whether to use NCHW64 data layout, used for fast int4
support on Nvidia GPU.

* enable_fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearty
into one opr.
* enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z
input for inference on nvidia backend(this optimization pass will
result in mismatch of the precision of output of training and
inference)
r"""Serializes trace to file system.

Args:
file: output file, could be file object or filename.
arg_names: names of the input tensors in the traced function.
output_names: names of the output tensors in the traced function,
use the default name if not specified.
append: whether output is appended to ``file``.
Only works when ``file`` is str.
keep_var_name: level for keeping variable names:

* 0: none of the names are kept
* 1: (default)keep names of output vars
* 2: keep names of all (output and internal) vars

keep_opr_name: whether to keep operator names.
keep_param_name: whether to keep param names, so param values can be
easily manipulated after loading model
keep_opr_priority: whether to keep priority setting for operators
strip_info_file: a string for path or a file handler. If it is not None,
then the dump information for code strip would be written to ``strip_info_file``
append_json: will be checked when ``strip_info_file`` is not None. If set
true, the information for code strip will be appended to strip_info_file;
if set false, strip_info_file will be rewritten.
optimize_for_inference: enable optimizations,
will skip all optimize options if this is False. Default: True
user_info: any type object, which will be pickled to bytes.
enable_metadata: whether to save metadata into output file.

Keyword Arguments:

* enable_io16xc32 --
whether to use float16 for I/O between oprs and use
float32 as internal computation precision. Note the output var would be
changed to float16.
* enable_ioc16 --
whether to use float16 for both I/O and computation
precision.
* enable_hwcd4 --
whether to use NHWCD4 data layout. This is faster on some
OpenCL backend.
* enable_nchw88 --
whether to use NCHW88 data layout, currently
used in X86 AVX backend.
* enable_nchw44 --
whether to use NCHW44 data layout, currently
used in arm backend.
* enable_nchw44_dot --
whether to use NCHW44_dot data layout, currently
used in armv8.2+dotprod backend.
* enable_nchw4 --
whether to use NCHW4 data layout, currently
used in nvidia backend(based on cudnn).
* enable_nchw32 --
whether to use NCHW32 data layout, currently
used in nvidia backend with tensorcore(based on cudnn).
* enable_chwn4 --
whether to use CHWN4 data layout, currently
used in nvidia backend with tensorcore.
* enable_nchw64 --
whether to use NCHW64 data layout, used for fast int4
support on Nvidia GPU.
* enable_fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearity
into one opr.
* enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z
input for inference on nvidia backend(this optimization pass will
result in mismatch of the precision of output of training and
inference)
"""
if not self._capture_as_const:
raise ValueError(
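A hedged sketch of the dump workflow implied by the docstring and the ``capture_as_const`` check just above; the file name and ``arg_names`` value are illustrative.

.. code-block:: python

    import numpy as np
    import megengine
    from megengine.jit import trace

    @trace(symbolic=True, capture_as_const=True)  # dump() requires capture_as_const=True
    def pred(data):
        return data * 2

    # run once so the graph is actually traced before dumping
    pred(megengine.tensor(np.ones((1, 3), dtype="float32")))
    pred.dump("model.mge", arg_names=["data"])  # file name is a placeholder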
@@ -1033,10 +1032,10 @@ class trace:
)

def get_profile(self):
"""
Get profiling result for compiled trace.
r"""Get profiling result for compiled trace.

:return: a json compatible object.
Returns:
a json compatible object.
"""
if not self._profiler:
raise RuntimeError("trace is not set with profiling=True")
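A hedged sketch combining ``profiling=True`` with ``get_profile``; dumping the JSON-compatible result to a file is one common way to inspect it.

.. code-block:: python

    import json
    import numpy as np
    import megengine
    from megengine.jit import trace

    @trace(symbolic=True, profiling=True)  # get_profile() needs profiling=True
    def f(x):
        return x + 1

    f(megengine.tensor(np.zeros((4,), dtype="float32")))  # execute once to collect timings
    with open("profile.json", "w") as fout:
        json.dump(f.get_profile(), fout, indent=2)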
@@ -1050,9 +1049,7 @@ class trace:


class CompiledTensorProxy:
"""
Duck-typed RawTensor
"""
r"""Duck-typed RawTensor"""

def __init__(self, handle):
self.__handle = handle


+ 27
- 49
imperative/python/megengine/logger.py View File

@@ -17,14 +17,11 @@ _default_level = logging.getLevelName(_default_level_name.upper())


def set_log_file(fout, mode="a"):
r"""
Sets log output file.

:type fout: str or file-like
:param fout: file-like object that supports write and flush, or string for
the filename
:type mode: str
:param mode: specify the mode to open log file if *fout* is a string
r"""Sets log output file.

Args:
fout: file-like object that supports write and flush, or string for the filename
mode: specify the mode to open log file if *fout* is a string
"""
if isinstance(fout, str):
fout = open(fout, mode)
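A hedged one-liner; the log path is a placeholder.

.. code-block:: python

    from megengine.logger import set_log_file

    # "w" truncates the file, "a" (the default) appends to it; path is illustrative
    set_log_file("/tmp/megengine.log", mode="w")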
@@ -39,45 +36,31 @@ class MegEngineLogFormatter(logging.Formatter):
max_lines = 256

def _color_exc(self, msg):
r"""
Sets the color of message as the execution type.
"""
r"""Sets the color of message as the execution type."""
return "\x1b[34m{}\x1b[0m".format(msg)

def _color_dbg(self, msg):
r"""
Sets the color of message as the debugging type.
"""
r"""Sets the color of message as the debugging type."""
return "\x1b[36m{}\x1b[0m".format(msg)

def _color_warn(self, msg):
r"""
Sets the color of message as the warning type.
"""
r"""Sets the color of message as the warning type."""
return "\x1b[1;31m{}\x1b[0m".format(msg)

def _color_err(self, msg):
r"""
Sets the color of message as the error type.
"""
r"""Sets the color of message as the error type."""
return "\x1b[1;4;31m{}\x1b[0m".format(msg)

def _color_omitted(self, msg):
r"""
Sets the color of message as the omitted type.
"""
r"""Sets the color of message as the omitted type."""
return "\x1b[35m{}\x1b[0m".format(msg)

def _color_normal(self, msg):
r"""
Sets the color of message as the normal type.
"""
r"""Sets the color of message as the normal type."""
return msg

def _color_date(self, msg):
r"""
Sets the color of message the same as date.
"""
r"""Sets the color of message the same as date."""
return "\x1b[32m{}\x1b[0m".format(msg)

def format(self, record):
@@ -150,9 +133,7 @@ class MegEngineLogFormatter(logging.Formatter):


def get_logger(name=None, formatter=MegEngineLogFormatter):
r"""
Gets megengine logger with given name.
"""
r"""Gets megengine logger with given name."""

logger = logging.getLogger(name)
if getattr(logger, "_init_done__", None):
@@ -170,12 +151,11 @@ def get_logger(name=None, formatter=MegEngineLogFormatter):


def set_log_level(level, update_existing=True):
"""
Sets default logging level.
r"""Sets default logging level.

:type level: int e.g. logging.INFO
:param level: loggin level given by python :mod:`logging` module
:param update_existing: whether to update existing loggers
Args:
level: logging level given by python :mod:`logging` module
update_existing: whether to update existing loggers
"""
global _default_level # pylint: disable=global-statement
_default_level = level
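A hedged sketch pairing ``set_log_level`` with ``get_logger`` from the same module:

.. code-block:: python

    import logging
    from megengine.logger import get_logger, set_log_level

    set_log_level(logging.WARNING)  # update_existing=True also updates existing loggers
    logger = get_logger(__name__)
    logger.warning("only WARNING and above will be emitted now")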
@@ -202,12 +182,13 @@ try:
_imperative_rt_logger.set_log_handler(_megbrain_logger)

def set_mgb_log_level(level):
r"""
Sets megbrain log level
r"""Sets megbrain log level

Args:
level: new log level

:type level: int e.g. logging.INFO
:param level: new log level
:return: original log level
Returns:
original log level
"""
_megbrain_logger.setLevel(level)
if level == logging.getLevelName("ERROR"):
@@ -235,11 +216,10 @@ except ImportError as exc:

@contextlib.contextmanager
def replace_mgb_log_level(level):
r"""
Replaces megbrain log level in a block and restore after exiting.
r"""Replaces megbrain log level in a block and restore after exiting.

:type level: int e.g. logging.INFO
:param level: new log level
Args:
level: new log level
"""
old = set_mgb_log_level(level)
try:
@@ -249,8 +229,6 @@ def replace_mgb_log_level(level):


def enable_debug_log():
r"""
Sets logging level to debug for all components.
"""
r"""Sets logging level to debug for all components."""
set_log_level(logging.DEBUG)
set_mgb_log_level(logging.DEBUG)

+ 98
- 108
imperative/python/megengine/module/activation.py View File

@@ -14,8 +14,7 @@ from .module import Module


class Softmax(Module):
r"""
Applies a softmax function. Softmax is defined as:
r"""Applies a softmax function. Softmax is defined as:

.. math::
\text{Softmax}(x_{i}) = \frac{exp(x_i)}{\sum_j exp(x_j)}
@@ -23,29 +22,29 @@ class Softmax(Module):
It is applied to all elements along axis, and rescales elements so that
they stay in the range `[0, 1]` and sum to 1.

:param axis: Along which axis softmax will be applied. By default,
softmax will apply along the highest ranked axis.
Args:
axis: Along which axis softmax will be applied. By default,
softmax will be applied along the highest ranked axis.

Examples:

.. testcode::
.. testcode::

import numpy as np
import megengine as mge
import megengine.module as M
import numpy as np
import megengine as mge
import megengine.module as M

data = mge.tensor(np.array([-2,-1,0,1,2]).astype(np.float32))
softmax = M.Softmax()
output = softmax(data)
with np.printoptions(precision=6):
print(output.numpy())

Outputs:
data = mge.tensor(np.array([-2,-1,0,1,2]).astype(np.float32))
softmax = M.Softmax()
output = softmax(data)
with np.printoptions(precision=6):
print(output.numpy())

.. testoutput::
Outputs:

[0.011656 0.031685 0.086129 0.234122 0.636409]
.. testoutput::

[0.011656 0.031685 0.086129 0.234122 0.636409]
"""

def __init__(self, axis=None, **kwargs):
@@ -60,32 +59,31 @@ class Softmax(Module):


class Sigmoid(Module):
r"""
Applies the element-wise function:
r"""Applies the element-wise function:

.. math::

\text{Sigmoid}(x) = \frac{1}{1 + \exp(-x)}

Examples:

.. testcode::

import numpy as np
import megengine as mge
import megengine.module as M
.. testcode::

data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32))
sigmoid = M.Sigmoid()
output = sigmoid(data)
with np.printoptions(precision=6):
print(output.numpy())
import numpy as np
import megengine as mge
import megengine.module as M

Outputs:
data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32))
sigmoid = M.Sigmoid()
output = sigmoid(data)
with np.printoptions(precision=6):
print(output.numpy())

.. testoutput::
Outputs:

[0.119203 0.268941 0.5 0.731059 0.880797]
.. testoutput::

[0.119203 0.268941 0.5 0.731059 0.880797]
"""

def forward(self, inputs):
@@ -93,32 +91,31 @@ class Sigmoid(Module):


class SiLU(Module):
r"""
Applies the element-wise function:
r"""Applies the element-wise function:

.. math::

\text{SiLU}(x) = \frac{x}{1 + \exp(-x)}

Examples:

.. testcode::
.. testcode::

import numpy as np
import megengine as mge
import megengine.module as M

data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32))
silu = M.SiLU()
output = silu(data)
with np.printoptions(precision=6):
print(output.numpy())
import numpy as np
import megengine as mge
import megengine.module as M

Outputs:
data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32))
silu = M.SiLU()
output = silu(data)
with np.printoptions(precision=6):
print(output.numpy())

.. testoutput::
Outputs:

[-0.238406 -0.268941 0. 0.731059 1.761594]
.. testoutput::

[-0.238406 -0.268941 0. 0.731059 1.761594]
"""

def forward(self, inputs):
@@ -126,8 +123,7 @@ class SiLU(Module):


class GELU(Module):
r"""
Applies the element-wise function:
r"""Applies the element-wise function:

.. math::
\text{GELU}(x) = x\Phi(x)
@@ -136,24 +132,23 @@ class GELU(Module):

Examples:

.. testcode::
.. testcode::

import numpy as np
import megengine as mge
import megengine.module as M
import numpy as np
import megengine as mge
import megengine.module as M

data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32))
gelu = M.GELU()
output = gelu(data)
with np.printoptions(precision=4):
print(output.numpy())

Outputs:
data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32))
gelu = M.GELU()
output = gelu(data)
with np.printoptions(precision=4):
print(output.numpy())

.. testoutput::
Outputs:

[-0.0455 -0.1587 0. 0.8413 1.9545]
.. testoutput::

[-0.0455 -0.1587 0. 0.8413 1.9545]
"""

def forward(self, inputs):
@@ -161,31 +156,29 @@ class GELU(Module):


class ReLU(Module):
r"""
Applies the element-wise function:
r"""Applies the element-wise function:

.. math::
\text{ReLU}(x) = \max(x, 0)

Examples:

.. testcode::

import numpy as np
import megengine as mge
import megengine.module as M
data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32))
relu = M.ReLU()
output = relu(data)
with np.printoptions(precision=6):
print(output.numpy())
.. testcode::

Outputs:
import numpy as np
import megengine as mge
import megengine.module as M
data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32))
relu = M.ReLU()
output = relu(data)
with np.printoptions(precision=6):
print(output.numpy())

.. testoutput::
Outputs:

[0. 0. 0. 1. 2.]
.. testoutput::

[0. 0. 0. 1. 2.]
"""

def forward(self, x):
@@ -193,8 +186,7 @@ class ReLU(Module):


class PReLU(Module):
r"""
Applies the element-wise function:
r"""Applies the element-wise function:

.. math::
\text{PReLU}(x) = \max(0,x) + a * \min(0,x)
@@ -211,28 +203,28 @@ class PReLU(Module):
Here :math:`a` is a learnable parameter. When called without arguments, `PReLU()` uses
a single parameter :math:`a` across all input channels. If called with `PReLU(num_of_channels)`, each input channel will have its own :math:`a`.

:param num_parameters: number of :math:`a` to learn, there is only two
values are legitimate: 1, or the number of channels at input. Default: 1
:param init: the initial value of :math:`a`. Default: 0.25
Args:
num_parameters: number of :math:`a` to learn, only two
values are legitimate: 1, or the number of channels at input. Default: 1
init: the initial value of :math:`a`. Default: 0.25

Examples:

.. testcode::
.. testcode::

import numpy as np
import megengine as mge
import megengine.module as M
data = mge.tensor(np.array([-1.2, -3.7, 2.7]).astype(np.float32))
prelu = M.PReLU()
output = prelu(data)
print(output.numpy())

Outputs:
import numpy as np
import megengine as mge
import megengine.module as M
data = mge.tensor(np.array([-1.2, -3.7, 2.7]).astype(np.float32))
prelu = M.PReLU()
output = prelu(data)
print(output.numpy())

.. testoutput::
Outputs:

[-0.3 -0.925 2.7 ]
.. testoutput::

[-0.3 -0.925 2.7 ]
"""

def __init__(self, num_parameters: int = 1, init: float = 0.25, **kwargs):
@@ -257,8 +249,7 @@ class PReLU(Module):


class LeakyReLU(Module):
r"""
Applies the element-wise function:
r"""Applies the element-wise function:

.. math::
\text{LeakyReLU}(x) = \max(0,x) + negative\_slope \times \min(0,x)
@@ -274,23 +265,22 @@ class LeakyReLU(Module):

Examples:

.. testcode::
.. testcode::

import numpy as np
import megengine as mge
import megengine.module as M
data = mge.tensor(np.array([-8, -12, 6, 10]).astype(np.float32))
import numpy as np
import megengine as mge
import megengine.module as M
data = mge.tensor(np.array([-8, -12, 6, 10]).astype(np.float32))

leakyrelu = M.LeakyReLU(0.01)
output = leakyrelu(data)
print(output.numpy())

Outputs:
leakyrelu = M.LeakyReLU(0.01)
output = leakyrelu(data)
print(output.numpy())

.. testoutput::
Outputs:

[-0.08 -0.12 6. 10. ]
.. testoutput::

[-0.08 -0.12 6. 10. ]
"""

def __init__(self, negative_slope: float = 0.01, **kwargs):


+ 28
- 28
imperative/python/megengine/module/adaptive_pooling.py View File

@@ -25,8 +25,7 @@ class _AdaptivePoolNd(Module):


class AdaptiveMaxPool2d(_AdaptivePoolNd):
r"""
Applies a 2D max adaptive pooling over an input.
r"""Applies a 2D max adaptive pooling over an input.

For instance, given an input of the size :math:`(N, C, H, W)` and
an output shape :math:`(OH, OW)`, this layer generates the output of
@@ -40,29 +39,30 @@ class AdaptiveMaxPool2d(_AdaptivePoolNd):
\end{aligned}

``kernel_size`` and ``stride`` can be inferred from input shape and out shape:

* padding: (0, 0)
* stride: (floor(IH / OH), floor(IW / OW))
* kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)

Examples:

.. testcode::
.. testcode::

import numpy as np
import megengine as mge
import megengine.module as M
import numpy as np
import megengine as mge
import megengine.module as M

m = M.AdaptiveMaxPool2d((2, 2))
inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4))
oup = m(inp)
print(oup.numpy())
m = M.AdaptiveMaxPool2d((2, 2))
inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4))
oup = m(inp)
print(oup.numpy())

Outputs:
Outputs:

.. testoutput::
.. testoutput::

[[[[ 5. 7.]
[13. 15.]]]]
[[[[ 5. 7.]
[13. 15.]]]]

"""

@@ -71,8 +71,7 @@ class AdaptiveMaxPool2d(_AdaptivePoolNd):


class AdaptiveAvgPool2d(_AdaptivePoolNd):
r"""
Applies a 2D average pooling over an input.
r"""Applies a 2D average pooling over an input.

For instance, given an input of the size :math:`(N, C, H, W)` and
an output shape :math:`(OH, OW)`, this layer generates the output of
@@ -84,29 +83,30 @@ class AdaptiveAvgPool2d(_AdaptivePoolNd):
input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)

``kernel_size`` and ``stride`` can be inferred from input shape and out shape:

* padding: (0, 0)
* stride: (floor(IH / OH), floor(IW / OW))
* kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)

Examples:

.. testcode::
.. testcode::

import numpy as np
import megengine as mge
import megengine.module as M
import numpy as np
import megengine as mge
import megengine.module as M

m = M.AdaptiveAvgPool2d((2, 2))
inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4))
oup = m(inp)
print(oup.numpy())
m = M.AdaptiveAvgPool2d((2, 2))
inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4))
oup = m(inp)
print(oup.numpy())

Outputs:
Outputs:

.. testoutput::
.. testoutput::

[[[[ 2.5 4.5]
[10.5 12.5]]]]
[[[[ 2.5 4.5]
[10.5 12.5]]]]

"""



+ 1
- 3
imperative/python/megengine/module/batch_matmul_activation.py View File

@@ -14,9 +14,7 @@ from .module import Module


class BatchMatMulActivation(Module):
r"""
Batched :func:`~.matmul` with activation(only :func:`~.relu` supported), no transpose anywhere.
"""
r"""Batched :func:`~.matmul` with activation(only :func:`~.relu` supported), no transpose anywhere."""

def __init__(
self,


+ 61
- 77
imperative/python/megengine/module/batchnorm.py View File

@@ -141,37 +141,29 @@ class _BatchNorm(Module):


class SyncBatchNorm(_BatchNorm):
r"""
Applies Synchronized Batch Normalization for distributed training.

:type num_features: int
:param num_features: usually :math:`C` from an input of shape
:math:`(N, C, H, W)` or the highest ranked dimension of an input
less than 4D.
:type eps: float
:param eps: a value added to the denominator for numerical stability.
Default: 1e-5
:type momentum: float
:param momentum: the value used for the ``running_mean`` and ``running_var`` computation.
Default: 0.9
:type affine: bool
:param affine: a boolean value that when set to True, this module has
learnable affine parameters. Default: True
:type track_running_stats: bool
:param track_running_stats: when set to True, this module tracks the
running mean and variance. When set to False, this module does not
track such statistics and always uses batch statistics in both training
and eval modes. Default: True
:type freeze: bool
:param freeze: when set to True, this module does not update the
running mean and variance, and uses the running mean and variance instead of
the batch mean and batch variance to normalize the input. The parameter takes effect
only when the module is initilized with track_running_stats as True.
Default: False
:type group: :class:`~megengine.distributed.Group`
:param group: communication group, caculate mean and variance between this group.
Default: :obj:`~megengine.distributed.WORLD`
:return: output tensor.
r"""Applies Synchronized Batch Normalization for distributed training.

Args:
num_features: usually :math:`C` from an input of shape
:math:`(N, C, H, W)` or the highest ranked dimension of an input
less than 4D.
eps: a value added to the denominator for numerical stability.
Default: 1e-5
momentum: the value used for the ``running_mean`` and ``running_var`` computation.
Default: 0.9
affine: a boolean value that when set to True, this module has
learnable affine parameters. Default: True
track_running_stats: when set to True, this module tracks the
running mean and variance. When set to False, this module does not
track such statistics and always uses batch statistics in both training
and eval modes. Default: True
freeze: when set to True, this module does not update the
running mean and variance, and uses the running mean and variance instead of
the batch mean and batch variance to normalize the input. The parameter takes effect
only when the module is initialized with track_running_stats as True.
Default: False
group: communication group, calculate mean and variance within this group.
Default: :obj:`~.distributed.WORLD`
"""

def __init__(
@@ -249,8 +241,7 @@ class SyncBatchNorm(_BatchNorm):


class BatchNorm1d(_BatchNorm):
r"""
Applies Batch Normalization over a 2D/3D tensor.
r"""Applies Batch Normalization over a 2D/3D tensor.

Refer to :class:`~.BatchNorm2d` for more information.
"""
@@ -263,8 +254,7 @@ class BatchNorm1d(_BatchNorm):


class BatchNorm2d(_BatchNorm):
r"""
Applies Batch Normalization over a 4D tensor.
r"""Applies Batch Normalization over a 4D tensor.

.. math::

@@ -287,56 +277,50 @@ class BatchNorm2d(_BatchNorm):
statistics on `(N, H, W)` slices, it's common terminology to call this
Spatial Batch Normalization.

:type num_features: int
:param num_features: usually :math:`C` from an input of shape
:math:`(N, C, H, W)` or the highest ranked dimension of an input
less than 4D.
:type eps: float
:param eps: a value added to the denominator for numerical stability.
Default: 1e-5
:type momentum: float
:param momentum: the value used for the ``running_mean`` and ``running_var`` computation.
Default: 0.9
:type affine: bool
:param affine: a boolean value that when set to True, this module has
learnable affine parameters. Default: True
:type track_running_stats: bool
:param track_running_stats: when set to True, this module tracks the
running mean and variance. When set to False, this module does not
track such statistics and always uses batch statistics in both training
and eval modes. Default: True

:type freeze: bool
:param freeze: when set to True, this module does not update the
running mean and variance, and uses the running mean and variance instead of
the batch mean and batch variance to normalize the input. The parameter takes effect
only when the module is initilized with track_running_stats as True.
Default: False
Args:
num_features: usually :math:`C` from an input of shape
:math:`(N, C, H, W)` or the highest ranked dimension of an input
less than 4D.
eps: a value added to the denominator for numerical stability.
Default: 1e-5
momentum: the value used for the ``running_mean`` and ``running_var`` computation.
Default: 0.9
affine: a boolean value that when set to True, this module has
learnable affine parameters. Default: True
track_running_stats: when set to True, this module tracks the
running mean and variance. When set to False, this module does not
track such statistics and always uses batch statistics in both training
and eval modes. Default: True
freeze: when set to True, this module does not update the
running mean and variance, and uses the running mean and variance instead of
the batch mean and batch variance to normalize the input. The parameter takes effect
only when the module is initialized with track_running_stats as True.
Default: False

Examples:

.. testcode::
.. testcode::

import numpy as np
import megengine as mge
import megengine.module as M
import numpy as np
import megengine as mge
import megengine.module as M

# With Learnable Parameters
m = M.BatchNorm2d(4)
inp = mge.tensor(np.random.rand(1, 4, 3, 3).astype("float32"))
oup = m(inp)
print(m.weight.numpy().flatten(), m.bias.numpy().flatten())
# Without L`e`arnable Parameters
m = M.BatchNorm2d(4, affine=False)
oup = m(inp)
print(m.weight, m.bias)
# With Learnable Parameters
m = M.BatchNorm2d(4)
inp = mge.tensor(np.random.rand(1, 4, 3, 3).astype("float32"))
oup = m(inp)
print(m.weight.numpy().flatten(), m.bias.numpy().flatten())
# Without Learnable Parameters
m = M.BatchNorm2d(4, affine=False)
oup = m(inp)
print(m.weight, m.bias)

Outputs:
Outputs:

.. testoutput::
.. testoutput::

[1. 1. 1. 1.] [0. 0. 0. 0.]
None None
[1. 1. 1. 1.] [0. 0. 0. 0.]
None None
"""

def _check_input_ndim(self, inp):


+ 1
- 2
imperative/python/megengine/module/concat.py View File

@@ -13,8 +13,7 @@ from .module import Module


class Concat(Module):
r"""
A :class:`~.Module` to do functional :func:`~.concat`. Could be replaced with :class:`~.QATModule`
r"""A :class:`~.Module` to do functional :func:`~.concat`. Could be replaced with :class:`~.QATModule`
version :class:`~.qat.Concat` using :func:`~.quantize.quantize_qat`.
"""



+ 200
- 226
imperative/python/megengine/module/conv.py View File

@@ -97,8 +97,7 @@ class _ConvNd(Module):

class Conv1d(_ConvNd):

r"""
Applies a 1D convolution over an input tensor.
r"""Applies a 1D convolution over an input tensor.

For instance, given an input of the size :math:`(N, C_{\text{in}}, H)`,
this layer generates an output of the size
@@ -121,52 +120,49 @@ class Conv1d(_ConvNd):
a depthwise convolution with a depthwise multiplier `K` can be constructed
by arguments :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`.

:param in_channels: number of input channels.
:param out_channels: number of output channels.
:param kernel_size: size of weight on spatial dimensions.
:param stride: stride of the 1D convolution operation.
:param padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 1D convolution operation. Default: 1
:param groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and there would be an extra dimension at the beginning of the weight's
shape. Default: 1
:param bias: whether to add a bias onto the result of convolution. Default:
True
:param conv_mode: Supports `cross_correlation`. Default:
`cross_correlation`
:param compute_mode: When set to "default", no special requirements will be
placed on the precision of intermediate results. When set to "float32",
"float32" would be used for accumulator and intermediate result, but only
effective when input and output are of float16 dtype.

.. note::

* ``weight`` usually has shape ``(out_channels, in_channels, kernel_size)`` ,
if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, kernel_size)``
* ``bias`` usually has shape ``(1, out_channels, 1)``
Args:
in_channels: number of input channels.
out_channels: number of output channels.
kernel_size: size of weight on spatial dimensions.
stride: stride of the 1D convolution operation.
padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
dilation: dilation of the 1D convolution operation. Default: 1
groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and there would be an extra dimension at the beginning of the weight's
shape. Default: 1
bias: whether to add a bias onto the result of convolution. Default: True
conv_mode: Supports `cross_correlation`. Default: `cross_correlation`
compute_mode: When set to "default", no special requirements will be
placed on the precision of intermediate results. When set to "float32",
"float32" would be used for accumulator and intermediate result, but only
effective when input and output are of float16 dtype.

Note:
* ``weight`` usually has shape ``(out_channels, in_channels, kernel_size)`` ,
if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, kernel_size)``
* ``bias`` usually has shape ``(1, out_channels, 1)``

Examples:

.. testcode::
.. testcode::

import numpy as np
import megengine as mge
import megengine.module as M
import numpy as np
import megengine as mge
import megengine.module as M

m = M.Conv1d(in_channels=3, out_channels=1, kernel_size=3)
inp = mge.tensor(np.arange(0, 24).astype("float32").reshape(2, 3, 4))
oup = m(inp)
print(oup.numpy().shape)
m = M.Conv1d(in_channels=3, out_channels=1, kernel_size=3)
inp = mge.tensor(np.arange(0, 24).astype("float32").reshape(2, 3, 4))
oup = m(inp)
print(oup.numpy().shape)

Outputs:
Outputs:

.. testoutput::

(2, 1, 2)
.. testoutput::

(2, 1, 2)
"""

def __init__(
@@ -245,8 +241,7 @@ class Conv1d(_ConvNd):


class Conv2d(_ConvNd):
r"""
Applies a 2D convolution over an input tensor.
r"""Applies a 2D convolution over an input tensor.

For instance, given an input of the size :math:`(N, C_{\text{in}}, H, W)`,
this layer generates an output of the size
@@ -284,54 +279,51 @@ class Conv2d(_ConvNd):
a depthwise convolution with a depthwise multiplier `K`, can be constructed
by arguments :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`.

:param in_channels: number of input channels.
:param out_channels: number of output channels.
:param kernel_size: size of weight on spatial dimensions. If kernel_size is
an :class:`int`, the actual kernel size would be
``(kernel_size, kernel_size)``.
:param stride: stride of the 2D convolution operation. Default: 1
:param padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 2D convolution operation. Default: 1
:param groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and there would be an extra dimension at the beginning of the weight's
shape. Default: 1
:param bias: whether to add a bias onto the result of convolution. Default:
True
:param conv_mode: Supports `cross_correlation`. Default:
`cross_correlation`
:param compute_mode: When set to "default", no special requirements will be
placed on the precision of intermediate results. When set to "float32",
"float32" would be used for accumulator and intermediate result, but only
effective when input and output are of float16 dtype.

.. note::

* ``weight`` usually has shape ``(out_channels, in_channels, height, width)`` ,
if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, height, width)``
* ``bias`` usually has shape ``(1, out_channels, *1)``
Args:
in_channels: number of input channels.
out_channels: number of output channels.
kernel_size: size of weight on spatial dimensions. If kernel_size is
an :class:`int`, the actual kernel size would be
``(kernel_size, kernel_size)``.
stride: stride of the 2D convolution operation. Default: 1
padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
dilation: dilation of the 2D convolution operation. Default: 1
groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and there would be an extra dimension at the beginning of the weight's
shape. Default: 1
bias: whether to add a bias onto the result of convolution. Default: True
conv_mode: Supports `cross_correlation`. Default: `cross_correlation`
compute_mode: When set to "default", no special requirements will be
placed on the precision of intermediate results. When set to "float32",
"float32" would be used for accumulator and intermediate result, but only
effective when input and output are of float16 dtype.

Note:
* ``weight`` usually has shape ``(out_channels, in_channels, height, width)`` ,
if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, height, width)``
* ``bias`` usually has shape ``(1, out_channels, *1)``

Examples:

.. testcode::

import numpy as np
import megengine as mge
import megengine.module as M
.. testcode::

m = M.Conv2d(in_channels=3, out_channels=1, kernel_size=3)
inp = mge.tensor(np.arange(0, 96).astype("float32").reshape(2, 3, 4, 4))
oup = m(inp)
print(oup.numpy().shape)
import numpy as np
import megengine as mge
import megengine.module as M

Outputs:
m = M.Conv2d(in_channels=3, out_channels=1, kernel_size=3)
inp = mge.tensor(np.arange(0, 96).astype("float32").reshape(2, 3, 4, 4))
oup = m(inp)
print(oup.numpy().shape)

.. testoutput::
Outputs:

(2, 1, 2, 2)
.. testoutput::

(2, 1, 2, 2)
"""

def __init__(
@@ -411,8 +403,7 @@ class Conv2d(_ConvNd):

class Conv3d(_ConvNd):

r"""
Applies a 3D convolution over an input tensor.
r"""Applies a 3D convolution over an input tensor.

For instance, given an input of the size :math:`(N, C_{\text{in}}, T, H, W)`,
this layer generates an output of the size
@@ -434,50 +425,47 @@ class Conv3d(_ConvNd):
a depthwise convolution with a depthwise multiplier `K`, can be constructed
by arguments :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`.

:param in_channels: number of input channels.
:param out_channels: number of output channels.
:param kernel_size: size of weight on spatial dimensions. If kernel_size is
an :class:`int`, the actual kernel size would be
`(kernel_size, kernel_size, kernel_size)`.
:param stride: stride of the 3D convolution operation. Default: 1
:param padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 3D convolution operation. Default: 1
:param groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and there would be an extra dimension at the beginning of the weight's
shape. Default: 1
:param bias: whether to add a bias onto the result of convolution. Default:
True
:param conv_mode: Supports `cross_correlation`. Default:
`cross_correlation`

.. note::

* ``weight`` usually has shape ``(out_channels, in_channels, depth, height, width)`` ,
if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, depth, height, width)``
* ``bias`` usually has shape ``(1, out_channels, *1)``
Args:
in_channels: number of input channels.
out_channels: number of output channels.
kernel_size: size of weight on spatial dimensions. If kernel_size is
an :class:`int`, the actual kernel size would be
`(kernel_size, kernel_size, kernel_size)`.
stride: stride of the 3D convolution operation. Default: 1
padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
dilation: dilation of the 3D convolution operation. Default: 1
groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and there would be an extra dimension at the beginning of the weight's
shape. Default: 1
bias: whether to add a bias onto the result of convolution. Default: True
conv_mode: Supports `cross_correlation`. Default: `cross_correlation`

Note:
* ``weight`` usually has shape ``(out_channels, in_channels, depth, height, width)`` ,
if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, depth, height, width)``
* ``bias`` usually has shape ``(1, out_channels, *1)``

Examples:

.. testcode::

import numpy as np
import megengine as mge
import megengine.module as M
.. testcode::

m = M.Conv3d(in_channels=3, out_channels=1, kernel_size=3)
inp = mge.tensor(np.arange(0, 384).astype("float32").reshape(2, 3, 4, 4, 4))
oup = m(inp)
print(oup.numpy().shape)
import numpy as np
import megengine as mge
import megengine.module as M

Outputs:
m = M.Conv3d(in_channels=3, out_channels=1, kernel_size=3)
inp = mge.tensor(np.arange(0, 384).astype("float32").reshape(2, 3, 4, 4, 4))
oup = m(inp)
print(oup.numpy().shape)

.. testoutput::
Outputs:

(2, 1, 2, 2, 2)
.. testoutput::

(2, 1, 2, 2, 2)
"""

def __init__(
@@ -551,8 +539,7 @@ class Conv3d(_ConvNd):


class ConvTranspose2d(_ConvNd):
r"""
Applies a 2D transposed convolution over an input tensor.
r"""Applies a 2D transposed convolution over an input tensor.

This module is also known as a deconvolution or a fractionally-strided convolution.
:class:`ConvTranspose2d` can be seen as the gradient of :class:`Conv2d` operation
@@ -562,35 +549,32 @@ class ConvTranspose2d(_ConvNd):
the opposite way, transforming a smaller input to a larger output while preserving the
connectivity pattern.

:param in_channels: number of input channels.
:param out_channels: number of output channels.
:param kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is
an :class:`int`, the actual kernel size would be
``(kernel_size, kernel_size)``.
:param stride: stride of the 2D convolution operation. Default: 1
:param padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 2D convolution operation. Default: 1
:param groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and there would be an extra dimension at the beginning of the weight's
shape. Default: 1
:param bias: whether to add a bias onto the result of convolution. Default:
True
:param conv_mode: Supports `cross_correlation`. Default:
`cross_correlation`
:param compute_mode: When set to "default", no special requirements will be
placed on the precision of intermediate results. When set to "float32",
"float32" would be used for accumulator and intermediate result, but only
effective when input and output are of float16 dtype.

.. note::

* ``weight`` usually has shape ``(in_channels, out_channels, height, width)`` ,
if groups is not 1, shape will be ``(groups, in_channels // groups, out_channels // groups, height, width)``
* ``bias`` usually has shape ``(1, out_channels, *1)``

Args:
in_channels: number of input channels.
out_channels: number of output channels.
kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is
an :class:`int`, the actual kernel size would be
``(kernel_size, kernel_size)``.
stride: stride of the 2D convolution operation. Default: 1
padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
dilation: dilation of the 2D convolution operation. Default: 1
groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and there would be an extra dimension at the beginning of the weight's
shape. Default: 1
bias: whether to add a bias onto the result of convolution. Default: True
conv_mode: Supports `cross_correlation`. Default: `cross_correlation`
compute_mode: When set to "default", no special requirements will be
placed on the precision of intermediate results. When set to "float32",
"float32" would be used for accumulator and intermediate result, but only
effective when input and output are of float16 dtype.

Note:
* ``weight`` usually has shape ``(in_channels, out_channels, height, width)`` ,
if groups is not 1, shape will be ``(groups, in_channels // groups, out_channels // groups, height, width)``
* ``bias`` usually has shape ``(1, out_channels, *1)``
"""

def __init__(
@@ -669,30 +653,28 @@ class ConvTranspose2d(_ConvNd):


class LocalConv2d(Conv2d):
r"""
Applies a spatial convolution with untied kernels over a grouped, channeled 4D input tensor.
r"""Applies a spatial convolution with untied kernels over a grouped, channeled 4D input tensor.
It is also known as the locally connected layer.

:param in_channels: number of input channels.
:param out_channels: number of output channels.
:param input_height: the height of the input images.
:param input_width: the width of the input images.
:param kernel_size: size of weight on spatial dimensions. If kernel_size is
an :class:`int`, the actual kernel size would be
``(kernel_size, kernel_size)``.
:param stride: stride of the 2D convolution operation. Default: 1
:param padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
:param groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``. Default: 1

.. note::

* ``weight`` usually has shape ``(out_height, out_width, in_channels, height, width, out_channels)`` ,
if groups is not 1, shape will be ``(groups, out_height, out_width, in_channels // groups, height, width, out_channels // groups)``
* ``bias`` usually has shape ``(1, out_channels, *1)``

Args:
in_channels: number of input channels.
out_channels: number of output channels.
input_height: the height of the input images.
input_width: the width of the input images.
kernel_size: size of weight on spatial dimensions. If kernel_size is
an :class:`int`, the actual kernel size would be
``(kernel_size, kernel_size)``.
stride: stride of the 2D convolution operation. Default: 1
padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``. Default: 1

Note:
* ``weight`` usually has shape ``(out_height, out_width, in_channels, height, width, out_channels)`` ,
if groups is not 1, shape will be ``(groups, out_height, out_width, in_channels // groups, height, width, out_channels // groups)``
* ``bias`` usually has shape ``(1, out_channels, *1)``
"""

def __init__(
@@ -755,8 +737,7 @@ class LocalConv2d(Conv2d):


class ConvRelu2d(Conv2d):
r"""
A fused :class:`~.Module` including :class:`~.module.Conv2d` and :func:`~.relu`.
r"""A fused :class:`~.Module` including :class:`~.module.Conv2d` and :func:`~.relu`.
Could be replaced with :class:`~.QATModule` version :class:`~.qat.ConvRelu2d` using :func:`~.quantize.quantize_qat`.
"""

@@ -765,38 +746,34 @@ class ConvRelu2d(Conv2d):


class DeformableConv2d(_ConvNd):
"""
Deformable Convolution.

:param in_channels: number of input channels.
:param out_channels: number of output channels.
:param kernel_size: size of weight on spatial dimensions. If kernel_size is
an :class:`int`, the actual kernel size would be
``(kernel_size, kernel_size)``.
:param stride: stride of the 2D convolution operation. Default: 1
:param padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 2D convolution operation. Default: 1
:param groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and there would be an extra dimension at the beginning of the weight's
shape. Default: 1
:param bias: whether to add a bias onto the result of convolution. Default:
True
:param conv_mode: Supports `cross_correlation`. Default:
`cross_correlation`
:param compute_mode: When set to "default", no special requirements will be
placed on the precision of intermediate results. When set to "float32",
"float32" would be used for accumulator and intermediate result, but only
effective when input and output are of float16 dtype.

.. note::

* ``weight`` usually has shape ``(out_channels, in_channels, height, width)`` ,
if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, height, width)``
* ``bias`` usually has shape ``(1, out_channels, *1)``

r"""Deformable Convolution.

Args:
in_channels: number of input channels.
out_channels: number of output channels.
kernel_size: size of weight on spatial dimensions. If kernel_size is
an :class:`int`, the actual kernel size would be
``(kernel_size, kernel_size)``.
stride: stride of the 2D convolution operation. Default: 1
padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
dilation: dilation of the 2D convolution operation. Default: 1
groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and there would be an extra dimension at the beginning of the weight's
shape. Default: 1
bias: whether to add a bias onto the result of convolution. Default: True
conv_mode: Supports `cross_correlation`. Default: `cross_correlation`
compute_mode: When set to "default", no special requirements will be
placed on the precision of intermediate results. When set to "float32",
"float32" would be used for accumulator and intermediate result, but only
effective when input and output are of float16 dtype.

Note:
* ``weight`` usually has shape ``(out_channels, in_channels, height, width)`` ,
if groups is not 1, shape will be ``(groups, out_channels // groups, in_channels // groups, height, width)``
* ``bias`` usually has shape ``(1, out_channels, *1)``
"""

def __init__(
@@ -877,8 +854,7 @@ class DeformableConv2d(_ConvNd):


class ConvTranspose3d(_ConvNd):
r"""
Applies a 3D transposed convolution over an input tensor.
r"""Applies a 3D transposed convolution over an input tensor.

Only support the case that groups = 1 and conv_mode = "cross_correlation".

@@ -889,23 +865,21 @@ class ConvTranspose3d(_ConvNd):
works the opposite way, transforming a smaller input to a larger output while
preserving the connectivity pattern.

:param in_channels: number of input channels.
:param out_channels: number of output channels.
:param kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is
an :class:`int`, the actual kernel size would be
``(kernel_size, kernel_size, kernel_size)``.
:param stride: stride of the 3D convolution operation. Default: 1
:param padding: size of the paddings added to the input on all sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 3D convolution operation. Default: 1
:param bias: whether to add a bias onto the result of convolution. Default:
True

.. note::

* ``weight`` usually has shape ``(in_channels, out_channels, depth, height, width)`` .
* ``bias`` usually has shape ``(1, out_channels, *1)``

Args:
in_channels: number of input channels.
out_channels: number of output channels.
kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is
an :class:`int`, the actual kernel size would be
``(kernel_size, kernel_size, kernel_size)``.
stride: stride of the 3D convolution operation. Default: 1
padding: size of the paddings added to the input on all sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
dilation: dilation of the 3D convolution operation. Default: 1
bias: whether to add a bias onto the result of convolution. Default: True

Note:
* ``weight`` usually has shape ``(in_channels, out_channels, depth, height, width)`` .
* ``bias`` usually has shape ``(1, out_channels, *1)``
"""

def __init__(


+ 2
- 4
imperative/python/megengine/module/conv_bn.py View File

@@ -50,8 +50,7 @@ class _ConvBnActivation2d(Module):


class ConvBn2d(_ConvBnActivation2d):
r"""
A fused :class:`~.Module` including :class:`~.module.Conv2d` and :class:`~.module.BatchNorm2d`.
r"""A fused :class:`~.Module` including :class:`~.module.Conv2d` and :class:`~.module.BatchNorm2d`.
Could be replaced with :class:`~.QATModule` version :class:`~.qat.ConvBn2d` using
:func:`~.quantize.quantize_qat`.
"""
@@ -61,8 +60,7 @@ class ConvBn2d(_ConvBnActivation2d):


class ConvBnRelu2d(_ConvBnActivation2d):
r"""
A fused :class:`~.Module` including :class:`~.module.Conv2d`, :class:`~.module.BatchNorm2d` and :func:`~.relu`.
r"""A fused :class:`~.Module` including :class:`~.module.Conv2d`, :class:`~.module.BatchNorm2d` and :func:`~.relu`.
Could be replaced with :class:`~.QATModule` version :class:`~.qat.ConvBnRelu2d` using :func:`~.quantize.quantize_qat`.
"""



+ 3
- 3
imperative/python/megengine/module/dropout.py View File

@@ -11,13 +11,13 @@ from .module import Module


class Dropout(Module):
r"""
Randomly sets input elements to zeros with the probability :math:`drop\_prob` during training.
r"""Randomly sets input elements to zeros with the probability :math:`drop\_prob` during training.
Commonly used in large networks to prevent overfitting.
Note that we perform dropout only during training; we also rescale (multiply) the output tensor
by :math:`\frac{1}{1 - drop\_prob}`. During inference :class:`~.Dropout` is equal to :class:`~.Identity`.

:param drop_prob: The probability to drop (set to zero) each single element
Args:
drop_prob: The probability to drop (set to zero) each single element
"""

def __init__(self, drop_prob=0.0, **kwargs):


+ 4
- 59
imperative/python/megengine/module/elemwise.py View File

@@ -11,67 +11,12 @@ from .module import Module


class Elemwise(Module):
r"""
A :class:`~.Module` to do :mod:`~.functional.elemwise` operator. Could be replaced with :class:`~.QATModule`
r"""A :class:`~.Module` to do :mod:`~.functional.elemwise` operator. Could be replaced with :class:`~.QATModule`
version :class:`~.qat.Elemwise` using :func:`~.quantize.quantize_qat`.

:param method: the elemwise method, support the following string.
It will do the normal elemwise operator for float.

* "add": a + b
* "fuse_add_relu": max(x+y, 0)
* "mul": x * y
* "min": min(x, y)
* "max": max(x, y)
* "sub": x - y
* "true_div": x / y
* "fuse_add_sigmoid": sigmoid(x + y)
* "fuse_add_tanh": tanh(x + y)
* "relu": x > 0 ? x : 0
* "silu": silu(x)
* "gelu": gelu(x)
* "abs": x > 0 ? x : -x
* "sigmoid": sigmoid(x)
* "exp": exp(x)
* "tanh": tanh(x)
* "fuse_mul_add3": x * y + z
* "fast_tanh": x * (27. + x * x) / (27. + 9. * x * x)
* "negate": -x
* "acos": acos(x)
* "asin": asin(x)
* "ceil": ceil(x)
* "cos": cos(x)
* "expm1": expm1(x)
* "floor": floor(x)
* "log": log(x)
* "log1p": log1p(x)
* "sin": sin(x)
* "round": round(x)
* "erf": erf(x)
* "erfinv": erfinv(x)
* "erfc": erfc(x)
* "erfcinv": erfcinv(x)
* "abs_grad": abs_grad
* "floor_div": floor_div
* "mod": mod
* "sigmoid_grad": sigmoid_grad
* "switch_gt0": switch_gt0
* "tanh_grad": tanh_grad
* "lt": less
* "leq": leq
* "eq": equal
* "pow": pow
* "log_sum_exp": log_sum_exp
* "fast_tanh_grad": fast_tanh_grad
* "atan2": atan2
* "cond_leq_mov": cond_leq_mov
* "h_swish": h_swish
* "fuse_add_h_swish": h_swish(x+y)
* "h_swish_grad": h_swish_grad
* "and": bool binary: x && y
* "or": bool binary: x || y
* "xor": bool binary: x ^ y
* "not": bool unary: ~x
Args:
method: the elemwise method, support the following string.
It will do the normal elemwise operator for float.
"""

def __init__(self, method, **kwargs):


+ 42
- 44
imperative/python/megengine/module/embedding.py View File

@@ -17,42 +17,41 @@ from .module import Module


class Embedding(Module):
r"""
A simple lookup table that stores embeddings of a fixed dictionary and size.
r"""A simple lookup table that stores embeddings of a fixed dictionary and size.

This module is often used to store word embeddings and retrieve them using indices.
The input to the module is a list of indices, and the output is the corresponding word embeddings.
The indices should be less than num_embeddings.

:param num_embeddings: size of embedding dictionary.
:param embedding_dim: size of each embedding vector.
:param padding_idx: should be set to None, not supported now.
:param max_norm: should be set to None, not supported now.
:param norm_type: should be set to None, not supported now.
:param initial_weight: the learnable weights of the module of shape (num_embeddings, embedding_dim).
Args:
num_embeddings: size of embedding dictionary.
embedding_dim: size of each embedding vector.
padding_idx: should be set to None, not supported now.
max_norm: should be set to None, not supported now.
norm_type: should be set to None, not supported now.
initial_weight: the learnable weights of the module of shape (num_embeddings, embedding_dim).

Examples:

.. testcode::

import numpy as np
import megengine as mge
import megengine.module as M
weight = mge.tensor(np.array([(1.2,2.3,3.4,4.5,5.6)], dtype=np.float32))
data = mge.tensor(np.array([(0,0)], dtype=np.int32))
.. testcode::

embedding = M.Embedding(1, 5, initial_weight=weight)
output = embedding(data)
with np.printoptions(precision=6):
print(output.numpy())
import numpy as np
import megengine as mge
import megengine.module as M
weight = mge.tensor(np.array([(1.2,2.3,3.4,4.5,5.6)], dtype=np.float32))
data = mge.tensor(np.array([(0,0)], dtype=np.int32))

Outputs:
embedding = M.Embedding(1, 5, initial_weight=weight)
output = embedding(data)
with np.printoptions(precision=6):
print(output.numpy())

.. testoutput::
Outputs:

[[[1.2 2.3 3.4 4.5 5.6]
[1.2 2.3 3.4 4.5 5.6]]]
.. testoutput::

[[[1.2 2.3 3.4 4.5 5.6]
[1.2 2.3 3.4 4.5 5.6]]]
"""

def __init__(
@@ -110,36 +109,35 @@ class Embedding(Module):
max_norm: Optional[float] = None,
norm_type: Optional[float] = None,
):
r"""
Creates Embedding instance from given 2-dimensional FloatTensor.
r"""Creates Embedding instance from given 2-dimensional FloatTensor.

:param embeddings: tensor containing the weights for the embedding.
:param freeze: if ``True``, the weight does not get updated during the learning process. Default: True.
:param padding_idx: should be set to None, not supported now.
:param max_norm: should be set to None, not supported now.
:param norm_type: should be set to None, not supported now.
Args:
embeddings: tensor containing the weights for the embedding.
freeze: if ``True``, the weight does not get updated during the learning process. Default: True.
padding_idx: should be set to None, not supported now.
max_norm: should be set to None, not supported now.
norm_type: should be set to None, not supported now.

Examples:

.. testcode::

import numpy as np
import megengine as mge
import megengine.module as M
weight = mge.tensor(np.array([(1.2,2.3,3.4,4.5,5.6)], dtype=np.float32))
data = mge.tensor(np.array([(0,0)], dtype=np.int32))
.. testcode::

embedding = M.Embedding.from_pretrained(weight, freeze=False)
output = embedding(data)
print(output.numpy())
import numpy as np
import megengine as mge
import megengine.module as M
weight = mge.tensor(np.array([(1.2,2.3,3.4,4.5,5.6)], dtype=np.float32))
data = mge.tensor(np.array([(0,0)], dtype=np.int32))

Outputs:
embedding = M.Embedding.from_pretrained(weight, freeze=False)
output = embedding(data)
print(output.numpy())

.. testoutput::
Outputs:

[[[1.2 2.3 3.4 4.5 5.6]
[1.2 2.3 3.4 4.5 5.6]]]
.. testoutput::

[[[1.2 2.3 3.4 4.5 5.6]
[1.2 2.3 3.4 4.5 5.6]]]
"""
embeddings_shape = embeddings.shape
embeddings_dim = len(embeddings_shape)


+ 3
- 3
imperative/python/megengine/module/external.py View File

@@ -19,7 +19,7 @@ from .module import Module

class TensorrtRuntimeSubgraph(Module):
r"""Load a serialized TensorrtRuntime subgraph.
See :func:`~.tensorrt_runtime_opr` for more details.
"""

@@ -41,7 +41,7 @@ class TensorrtRuntimeSubgraph(Module):

class CambriconRuntimeSubgraph(Module):
r"""Load a serialized CambriconRuntime subgraph.
See :func:`~.cambricon_runtime_opr` for more details.
"""

@@ -68,7 +68,7 @@ class CambriconRuntimeSubgraph(Module):

class AtlasRuntimeSubgraph(Module):
r"""Load a serialized AtlasRuntime subgraph.
See :func:`~.atlas_runtime_opr` for more details.
"""



+ 74
- 71
imperative/python/megengine/module/init.py View File

@@ -18,53 +18,53 @@ from ..tensor import Tensor


def fill_(tensor: Tensor, val: Union[float, int]) -> None:
"""
Fills the given ``tensor`` with value ``val``.
"""Fills the given ``tensor`` with value ``val``.

:param tensor: tensor to be initialized.
:param val: value to be filled throughout the tensor.
Args:
tensor: tensor to be initialized.
val: value to be filled throughout the tensor.
"""
tensor._reset(full(shape=tensor.shape, value=val, dtype=tensor.dtype))


def zeros_(tensor: Tensor) -> None:
"""
Fills the given ``tensor`` with scalar value `0`.
"""Fills the given ``tensor`` with scalar value `0`.

:param tensor: tensor to be initialized.
Args:
tensor: tensor to be initialized.
"""
fill_(tensor, 0)


def ones_(tensor: Tensor) -> None:
"""
Fills the given ``tensor`` with the scalar value `1`.
"""Fills the given ``tensor`` with the scalar value `1`.

:param tensor: tensor to be initialized.
Args:
tensor: tensor to be initialized.
"""
fill_(tensor, 1)


def uniform_(tensor: Tensor, a: float = 0.0, b: float = 1.0) -> None:
r"""
Fills the given ``tensor`` with random value sampled from uniform distribution
r"""Fills the given ``tensor`` with random value sampled from uniform distribution
:math:`\mathcal{U}(\text{a}, \text{b})`.

:param tensor: tensor to be initialized.
:param a: lower bound of the sampling interval.
:param b: upper bound of the sampling interval.
Args:
tensor: tensor to be initialized.
a: lower bound of the sampling interval.
b: upper bound of the sampling interval.
"""
tensor._reset(uniform(size=tensor.shape, low=a, high=b).astype(tensor.dtype))


def normal_(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> None:
r"""
Fills the given ``tensor`` with random value sampled from normal distribution
r"""Fills the given ``tensor`` with random value sampled from normal distribution
:math:`\mathcal{N}(\text{mean}, \text{std}^2)`.

:param tensor: tensor to be initialized.
:param mean: mean of the normal distribution.
:param std: standard deviation of the normal distribution.
Args:
tensor: tensor to be initialized.
mean: mean of the normal distribution.
std: standard deviation of the normal distribution.
"""
tensor._reset(normal(size=tensor.shape, mean=mean, std=std).astype(tensor.dtype))
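The fill-style initializers above all mutate their tensor in place; a minimal sketch (the ``M.init`` access path follows the ``M.init.zero_`` usage shown later in this same diff):

import numpy as np
import megengine as mge
import megengine.module as M

w = mge.Parameter(np.zeros((2, 2), dtype="float32"))
M.init.ones_(w)                        # every element set to 1
M.init.uniform_(w, a=-0.1, b=0.1)      # resampled from U(-0.1, 0.1)
M.init.normal_(w, mean=0.0, std=0.02)  # resampled from N(0, 0.02^2)
print(w.numpy().shape)                 # (2, 2)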

@@ -72,10 +72,9 @@ def normal_(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> None:
def calculate_gain(
nonlinearity: str, param: Optional[Union[int, float]] = None
) -> float:
r"""
Returns a recommended gain value (see the table below) for the given nonlinearity
r"""Returns a recommended gain value (see the table below) for the given nonlinearity
function.
================= ====================================================
nonlinearity gain
================= ====================================================
@@ -87,10 +86,10 @@ def calculate_gain(
Leaky Relu :math:`\sqrt{\frac{2}{1 + {\text{negative}_\text{slope}}^2}}`
================= ====================================================

:param nonlinearity: name of the non-linear function.
:param param: optional parameter for leaky_relu. Only effective when
``nonlinearity`` is "leaky_relu".
Args:
nonlinearity: name of the non-linear function.
param: optional parameter for leaky_relu. Only effective when
``nonlinearity`` is "leaky_relu".
"""
linear_fns = [
"linear",
@@ -124,11 +123,11 @@ def calculate_gain(


def calculate_fan_in_and_fan_out(tensor: Tensor) -> Tuple[float, float]:
"""
Calculates fan_in / fan_out value for given weight tensor. This function assumes
r"""Calculates fan_in / fan_out value for given weight tensor. This function assumes
input tensor is stored in ``NCHW`` format.

:param tensor: weight tensor in ``NCHW`` format.
Args:
tensor: weight tensor in ``NCHW`` format.
"""
shape = tensor.shape
ndim = len(shape)
@@ -153,14 +152,14 @@ def calculate_fan_in_and_fan_out(tensor: Tensor) -> Tuple[float, float]:


def calculate_correct_fan(tensor: Tensor, mode: str) -> float:
"""
Calculates fan_in / fan_out value for given weight tensor, depending on given
r"""Calculates fan_in / fan_out value for given weight tensor, depending on given
``mode``.
See :func:`calculate_fan_in_and_fan_out` for details.

:param tensor: weight tensor in ``NCHW`` format.
:param mode: "fan_in" or "fan_out".
Args:
tensor: weight tensor in ``NCHW`` format.
mode: fan_in" or "fan_out".
"""
mode = mode.lower()
valid_modes = ["fan_in", "fan_out"]
@@ -174,19 +173,20 @@ def calculate_correct_fan(tensor: Tensor, mode: str) -> float:


def xavier_uniform_(tensor: Tensor, gain: float = 1.0) -> None:
r"""
Fills tensor with random values sampled from :math:`\mathcal{U}(-a, a)`
r"""Fills tensor with random values sampled from :math:`\mathcal{U}(-a, a)`
where
.. math::
a = \text{gain} \times \sqrt{\frac{6}{\text{fan_in} + \text{fan_out}}}

a = \text{gain} \times \sqrt{\frac{6}{\text{fan_in} + \text{fan_out}}}
Also known as Glorot initialization. Detailed information can be retrieved from
`Understanding the difficulty of training deep feedforward neural networks` -
Glorot, X. & Bengio, Y. (2010).

:param tensor: tensor to be initialized.
:param gain: scaling factor for :math:`a`.
Args:
tensor: tensor to be initialized.
gain: scaling factor for :math:`a`.
"""
fan_in, fan_out = calculate_fan_in_and_fan_out(tensor)
std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
@@ -195,19 +195,20 @@ def xavier_uniform_(tensor: Tensor, gain: float = 1.0) -> None:


def xavier_normal_(tensor: Tensor, gain: float = 1.0) -> None:
r"""
Fills tensor with random values sampled from
r"""Fills tensor with random values sampled from
:math:`\mathcal{N}(0, \text{std}^2)` where
.. math::
\text{std} = \text{gain} \times \sqrt{\frac{2}{\text{fan_in} + \text{fan_out}}}

\text{std} = \text{gain} \times \sqrt{\frac{2}{\text{fan_in} + \text{fan_out}}}
Also known as Glorot initialization. Detailed information can be retrieved from
`Understanding the difficulty of training deep feedforward neural networks` -
Glorot, X. & Bengio, Y. (2010).

:param tensor: tensor to be initialized.
:param gain: scaling factor for :math:`std`.
Args:
tensor: tensor to be initialized.
gain: scaling factor for :math:`std`.
"""
fan_in, fan_out = calculate_fan_in_and_fan_out(tensor)
std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
@@ -217,25 +218,26 @@ def xavier_normal_(tensor: Tensor, gain: float = 1.0) -> None:
def msra_uniform_(
tensor: Tensor, a: float = 0, mode: str = "fan_in", nonlinearity: str = "leaky_relu"
) -> None:
r"""
Fills tensor with random values sampled from
r"""Fills tensor with random values sampled from
:math:`\mathcal{U}(-\text{bound}, \text{bound})` where
.. math::
\text{bound} = \sqrt{\frac{6}{(1 + a^2) \times \text{fan_in}}}

\text{bound} = \sqrt{\frac{6}{(1 + a^2) \times \text{fan_in}}}
Detailed information can be retrieved from
`Delving deep into rectifiers: Surpassing human-level performance on ImageNet
classification`

:param tensor: tensor to be initialized.
:param a: optional parameter for calculating gain for leaky_relu. See
:func:`calculate_gain` for details.
:param mode: "fan_in" or "fan_out", used to calculate :math:`gain`, the
scaling factor for :math:`bound`. See :func:`calculate_fan_in_and_fan_out` for
details.
:param nonlinearity: name of the non-linear function used to calculate :math:`gain`.
See :func:`calculate_gain` for details.
Args:
tensor: tensor to be initialized.
a: optional parameter for calculating gain for leaky_relu. See
:func:`calculate_gain` for details.
mode: fan_in" or "fan_out", used to calculate :math:`gain`, the
scaling factor for :math:`bound`. See :func:`calculate_fan_in_and_fan_out` for
details.
nonlinearity: name of the non-linear function used to calculate :math:`gain`.
See :func:`calculate_gain` for details.
"""
fan = calculate_correct_fan(tensor, mode)
gain = calculate_gain(nonlinearity, a)
@@ -247,25 +249,26 @@ def msra_uniform_(
def msra_normal_(
tensor: Tensor, a: float = 0, mode: str = "fan_in", nonlinearity: str = "leaky_relu"
) -> None:
r"""
Fills tensor with random values sampled from
r"""Fills tensor with random values sampled from
:math:`\mathcal{N}(0, \text{std}^2)` where
.. math::
\text{std} = \sqrt{\frac{2}{(1 + a^2) \times \text{fan_in}}}

\text{std} = \sqrt{\frac{2}{(1 + a^2) \times \text{fan_in}}}
Detailed information can be retrieved from
`Delving deep into rectifiers: Surpassing human-level performance on ImageNet
classification`

:param tensor: tensor to be initialized
:param a: optional parameter for calculating gain for leaky_relu. See
:func:`calculate_gain` for details.
:param mode: "fan_in" or "fan_out", used to calculate :math:`gain`, the
scaling factor for :math:`gain`. See :func:`calculate_fan_in_and_fan_out` for
details.
:param nonlinearity: name of the non-linear function used to calculate :math:`gain`.
See :func:`calculate_gain` for details.
Args:
tensor: tensor to be initialized
a: optional parameter for calculating gain for leaky_relu. See
:func:`calculate_gain` for details.
mode: fan_in" or "fan_out", used to calculate :math:`gain`, the
scaling factor for :math:`gain`. See :func:`calculate_fan_in_and_fan_out` for
details.
nonlinearity: name of the non-linear function used to calculate :math:`gain`.
See :func:`calculate_gain` for details.
"""
fan = calculate_correct_fan(tensor, mode)
gain = calculate_gain(nonlinearity, a)
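The gain-based initializers follow the same in-place pattern; a hedged sketch using a conv-shaped weight so that fan_in/fan_out can be derived from the NCHW layout (the std comment applies the formula quoted in the docstring above and is an expectation, not measured output):

import numpy as np
import megengine as mge
import megengine.module as M

w = mge.Parameter(np.zeros((8, 4, 3, 3), dtype="float32"))  # out_channels, in_channels, kh, kw
M.init.xavier_uniform_(w, gain=1.0)
M.init.msra_normal_(w, mode="fan_in", nonlinearity="relu")
print(w.numpy().std())  # expected to be roughly sqrt(2 / 36) ~= 0.24 after msra_normal_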


+ 18
- 18
imperative/python/megengine/module/linear.py View File

@@ -14,8 +14,7 @@ from .module import Module


class Linear(Module):
r"""
Applies a linear transformation to the input. For instance, if input
r"""Applies a linear transformation to the input. For instance, if input
is x, then output y is:

.. math::
@@ -24,30 +23,31 @@ class Linear(Module):

where :math:`y_i= \sum_j W_{ij} x_j + b_i`

:param in_features: size of each input sample.
:param out_features: size of each output sample.
:param bias: if it's ``False``, the layer will not learn an additional ``bias``.
Default: ``True``
Args:
in_features: size of each input sample.
out_features: size of each output sample.
bias: if it's ``False``, the layer will not learn an additional ``bias``.
Default: ``True``

Examples:

.. testcode::
Examples:

import numpy as np
import megengine as mge
import megengine.module as M
.. testcode::

m = M.Linear(in_features=3, out_features=1)
inp = mge.tensor(np.arange(0, 6).astype("float32").reshape(2, 3))
oup = m(inp)
print(oup.numpy().shape)
import numpy as np
import megengine as mge
import megengine.module as M

Outputs:
m = M.Linear(in_features=3, out_features=1)
inp = mge.tensor(np.arange(0, 6).astype("float32").reshape(2, 3))
oup = m(inp)
print(oup.numpy().shape)

.. testoutput::
Outputs:

(2, 1)
.. testoutput::

(2, 1)
"""

def __init__(


+ 65
- 81
imperative/python/megengine/module/module.py View File

@@ -84,15 +84,14 @@ def _get_XNorm_typeclass():


class Module(metaclass=ABCMeta):
"""
Base Module class.
r"""Base Module class.

Args:
name: module's name, can be initialized by the ``kwargs`` parameter
of child class.
"""

def __init__(self, name=None):
"""
:param name: module's name, can be initialized by the ``kwargs`` parameter
of child class.
"""
self._modules = []

if name is not None:
@@ -118,18 +117,19 @@ class Module(metaclass=ABCMeta):
pass

def register_forward_pre_hook(self, hook: Callable) -> HookHandler:
"""
Registers a hook to handle forward inputs. `hook` should be a function.
"""Registers a hook to handle forward inputs. `hook` should be a function.

:param hook: a function that receives `module` and `inputs`, then returns
a modified `inputs` or `None`.
:return: a handler with :meth:`~.HookHandler.remove` interface to delete the hook.
Args:
hook: a function that receives `module` and `inputs`, then returns
a modified `inputs` or `None`.

Returns:
a handler with :meth:`~.HookHandler.remove` interface to delete the hook.
"""
return HookHandler(self._forward_pre_hooks, hook)
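A hedged sketch of the hook mechanism described above: the pre-hook may rewrite the inputs before forward runs, and the returned handler removes it again (the hook body is a hypothetical illustration):

import numpy as np
import megengine as mge
import megengine.module as M

m = M.Linear(in_features=3, out_features=1)

def double_inputs(module, inputs):
    # hypothetical hook: scale every incoming tensor before forward runs
    return tuple(x * 2 for x in inputs)

handle = m.register_forward_pre_hook(double_inputs)
out = m(mge.tensor(np.ones((2, 3), dtype="float32")))
handle.remove()  # detach the hook once it is no longer needed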

def register_forward_hook(self, hook: Callable) -> HookHandler:
"""
Registers a hook to handle forward results. `hook` should be a function that
"""Registers a hook to handle forward results. `hook` should be a function that
receives `module`, `inputs` and `outputs`, then returns a modified `outputs` or `None`.

This method returns a handler with :meth:`~.HookHandler.remove` interface to delete the hook.
@@ -164,19 +164,19 @@ class Module(metaclass=ABCMeta):
predicate: Callable[[Any], bool] = lambda _: True,
seen: Optional[Set[int]] = None
) -> Union[Iterable[Any], Iterable[Tuple[str, Any]]]:
"""
Scans the module object and returns an iterable for the :class:`~.Tensor`
"""Scans the module object and returns an iterable for the :class:`~.Tensor`
and :class:`~.Module` attributes that agree with the ``predicate``. For multiple
calls of this function with same arguments, the order of objects within the
returned iterable is guaranteed to be identical, as long as all the involved
module objects' ``__dict__`` does not change throughout those calls.

:param recursive: whether to recursively scan all the submodules.
:param with_key: whether to yield keys along with yielded objects.
:param with_parent: whether to yield ``self`` along with yielded objects.
:param prefix: prefix appended to the yielded keys.
:param predicate: the predication function applied to scanned objects.
:param seen: a dict that records whether a module has been traversed yet.
Args:
recursive: whether to recursively scan all the submodules.
with_key: whether to yield keys along with yielded objects.
with_parent: whether to yield ``self`` along with yielded objects.
prefix: prefix appended to the yielded keys.
predicate: the predication function applied to scanned objects.
seen: a dict that records whether a module has been traversed yet.
"""
if seen is None:
seen = set([id(self)])
@@ -212,12 +212,12 @@ class Module(metaclass=ABCMeta):
)

def parameters(self, recursive: bool = True, **kwargs) -> Iterable[Parameter]:
r"""
Returns an iterable for the :class:`~.Parameter` of the module.
r"""Returns an iterable for the :class:`~.Parameter` of the module.

:param recursive: If ``True``, returns all :class:`~.Parameter` within this
module, else only returns :class:`~.Parameter` that are direct attributes
of this module.
Args:
recursive: If ``True``, returns all :class:`~.Parameter` within this
module, else only returns :class:`~.Parameter` that are direct attributes
of this module.
"""

if "requires_grad" in kwargs:
@@ -237,14 +237,14 @@ class Module(metaclass=ABCMeta):
def named_parameters(
self, prefix: Optional[str] = None, recursive: bool = True, **kwargs
) -> Iterable[Tuple[str, Parameter]]:
"""
Returns an iterable for key :class:`~.Parameter` pairs of the module, where
r"""Returns an iterable for key :class:`~.Parameter` pairs of the module, where
``key`` is the dotted path from this module to the :class:`~.Parameter`.

:param prefix: prefix prepended to the keys.
:param recursive: if ``True``, returns all :class:`~.Parameter` within this
module, else only returns :class:`~.Parameter` that are direct attributes
of this module.
Args:
prefix: prefix prepended to the keys.
recursive: if ``True``, returns all :class:`~.Parameter` within this
module, else only returns :class:`~.Parameter` that are direct attributes
of this module.
"""

if "requires_grad" in kwargs:
@@ -266,14 +266,13 @@ class Module(metaclass=ABCMeta):
)

def buffers(self, recursive: bool = True, **kwargs) -> Iterable[Tensor]:
"""
Returns an iterable for the buffers of the module.
r"""Returns an iterable for the buffers of the module.

Buffer is defined to be :class:`~.Tensor` excluding :class:`~.Parameter`.

:param recursive: if ``True``, returns all buffers within this
module, else only returns buffers that are direct attributes
of this module.
Args:
recursive: if ``True``, returns all buffers within this
module, else only returns buffers that are direct attributes of this module.
"""
yield from self._flatten(
with_key=False, predicate=_is_buffer, recursive=recursive, **kwargs
@@ -282,16 +281,17 @@ class Module(metaclass=ABCMeta):
def named_buffers(
self, prefix: Optional[str] = None, recursive: bool = True, **kwargs
) -> Iterable[Tuple[str, Tensor]]:
"""
Returns an iterable for key buffer pairs of the module, where
r"""Returns an iterable for key buffer pairs of the module, where
``key`` is the dotted path from this module to the buffer.

Buffer is defined to be :class:`~.Tensor` excluding :class:`~.Parameter`.

:param prefix: prefix prepended to the keys.
:param recursive: if ``True``, returns all buffers within this
module, else only returns buffers that are direct attributes
of this module.
Args:
prefix: prefix prepended to the keys.
recursive: if ``True``, returns all buffers within this
module, else only returns buffers that are direct attributes
of this module.
"""
yield from self._flatten(
with_key=True,
@@ -302,8 +302,7 @@ class Module(metaclass=ABCMeta):
)

def children(self, **kwargs) -> "Iterable[Module]":
"""
Returns an iterable for all the submodules that are direct attributes of this
r"""Returns an iterable for all the submodules that are direct attributes of this
module.
"""
yield from self._flatten(
@@ -311,8 +310,7 @@ class Module(metaclass=ABCMeta):
)

def named_children(self, **kwargs) -> "Iterable[Tuple[str, Module]]":
"""
Returns an iterable of key-submodule pairs for all the submodules that are
r"""Returns an iterable of key-submodule pairs for all the submodules that are
direct attributes of this module, where 'key' is the attribute name of
submodules.
"""
@@ -321,9 +319,7 @@ class Module(metaclass=ABCMeta):
)

def modules(self, **kwargs) -> "Iterable[Module]":
"""
Returns an iterable for all the modules within this module, including itself.
"""
r"""Returns an iterable for all the modules within this module, including itself."""
if "with_parent" in kwargs and kwargs["with_parent"]:
yield self, None
else:
@@ -333,12 +329,12 @@ class Module(metaclass=ABCMeta):
def named_modules(
self, prefix: Optional[str] = None, **kwargs
) -> "Iterable[Tuple[str, Module]]":
"""
Returns an iterable of key-module pairs for all the modules within this
r"""Returns an iterable of key-module pairs for all the modules within this
module, including itself, where 'key' is the dotted path from this module to the
submodules.

:param prefix: prefix prepended to the path.
Args:
prefix: prefix prepended to the path.
"""
if "with_parent" in kwargs and kwargs["with_parent"]:
yield ("" if prefix is None else prefix), self, None
@@ -349,33 +345,31 @@ class Module(metaclass=ABCMeta):
)

def apply(self, fn: "Callable[[Module], Any]") -> None:
"""
Applies function ``fn`` to all the modules within this module, including
r"""Applies function ``fn`` to all the modules within this module, including
itself.

:param fn: the function to be applied on modules.
Args:
fn: the function to be applied on modules.
"""
for it in self.modules():
fn(it)
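A small example of ``apply`` combined with the traversal helpers documented above; the weight-initialisation body is a hypothetical illustration of the pattern, not code from this diff:

import megengine.module as M

net = M.Sequential(M.Linear(4, 8), M.Linear(8, 2))

def init_linear(m):
    if isinstance(m, M.Linear):
        M.init.msra_uniform_(m.weight)

net.apply(init_linear)                 # visits every submodule, including net itself
for name, mod in net.named_modules():  # dotted paths to each module
    print(name, type(mod).__name__)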

@deprecated(version="1.0")
def zero_grad(self) -> None:
"""
Sets all parameters' grads to zero
"""
r"""Sets all parameters' grads to zero"""
for param in self.parameters():
if param.grad is not None:
param.grad.reset_zero()

def train(self, mode: bool = True, recursive: bool = True) -> None:
"""
Sets training mode of all the modules within this module (including itself) to
r"""Sets training mode of all the modules within this module (including itself) to
``mode``. This effectively sets the ``training`` attributes of those modules
to ``mode``, but only has effect on certain modules (e.g.
:class:`~.BatchNorm2d`, :class:`~.Dropout`, :class:`~.Observer`)

:param mode: the training mode to be set on modules.
:param recursive: whether to recursively call submodules' ``train()``.
Args:
mode: the training mode to be set on modules.
recursive: whether to recursively call submodules' ``train()``.
"""
if not recursive:
self.training = mode
@@ -387,15 +381,13 @@ class Module(metaclass=ABCMeta):
self.apply(fn)

def eval(self) -> None:
"""
Sets training mode of all the modules within this module (including itself) to
r"""Sets training mode of all the modules within this module (including itself) to
``False``. See :meth:`~.Module.train` for details.
"""
self.train(False)

def disable_quantize(self, value=True):
r"""
Sets ``module``'s ``quantize_disabled`` attribute and return ``module``.
r"""Sets ``module``'s ``quantize_disabled`` attribute and return ``module``.
Could be used as a decorator.
"""

@@ -408,8 +400,7 @@ class Module(metaclass=ABCMeta):
def replace_param(
self, params: dict, start_pos: int, seen: Optional[Set[int]] = None
):
"""
Replaces module's parameters with ``params``, used by :class:`~.ParamPack` to
r"""Replaces module's parameters with ``params``, used by :class:`~.ParamPack` to
speedup multimachine training.
"""
offset = 0
@@ -447,9 +438,7 @@ class Module(metaclass=ABCMeta):
return rst

def _state_dict(self, rst=None, prefix="", keep_var=False):
r"""
Returns a dictionary containing whole states of the module.
"""
r"""Returns a dictionary containing whole states of the module."""

def is_state(obj):
return _is_parameter(obj) or _is_buffer(obj)
@@ -479,8 +468,7 @@ class Module(metaclass=ABCMeta):
state_dict: Union[dict, Callable[[str, Tensor], Optional[np.ndarray]]],
strict=True,
):
r"""
Loads a given dictionary created by :func:`state_dict` into this module.
r"""Loads a given dictionary created by :func:`state_dict` into this module.
If ``strict`` is ``True``, the keys of :func:`state_dict` must exactly match the keys
returned by :func:`state_dict`.

@@ -515,8 +503,7 @@ class Module(metaclass=ABCMeta):
if 'bias' in k:
M.init.zeros_(v)
if 'conv' in k:
return v.numpy() * (np.abs(v.numpy()) > 1e-3).astype("float32")
model.load_state_dict(reinit_and_pruning, strict=False)

"""
unused = []
if isinstance(state_dict, dict):
@@ -558,8 +545,7 @@ class Module(metaclass=ABCMeta):
)

def _load_state_dict_with_closure(self, closure):
"""
Advance state_dict load through callable ``closure`` whose signature is
r"""Advance state_dict load through callable ``closure`` whose signature is
``closure(key: str, var: Tensor) -> Union[np.ndarray, None]``
"""
XNorm_typeclass = _get_XNorm_typeclass()
@@ -642,9 +628,7 @@ class Module(metaclass=ABCMeta):
super().__delattr__(name)

def _module_info_string(self) -> str:
r"""
Set the extra representation of the module.
"""
r"""Set the extra representation of the module."""
return ""

def __repr__(self):


+ 3
- 6
imperative/python/megengine/module/normalization.py View File

@@ -15,8 +15,7 @@ from .module import Module


class GroupNorm(Module):
"""
Simple implementation of GroupNorm. Only supports 4d tensors now.
"""Simple implementation of GroupNorm. Only supports 4d tensors now.
Reference: https://arxiv.org/pdf/1803.08494.pdf.
"""

@@ -64,8 +63,7 @@ class GroupNorm(Module):


class InstanceNorm(Module):
"""
Simple implementation of InstanceNorm. Only supports 4d tensors now.
"""Simple implementation of InstanceNorm. Only supports 4d tensors now.
Reference: https://arxiv.org/abs/1607.08022.
Note that InstanceNorm is equivalent to using GroupNorm with num_groups=num_channels.
"""
@@ -108,8 +106,7 @@ class InstanceNorm(Module):


class LayerNorm(Module):
"""
Simple implementation of LayerNorm. Supports tensors of any shape as input.
"""Simple implementation of LayerNorm. Supports tensors of any shape as input.
Reference: https://arxiv.org/pdf/1803.08494.pdf.
"""



+ 26
- 47
imperative/python/megengine/module/pooling.py View File

@@ -37,14 +37,14 @@ class _PoolNd(Module):


class MaxPool2d(_PoolNd):
r"""
Applies a 2D max pooling over an input.
r"""Applies a 2D max pooling over an input.

For instance, given an input of the size :math:`(N, C, H, W)` and
:attr:`kernel_size` :math:`(kH, kW)`, this layer generates the output of
the size :math:`(N, C, H_{out}, W_{out})` through a process described as:

.. math::

\begin{aligned}
out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1}
\text{input}(N_i, C_j, \text{stride[0]} \times h + m,
@@ -54,30 +54,30 @@ class MaxPool2d(_PoolNd):
If :attr:`padding` is non-zero, then the input is implicitly zero-padded on
both sides for :attr:`padding` number of points.

:param kernel_size: the size of the window to take a max over.
:param stride: the stride of the window. Default value is kernel_size.
:param padding: implicit zero padding to be added on both sides.
Args:
kernel_size: the size of the window to take a max over.
stride: the stride of the window. Default value is kernel_size.
padding: implicit zero padding to be added on both sides.

Examples:

.. testcode::

import numpy as np
import megengine as mge
import megengine.module as M
.. testcode::

m = M.MaxPool2d(kernel_size=3, stride=1, padding=0)
inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4))
oup = m(inp)
print(oup.numpy())
import numpy as np
import megengine as mge
import megengine.module as M

Outputs:
m = M.MaxPool2d(kernel_size=3, stride=1, padding=0)
inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4))
oup = m(inp)
print(oup.numpy())

.. testoutput::
Outputs:

[[[[10. 11.]
[14. 15.]]]]
.. testoutput::

[[[[10. 11.]
[14. 15.]]]]
"""

def forward(self, inp):
@@ -85,8 +85,7 @@ class MaxPool2d(_PoolNd):


class AvgPool2d(_PoolNd):
r"""
Applies a 2D average pooling over an input.
r"""Applies a 2D average pooling over an input.

For instance, given an input of the size :math:`(N, C, H, W)` and
:attr:`kernel_size` :math:`(kH, kW)`, this layer generates the output of
@@ -100,33 +99,13 @@ class AvgPool2d(_PoolNd):
If :attr:`padding` is non-zero, then the input is implicitly zero-padded on
both sides for :attr:`padding` number of points.

:param kernel_size: the size of the window.
:param stride: the stride of the window. Default value is kernel_size.
:param padding: implicit zero padding to be added on both sides.
:param mode: whether to count padding values. "average" mode will do counting and
"average_count_exclude_padding" mode won't do counting.
Default: "average_count_exclude_padding"

Examples:

.. testcode::

import numpy as np
import megengine as mge
import megengine.module as M

m = M.AvgPool2d(kernel_size=3, stride=1, padding=0)
inp = mge.tensor(np.arange(0, 16).astype("float32").reshape(1, 1, 4, 4))
oup = m(inp)
print(oup.numpy())

Outputs:

.. testoutput::

[[[[ 5. 6.]
[ 9. 10.]]]]

Args:
kernel_size: the size of the window.
stride: the stride of the window. Default value is kernel_size.
padding: implicit zero padding to be added on both sides.
mode: whether to count padding values. "average" mode will do counting and
"average_count_exclude_padding" mode won't do counting.
Default: "average_count_exclude_padding"
"""

def __init__(


+ 1
- 3
imperative/python/megengine/module/qat/batch_matmul_activation.py View File

@@ -10,9 +10,7 @@ from .module import QATModule


class BatchMatMulActivation(Float.BatchMatMulActivation, QATModule):
r"""
A :class:`~.QATModule` :class:`~.module.BatchMatMulActivation` with QAT support.
"""
r"""A :class:`~.QATModule` :class:`~.module.BatchMatMulActivation` with QAT support."""

def forward(self, inp):
w_qat = self.apply_quant_weight(self.weight)


+ 1
- 6
imperative/python/megengine/module/qat/concat.py View File

@@ -13,8 +13,7 @@ from .module import QATModule


class Concat(Float.Concat, QATModule):
r"""
A :class:`~.QATModule` to do functional :func:`~.concat` with QAT support.
r"""A :class:`~.QATModule` to do functional :func:`~.concat` with QAT support.
Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`.
"""

@@ -23,8 +22,4 @@ class Concat(Float.Concat, QATModule):

@classmethod
def from_float_module(cls, float_module):
r"""
Return a :class:`~.QATModule` instance converted from
a float :class:`~.Module` instance.
"""
return cls(name=float_module.name)

+ 3
- 6
imperative/python/megengine/module/qat/conv.py View File

@@ -11,8 +11,7 @@ from .module import QATModule


class Conv2d(Float.Conv2d, QATModule):
r"""
A :class:`~.QATModule` :class:`~.module.Conv2d` with QAT support.
r"""A :class:`~.QATModule` :class:`~.module.Conv2d` with QAT support.
Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`.
"""

@@ -50,8 +49,7 @@ class Conv2d(Float.Conv2d, QATModule):


class ConvRelu2d(Conv2d):
r"""
A :class:`~.QATModule` include :class:`~.module.Conv2d` and :func:`~.relu` with QAT support.
r"""A :class:`~.QATModule` include :class:`~.module.Conv2d` and :func:`~.relu` with QAT support.
Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`.
"""

@@ -60,8 +58,7 @@ class ConvRelu2d(Conv2d):


class ConvTranspose2d(Float.ConvTranspose2d, QATModule):
r"""
A :class:`~.QATModule` :class:`~.module.ConvTranspose2d` with QAT support.
r"""A :class:`~.QATModule` :class:`~.module.ConvTranspose2d` with QAT support.
Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`.
"""



+ 2
- 8
imperative/python/megengine/module/qat/conv_bn.py View File

@@ -136,10 +136,6 @@ class _ConvBnActivation2d(Float._ConvBnActivation2d, QATModule):

@classmethod
def from_float_module(cls, float_module: Float._ConvBnActivation2d):
r"""
Return a :class:`~.QATModule` instance converted from
a float :class:`~.Module` instance.
"""
qat_module = cls(
float_module.conv.in_channels,
float_module.conv.out_channels,
@@ -160,8 +156,7 @@ class _ConvBnActivation2d(Float._ConvBnActivation2d, QATModule):


class ConvBn2d(_ConvBnActivation2d):
r"""
A fused :class:`~.QATModule` including :class:`~.module.Conv2d` and :class:`~.module.BatchNorm2d` with QAT support.
r"""A fused :class:`~.QATModule` including :class:`~.module.Conv2d` and :class:`~.module.BatchNorm2d` with QAT support.
Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`.
"""

@@ -170,8 +165,7 @@ class ConvBn2d(_ConvBnActivation2d):


class ConvBnRelu2d(_ConvBnActivation2d):
r"""
A fused :class:`~.QATModule` including :class:`~.module.Conv2d`, :class:`~.module.BatchNorm2d` and :func:`~.relu` with QAT support.
r"""A fused :class:`~.QATModule` including :class:`~.module.Conv2d`, :class:`~.module.BatchNorm2d` and :func:`~.relu` with QAT support.
Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`.
"""



+ 1
- 4
imperative/python/megengine/module/qat/elemwise.py View File

@@ -10,11 +10,8 @@ from .module import QATModule


class Elemwise(Float.Elemwise, QATModule):
r"""
A :class:`~.QATModule` to do :mod:`~.functional.elemwise` operator with QAT support.
r"""A :class:`~.QATModule` to do :mod:`~.functional.elemwise` operator with QAT support.
Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`.

:param method: the elemwise method, see :class:`~.module.Elemwise` for detail.
"""

with_weight = False


+ 6
- 7
imperative/python/megengine/module/qat/linear.py View File

@@ -10,15 +10,14 @@ from .module import QATModule


class Linear(Float.Linear, QATModule):
r"""
A :class:`~.QATModule` version of :class:`~.module.Linear`.
r"""A :class:`~.QATModule` version of :class:`~.module.Linear`.
Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`.

:param in_features: size of each input sample.
:param out_features: size of each output sample.
:param bias: If set to ``False``, the layer will not learn an additive bias.
Default: True
Args:
in_features: size of each input sample.
out_features: size of each output sample.
bias: If set to ``False``, the layer will not learn an additive bias.
Default: True
"""

def forward(self, inp):


+ 12
- 28
imperative/python/megengine/module/qat/module.py View File

@@ -17,12 +17,11 @@ from ..module import Module


class QATModule(Module):
r"""
Base class of quantized-float related :class:`~.Module`, basically for QAT and Calibration.

r"""Base class of quantized-float related :class:`~.Module`, basically for QAT and Calibration.
Use :meth:`from_float_module` to generate an instance from a float :class:`~.Module`.
Or use :func:`~.quantize.quantize_qat` to do it recursively and automatically.
Can also be converted to :class:`~.QuantizedModule` for deployment using
:func:`~.quantize.quantize` further.
"""
@@ -43,8 +42,7 @@ class QATModule(Module):
return "QAT." + super().__repr__()

def set_qconfig(self, qconfig: QConfig):
r"""
Set quantization related configs with ``qconfig``, including
r"""Set quantization related configs with ``qconfig``, including
observer and fake_quant for weight and activation.
"""

@@ -96,24 +94,19 @@ class QATModule(Module):
return oup

def apply_quant_weight(self, target: Tensor):
r"""
Apply weight's observer and fake_quant from ``qconfig`` on ``target``.
"""
r"""Apply weight's observer and fake_quant from ``qconfig`` on ``target``."""
return self._apply_fakequant_with_observer(
target, self.weight_fake_quant, self.weight_observer
)

def apply_quant_activation(self, target: Tensor):
r"""
Apply weight's observer and fake_quant from ``qconfig`` on ``target``.
"""
r"""Apply weight's observer and fake_quant from ``qconfig`` on ``target``."""
return self._apply_fakequant_with_observer(
target, self.act_fake_quant, self.act_observer
)

def apply_quant_bias(self, target: Tensor, inp: Tensor, w_qat: Tensor):
r"""
Use :func:`~.fake_quant_bias` to process ``target``. Only valid when
r"""Use :func:`~.fake_quant_bias` to process ``target``. Only valid when
``act_fake_quant`` and ``weight_fake_quant`` are both enabled.
"""
# bias should have the same dtype as activation, so act_fake_quant can also
@@ -139,33 +132,25 @@ class QATModule(Module):
return None

def get_weight_dtype(self):
r"""
Get weight's quantization dtype as the method from ``qconfig``.
"""
r"""Get weight's quantization dtype as the method from ``qconfig``."""
return self._get_method_result(
"get_quantized_dtype", self.weight_fake_quant, self.weight_observer
)

def get_activation_dtype(self):
r"""
Get activation's quantization dtype as the method from ``qconfig``.
"""
r"""Get activation's quantization dtype as the method from ``qconfig``."""
return self._get_method_result(
"get_quantized_dtype", self.act_fake_quant, self.act_observer
)

def get_weight_qparams(self):
r"""
Get weight's quantization parameters.
"""
r"""Get weight's quantization parameters."""
return self._get_method_result(
"get_qparams", self.weight_fake_quant, self.weight_observer
)

def get_activation_qparams(self):
r"""
Get activation's quantization parameters.
"""
r"""Get activation's quantization parameters."""
return self._get_method_result(
"get_qparams", self.act_fake_quant, self.act_observer
)
@@ -173,7 +158,6 @@ class QATModule(Module):
@classmethod
@abstractmethod
def from_float_module(cls, float_module: Module):
r"""
Return a :class:`~.QATModule` instance converted from
r"""Return a :class:`~.QATModule` instance converted from
a float :class:`~.Module` instance.
"""

+ 2
- 4
imperative/python/megengine/module/qat/quant_dequant.py View File

@@ -10,8 +10,7 @@ from .module import QATModule


class QuantStub(Float.QuantStub, QATModule):
r"""
A helper :class:`~.QATModule` simply return input, but will quantize
r"""A helper :class:`~.QATModule` simply return input, but will quantize
input after converted to :class:`~.QuantizedModule`.
"""

@@ -30,8 +29,7 @@ class QuantStub(Float.QuantStub, QATModule):


class DequantStub(Float.DequantStub, QATModule):
r"""
A helper :class:`~.QATModule` simply return input, but will de-quantize
r"""A helper :class:`~.QATModule` simply return input, but will de-quantize
input after converted to :class:`~.QuantizedModule`.
"""



+ 2
- 4
imperative/python/megengine/module/quant_dequant.py View File

@@ -9,8 +9,7 @@ from .module import Module


class QuantStub(Module):
r"""
A helper :class:`~.Module` simply returning input. Could be replaced with :class:`~.QATModule`
r"""A helper :class:`~.Module` simply returning input. Could be replaced with :class:`~.QATModule`
version :class:`~.qat.QuantStub` using :func:`~.quantize.quantize_qat`.
"""

@@ -19,8 +18,7 @@ class QuantStub(Module):


class DequantStub(Module):
r"""
A helper :class:`~.Module` simply returning input. Could be replaced with :class:`~.QATModule`
r"""A helper :class:`~.Module` simply returning input. Could be replaced with :class:`~.QATModule`
version :class:`~.qat.DequantStub` using :func:`~.quantize.quantize_qat`.
"""



+ 1
- 3
imperative/python/megengine/module/quantized/concat.py View File

@@ -14,9 +14,7 @@ from .module import QuantizedModule


class Concat(QuantizedModule):
r"""
A :class:`~.QuantizedModule` to do quantized :func:`~.concat`, used for inference only.
"""
r"""A :class:`~.QuantizedModule` to do quantized :func:`~.concat`, used for inference only."""

def __init__(self, dtype=None, **kwargs):
super().__init__(**kwargs)


+ 3
- 2
imperative/python/megengine/module/quantized/conv.py View File

@@ -75,7 +75,7 @@ class Conv2d(Float.Conv2d, QuantizedModule):
@classmethod
def from_qat_module(cls, qat_module: QAT.Conv2d):
r"""
return a :class:`~.QuantizedModule` instance converted from a
Return a :class:`~.QuantizedModule` instance converted from a
:class:`~.QATModule` instance.
"""
output_dtype = qat_module.get_activation_dtype()
@@ -119,7 +119,8 @@ class ConvTranspose2d(Float.ConvTranspose2d, QuantizedModule):

The parameters are the same as :class:`~.module.ConvTranspose2d` except ``dtype``.

:param dtype: data type of the output, should be qint8.
Args:
dtype: data type of the output, should be qint8.
"""

def __init__(


+ 1
- 4
imperative/python/megengine/module/quantized/conv_bn.py View File

@@ -11,10 +11,7 @@ from .conv import Conv2d


class _ConvBnActivation2d(Conv2d):
r"""
Applies a 2D convolution over a quantized input tensor, used for inference only.

The parameter is same with :class: `~.module.Conv2d`.
r"""Applies a 2D convolution over a quantized input tensor, used for inference only.
"""

@classmethod


+ 3
- 4
imperative/python/megengine/module/quantized/module.py View File

@@ -12,8 +12,7 @@ from ..qat import QATModule


class QuantizedModule(Module):
r"""
Base class of quantized :class:`~.Module`,
r"""Base class of quantized :class:`~.Module`,
which should be converted from :class:`~.QATModule` and does not support training.
"""

@@ -29,6 +28,6 @@ class QuantizedModule(Module):
@abstractmethod
def from_qat_module(cls, qat_module: QATModule):
r"""
Return a :class:`~.QuantizedModule` instance converted from a
:class:`~.QATModule` instance.
"""

+ 2
- 12
imperative/python/megengine/module/quantized/quant_dequant.py View File

@@ -10,8 +10,7 @@ from .module import QuantizedModule


class QuantStub(QuantizedModule):
r"""
Quantized version of :class:`~.qat.QuantStub`,
r"""Quantized version of :class:`~.qat.QuantStub`,
which will convert the input to a quantized dtype.
"""

@@ -24,16 +23,11 @@ class QuantStub(QuantizedModule):

@classmethod
def from_qat_module(cls, qat_module: QAT.QuantStub):
r"""
Return a :class:`~.QuantizedModule` instance converted from a
:class:`~.QATModule` instance.
"""
return cls(qat_module.get_activation_dtype(), name=qat_module.name)


class DequantStub(QuantizedModule):
r"""
Quantized version of :class:`~.qat.DequantStub`,
r"""Quantized version of :class:`~.qat.DequantStub`,
which will restore the quantized input to float32 dtype.
"""

@@ -42,8 +36,4 @@ class DequantStub(QuantizedModule):

@classmethod
def from_qat_module(cls, qat_module: QAT.DequantStub):
r"""
Return a :class:`~.QuantizedModule` instance converted from a
:class:`~.QATModule` instance.
"""
return cls(name=qat_module.name)

+ 24
- 27
imperative/python/megengine/module/sequential.py View File

@@ -12,38 +12,35 @@ from .module import Module


class Sequential(Module):
r"""
A sequential container.
r"""A sequential container.
Modules will be added to it in the order they are passed in the constructor.
Alternatively, an ordered dict of modules can also be passed in.

To make it easier to understand, here is a small example:

Examples:

.. testcode::
import numpy as np
import megengine as mge
import megengine.module as M
import megengine.functional as F
from collections import OrderedDict
batch_size = 64
data = mge.tensor(np.zeros((batch_size, 28 * 28)), dtype=np.float32)
label = mge.tensor(np.zeros(batch_size,), dtype=np.int32)
net0 = M.Sequential(
M.Linear(28 * 28, 320),
M.Linear(320, 10)
)
pred0 = net0(data)
modules = OrderedDict()
modules["fc0"] = M.Linear(28 * 28, 320)
modules["fc1"] = M.Linear(320, 10)
net1 = M.Sequential(modules)
pred1 = net1(data)
"""

def __init__(self, *args, **kwargs):


+ 40
- 66
imperative/python/megengine/module/sliding_window.py View File

@@ -13,8 +13,7 @@ from .module import Module


class SlidingWindow(Module):
r"""
Apply a sliding window to input tensor and copy content in the window to
r"""Apply a sliding window to input tensor and copy content in the window to
corresponding output location. Assume input shape is :math:`(N, C, IH, IW)`,
then output shape would be :math:`(N, C, OH, OW, window_h, window_w)` where
:math:`(OH, OW)` would be computed from padding, stride, window and
@@ -26,46 +25,45 @@ class SlidingWindow(Module):
\text{where } & ih=-pad_h+oh \times stride_h + (wh-1) \times (dilation_h-1) \\
& iw=-pad_w+ow \times stride_w + (ww-1) \times (dilation_w-1)

:param kernel_size: the size of the window to take a max over.
:param padding: implicit zero padding to be added on both sides. Default: 0
:param stride: the stride of the window. Default: 1
:param dilation: the dilation of the window. Default: 1
Args:
kernel_size: the size of the window to take a max over.
padding: implicit zero padding to be added on both sides. Default: 0
stride: the stride of the window. Default: 1
dilation: the dilation of the window. Default: 1

Example:

.. testcode::

from megengine import tensor
import megengine.module as M
import numpy as np

inp = tensor(np.arange(30).reshape(1,1,5,6))
op = M.SlidingWindow(kernel_size=3, padding=1, stride=2, dilation=2)
out = op(inp)
print(out.numpy())

Outputs:

.. testoutput::

[[[[[[ 0 0 0]
[ 0 7 9]
[ 0 19 21]]

[[ 0 0 0]
[ 7 9 11]
[19 21 23]]]

[[[ 0 7 9]
[ 0 19 21]
[ 0 0 0]]

[[ 7 9 11]
[19 21 23]
[ 0 0 0]]]]]]
"""

def __init__(
@@ -89,21 +87,20 @@ class SlidingWindow(Module):


class SlidingWindowTranspose(Module):
r"""
Opposite opration of SlidingWindow, sum over the sliding windows on the
corresponding input location. Given an input of the size
:math:`(N, C, IH, IW, window_h, window_w)` and :attr:`output_size`, the
r"""Opposite opration of SlidingWindow, sum over the sliding windows on the
corresponding input location. Given an input of the size
:math:`(N, C, IH, IW, window_h, window_w)` and :attr:`output_size`, the
output shape would be :math:`(N, C, output\_size_{h}, output\_size_{w})` and the
arguments must satisfy

.. math::
\text{IH} = \lfloor \frac{\text{output_size}_{h} + 2 * \text{padding}_{h} -
\text{dilation}_{h} * (\text{kernel_size}_{h} - 1) - 1}{\text{stride}_{h}} + 1 \rfloor

.. math::
\text{IW} = \lfloor \frac{\text{output_size}_{w} + 2 * \text{padding}_{w} -
\text{dilation}_{w} * (\text{kernel_size}_{w} - 1) - 1}{\text{stride}_{w}} + 1 \rfloor
For each output location, we have:

.. math::
@@ -113,36 +110,13 @@ class SlidingWindowTranspose(Module):
\text{location}(n, c, ih, iw, wh, ww) &= (n, c, oh+wh, ow+ww) \\
\text{where } & oh=-pad_h+ih \times stride_h + (wh-1) \times (dilation_h-1) \\
& ow=-pad_w+iw \times stride_w + (ww-1) \times (dilation_w-1)
:param output_size: the size of the output tensor.
:param kernel_size: the size of the window to take a max over.
:param padding: implicit zero padding to be added on both sides. Default: 0
:param stride: the stride of the window. Default: 1
:param dilation: the dilation of the window. Default: 1

Example:

.. testcode::

from megengine import tensor
import megengine.module as M
import numpy as np

inp = tensor(np.arange(20).reshape(1,1,4,5))
unfold = M.SlidingWindow(kernel_size=3, padding=0, stride=1, dilation=1)
fold = M.SlidingWindowTranspose((4,5), kernel_size=3, padding=0, stride=1, dilation=1)
out = fold(unfold(inp))
print(out.numpy())

Outputs:

.. testoutput::
[[[[ 0 2 6 6 4]
[10 24 42 32 18]
[20 44 72 52 28]
[15 32 51 36 19]]]]

Args:
output_size: the size of the output tensor.
kernel_size: the size of the window to take a max over.
padding: implicit zero padding to be added on both sides. Default: 0
stride: the stride of the window. Default: 1
dilation: the dilation of the window. Default: 1
"""

def __init__(


+ 12
- 12
imperative/python/megengine/optimizer/adadelta.py View File

@@ -15,20 +15,20 @@ from .optimizer import Optimizer


class Adadelta(Optimizer):
r"""
Implements Adadelta algorithm.

r"""Implements Adadelta algorithm.
It has been proposed in `"ADADELTA: An Adaptive Learning Rate Method" <https://arxiv.org/abs/1212.5701>`_.

:param params: iterable of parameters to optimize or dicts defining
parameter groups.
:param lr: coefficient that scales delta before it is applied
to the parameters. Default: 1.0
:param rho: coefficient used for computing a running average
of squared gradients. Default: 0.9
:param eps: term added to the denominator to improve
numerical stability. Default: 1e-6
:param weight_decay: weight decay (L2 penalty). Default: 0
Args:
params: iterable of parameters to optimize or dicts defining
parameter groups.
lr: coefficient that scales delta before it is applied
to the parameters. Default: 1.0
rho: coefficient used for computing a running average
of squared gradients. Default: 0.9
eps: term added to the denominator to improve
numerical stability. Default: 1e-6
weight_decay: weight decay (L2 penalty). Default: 0
"""

def __init__(


+ 11
- 11
imperative/python/megengine/optimizer/adagrad.py View File

@@ -15,20 +15,20 @@ from .optimizer import Optimizer


class Adagrad(Optimizer):
r"""
Implements Adagrad algorithm.

r"""Implements Adagrad algorithm.
It has been proposed in `"Adaptive Subgradient Methods for Online Learning
and Stochastic Optimization" <http://jmlr.org/papers/v12/duchi11a.html>`_.

:param params: iterable of parameters to optimize or dicts defining
parameter groups.
:param lr: coefficient that scales delta before it is applied
to the parameters. Default: 1e-2
:param lr_decay: learning rate decay. Default: 0
:param eps: term added to the denominator to improve
numerical stability. Default: 1e-10
:param weight_decay: weight decay (L2 penalty). Default: 0
Args:
params: iterable of parameters to optimize or dicts defining
parameter groups.
lr: coefficient that scales delta before it is applied
to the parameters. Default: 1e-2
lr_decay: learning rate decay. Default: 0
eps: term added to the denominator to improve
numerical stability. Default: 1e-10
weight_decay: weight decay (L2 penalty). Default: 0
"""

def __init__(


+ 8
- 9
imperative/python/megengine/optimizer/adam.py View File

@@ -15,17 +15,16 @@ from .optimizer import Optimizer


class Adam(Optimizer):
r"""
Implements Adam algorithm proposed in `"Adam: A Method for Stochastic Optimization" <https://arxiv.org/abs/1412.6980>`_.
r"""Implements Adam algorithm proposed in `"Adam: A Method for Stochastic Optimization" <https://arxiv.org/abs/1412.6980>`_.

:param params: iterable of parameters to optimize or dicts defining
Args:
params: iterable of parameters to optimize or dicts defining
parameter groups.
:param lr: learning rate.
:param betas: coefficients used for computing running averages of gradient
and its square. Default: (0.9, 0.999)
:param eps: term added to the denominator to improve numerical stability
Default: 1e-8
:param weight_decay: weight decay (L2 penalty). Default: 0
lr: learning rate.
betas: coefficients used for computing running averages of gradient
and its square. Default: (0.9, 0.999)
eps: term added to the denominator to improve numerical stability. Default: 1e-8
weight_decay: weight decay (L2 penalty). Default: 0
"""

def __init__(


+ 8
- 9
imperative/python/megengine/optimizer/adamw.py View File

@@ -15,17 +15,16 @@ from .optimizer import Optimizer


class AdamW(Optimizer):
r"""
Implements AdamW algorithm proposed in `"Decoupled Weight Decay Regularization" <https://arxiv.org/abs/1711.05101>`_.
r"""Implements AdamW algorithm proposed in `"Decoupled Weight Decay Regularization" <https://arxiv.org/abs/1711.05101>`_.

:param params: iterable of parameters to optimize or dicts defining
Args:
params: iterable of parameters to optimize or dicts defining
parameter groups.
:param lr: learning rate.
:param betas: coefficients used for computing running averages of gradient
and its square. Default: (0.9, 0.999)
:param eps: term added to the denominator to improve numerical stability
Default: 1e-8
:param weight_decay: weight decay (L2 penalty). Default: 1e-2
lr: learning rate.
betas: coefficients used for computing running averages of gradient
and its square. Default: (0.9, 0.999)
eps: term added to the denominator to improve numerical stability. Default: 1e-8
weight_decay: weight decay (L2 penalty). Default: 1e-2
"""

def __init__(


+ 13
- 9
imperative/python/megengine/optimizer/clip_grad.py View File

@@ -23,10 +23,13 @@ def clip_grad_norm(
The norm is computed over all gradients together, as if they were
concatenated into a single vector. Gradients are modified in-place.

:param tensors: an iterable of Tensors or a single Tensor.
:param max_norm: max norm of the gradients.
:param ord: type of the used p-norm. Can be ``'inf'`` for infinity norm.
:return: total norm of the parameters (viewed as a single vector).
Args:
tensors: an iterable of Tensors or a single Tensor.
max_norm: max norm of the gradients.
ord: type of the used p-norm. Can be ``'inf'`` for infinity norm.

Returns:
total norm of the parameters (viewed as a single vector).
"""
push_scope("clip_grad_norm")
if isinstance(tensors, Tensor):
@@ -53,14 +56,15 @@ def clip_grad_value(
):
r"""Clips gradient of an iterable of parameters to a specified lower and
upper. Gradients are modified in-place.
The gradients are clipped in the range:
.. math:: \left[\text{lower}, \text{upper}\right]

:param tensors: an iterable of Tensors or a single Tensor.
:param lower: minimum allowed value of the gradients.
:param upper: maximum allowed value of the gradients.
Args:
tensors: an iterable of Tensors or a single Tensor.
lower: minimum allowed value of the gradients.
upper: maximum allowed value of the gradients.
"""
push_scope("clip_grad_value")
if isinstance(tensors, Tensor):


+ 9
- 12
imperative/python/megengine/optimizer/lr_scheduler.py View File

@@ -12,11 +12,11 @@ from .optimizer import Optimizer


class LRScheduler(metaclass=ABCMeta):
r"""
Base class for all learning rate based schedulers.
r"""Base class for all learning rate based schedulers.

:param optimizer: wrapped optimizer.
:param current_epoch: the index of current epoch. Default: -1
Args:
optimizer: wrapped optimizer.
current_epoch: the index of current epoch. Default: -1
"""

def __init__( # pylint: disable=too-many-branches
@@ -45,25 +45,22 @@ class LRScheduler(metaclass=ABCMeta):
self.step()

def state_dict(self):
r"""
Returns the state of the scheduler as a :class:`dict`.
r"""Returns the state of the scheduler as a :class:`dict`.
It contains an entry for every variable in self.__dict__ which
is not the optimizer.
"""
raise NotImplementedError

def load_state_dict(self, state_dict):
r"""
Loads the schedulers state.
r"""Loads the schedulers state.

:type state_dict: dict
:param state_dict: scheduler state.
Args:
state_dict: scheduler state.
"""
raise NotImplementedError

def get_lr(self):
r""" Compute current learning rate for the scheduler.
"""
r"""Compute current learning rate for the scheduler."""
raise NotImplementedError

def step(self, epoch=None):


+ 10
- 14
imperative/python/megengine/optimizer/multi_step_lr.py View File

@@ -14,16 +14,14 @@ from .optimizer import Optimizer


class MultiStepLR(LRScheduler):
r"""
Decays the learning rate of each parameter group by gamma once the
r"""Decays the learning rate of each parameter group by gamma once the
number of epochs reaches one of the milestones.

:param optimizer: wrapped optimizer.
:type milestones: list
:param milestones: list of epoch indices which should be increasing.
:type gamma: float
:param gamma: multiplicative factor of learning rate decay. Default: 0.1
:param current_epoch: the index of current epoch. Default: -1
Args:
optimizer: wrapped optimizer.
milestones: list of epoch indices which should be increasing.
gamma: multiplicative factor of learning rate decay. Default: 0.1
current_epoch: the index of current epoch. Default: -1
"""

def __init__(
@@ -45,8 +43,7 @@ class MultiStepLR(LRScheduler):
super().__init__(optimizer, current_epoch)

def state_dict(self):
r"""
Returns the state of the scheduler as a :class:`dict`.
r"""Returns the state of the scheduler as a :class:`dict`.
It contains an entry for every variable in self.__dict__ which
is not the optimizer.
"""
@@ -57,11 +54,10 @@ class MultiStepLR(LRScheduler):
}

def load_state_dict(self, state_dict):
r"""
Loads the schedulers state.
r"""Loads the schedulers state.

:type state_dict: dict
:param state_dict: scheduler state.
Args:
state_dict: scheduler state.
"""
tmp_dict = {}
for key in ["milestones", "gamma", "current_epoch"]:


+ 16
- 22
imperative/python/megengine/optimizer/optimizer.py View File

@@ -30,11 +30,11 @@ required = _RequiredParameter()


class Optimizer(metaclass=ABCMeta):
r"""
Base class for all optimizers.
r"""Base class for all optimizers.

:param params: specifies what Tensors should be optimized.
:param defaults: a dict of default parameters of Optimizer, like learning rate or momentum.
Args:
params: specifies what Tensors should be optimized.
defaults: a dict of default parameters of Optimizer, like learning rate or momentum.
"""

def __init__( # pylint: disable=too-many-branches
@@ -76,14 +76,13 @@ class Optimizer(metaclass=ABCMeta):
self._create_state(group)

def add_param_group(self, param_group: dict):
r"""
Add a param group to ``param_groups`` of the :class:`~megengine.optim.optimizer.Optimizer`.

r"""Add a param group to ``param_groups`` of the :class:`~megengine.optim.optimizer.Optimizer`.
This can be useful when fine tuning a pre-trained network as frozen layers can be made
trainable and added to the :class:`~megengine.optim.optimizer.Optimizer` as training progresses.

:param param_group: specifies what tensors should be optimized along with group.
Args:
param_group: specifies what tensors should be optimized along with group.
"""
assert isinstance(param_group, dict), "param group must be a dict"

@@ -143,10 +142,7 @@ class Optimizer(metaclass=ABCMeta):
return params

def step(self):
r"""
Performs a single optimization step.

"""
r"""Performs a single optimization step."""
# set the global state `_enable_convert_inputs` to `False` to disable
# the `convert_inputs` for param updates
set_option("record_computing_path", 0)
@@ -176,9 +172,7 @@ class Optimizer(metaclass=ABCMeta):
param.grad.reset_zero()

def clear_grad(self):
r"""
Set the grad attribute to None for all parameters.
"""
r"""Set the grad attribute to None for all parameters."""
for param_group in self.param_groups:
push_scope("clear_grad")
for param in param_group["params"]:
@@ -186,10 +180,10 @@ class Optimizer(metaclass=ABCMeta):
pop_scope("clear_grad")

def state_dict(self, keep_var=False) -> Dict:
r"""
Export the optimizer state.
r"""Export the optimizer state.

:return: optimizer state. Can be loaded by :meth:`load_state_dict`.
Returns:
optimizer state. Can be loaded by :meth:`load_state_dict`.
"""
param_groups = []
state = dict()
@@ -217,10 +211,10 @@ class Optimizer(metaclass=ABCMeta):
return {"param_groups": param_groups, "state": state}

def load_state_dict(self, state: dict):
r"""
Loads the optimizer state.
r"""Loads the optimizer state.

:param state: optimizer state. Should be an object returned
Args:
state: optimizer state. Should be an object returned
from a call to :meth:`state_dict`.
"""
if len(self.param_groups) != len(state["param_groups"]):


+ 7
- 7
imperative/python/megengine/optimizer/sgd.py View File

@@ -15,17 +15,17 @@ from .optimizer import Optimizer


class SGD(Optimizer):
r"""
Implements stochastic gradient descent.

r"""Implements stochastic gradient descent.
Nesterov momentum is based on the formula from
`"On the importance of initialization and momentum in deep learning" <http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf>`_ .

:param params: iterable of parameters to optimize or dicts defining
Args:
params: iterable of parameters to optimize or dicts defining
parameter groups.
:param lr: learning rate.
:param momentum: momentum factor. Default: 0.0
:param weight_decay: weight decay (L2 penalty). Default: 0.0
lr: learning rate.
momentum: momentum factor. Default: 0.0
weight_decay: weight decay (L2 penalty). Default: 0.0
"""

def __init__(


+ 16
- 16
imperative/python/megengine/quantization/fake_quant.py View File

@@ -72,13 +72,13 @@ class _FakeQuantize(Module):


class TQT(_FakeQuantize, QParamsModuleMixin):
r"""
TQT: https://arxiv.org/abs/1903.08066 Trained Quantization Thresholds
r"""TQT: https://arxiv.org/abs/1903.08066 Trained Quantization Thresholds
for Accurate and Efficient Fixed-Point Inference of Deep Neural Networks.

:param dtype: a string or :class:`~.QuantDtypeMeta` indicating the target
quantization dtype of input.
:param enable: whether do ``normal_forward`` or ``fake_quant_forward``.
Args:
dtype: a string or :class:`~.QuantDtypeMeta` indicating the target
quantization dtype of input.
enable: whether to do ``normal_forward`` or ``fake_quant_forward``.
"""

def __init__(
@@ -104,12 +104,12 @@ class TQT(_FakeQuantize, QParamsModuleMixin):


class FakeQuantize(_FakeQuantize):
r"""
A module to do quant and dequant according to observer's scale and zero_point.
r"""A module to do quant and dequant according to observer's scale and zero_point.

:param dtype: a string or :class:`~.QuantDtypeMeta` indicating the target
quantization dtype of input.
:param enable: whether do ``normal_forward`` or ``fake_quant_forward``.
Args:
dtype: a string or :class:`~.QuantDtypeMeta` indicating the target
quantization dtype of input.
enable: whether to do ``normal_forward`` or ``fake_quant_forward``.
"""

def fake_quant_forward(self, inp, qparams: QParams = None):
@@ -122,14 +122,14 @@ class FakeQuantize(_FakeQuantize):


class LSQ(_FakeQuantize, QParamsModuleMixin):
r"""
LSQ: https://arxiv.org/pdf/1902.08153.pdf Estimating and scaling the
r"""LSQ: https://arxiv.org/pdf/1902.08153.pdf Estimating and scaling the
task loss gradient at each weight and activation layer's quantizer step size

:param dtype: a string or :class:`~.QuantDtypeMeta` indicating the target
quantization dtype of input.
:param enable: whether do ``normal_forward`` or ``fake_quant_forward``.
:param eps:a small value to avoid division by zero. Default: 1e-5
Args:
dtype: a string or :class:`~.QuantDtypeMeta` indicating the target
quantization dtype of input.
enable: whether to do ``normal_forward`` or ``fake_quant_forward``.
eps: a small value to avoid division by zero. Default: 1e-5
"""

def __init__(


+ 41
- 46
imperative/python/megengine/quantization/observer.py View File

@@ -25,11 +25,11 @@ logger = get_logger(__name__)


class Observer(Module, QParamsModuleMixin):
r"""
A base class for Observer Module. Used to record input tensor's statistics for
r"""A base class for Observer Module. Used to record input tensor's statistics for
quantization.

:param dtype: a string indicating which dtype to collect scale and zero_point of.
Args:
dtype: a string indicating which dtype to collect scale and zero_point of.
"""

def __init__(self, dtype: Union[str, QuantDtypeMeta], **kwargs):
@@ -73,12 +73,12 @@ class Observer(Module, QParamsModuleMixin):


class MinMaxObserver(Observer):
r"""
A Observer Module records input tensor's running min and max values to calc scale.
r"""A Observer Module records input tensor's running min and max values to calc scale.

:param mode: set quantization mode.
:param eps: a initial maximum value to avoid division by zero problem.
:param dtype: a string indicating which dtype to collect scale and zero_point of.
Args:
mode: set quantization mode.
eps: an initial maximum value to avoid the division by zero problem.
dtype: a string indicating which dtype to collect scale and zero_point of.
"""

def __init__(
@@ -128,12 +128,12 @@ class MinMaxObserver(Observer):


class SyncMinMaxObserver(MinMaxObserver):
r"""
A distributed version of :class:`~.MinMaxObserver`.
r"""A distributed version of :class:`~.MinMaxObserver`.

:param mode: set quantization mode.
:param eps: a initial maximum value to avoid division by zero problem.
:param dtype: a string indicating which dtype to collect scale and zero_point of.
Args:
mode: set quantization mode.
eps: an initial maximum value to avoid the division by zero problem.
dtype: a string indicating which dtype to collect scale and zero_point of.
"""

def forward(self, x_orig):
@@ -151,13 +151,13 @@ class SyncMinMaxObserver(MinMaxObserver):


class ExponentialMovingAverageObserver(MinMaxObserver):
r"""
A :class:`~.MinMaxObserver` with momentum support for min/max updating.
r"""A :class:`~.MinMaxObserver` with momentum support for min/max updating.

:param momentum: momentum ratio for min/max updating.
:param mode: set quantization mode.
:param eps: a initial maximum value to avoid division by zero problem.
:param dtype: a string indicating which dtype to collect scale and zero_point of.
Args:
momentum: momentum ratio for min/max updating.
mode: set quantization mode.
eps: an initial maximum value to avoid the division by zero problem.
dtype: a string indicating which dtype to collect scale and zero_point of.
"""

def __init__(
@@ -196,13 +196,13 @@ class ExponentialMovingAverageObserver(MinMaxObserver):


class SyncExponentialMovingAverageObserver(ExponentialMovingAverageObserver):
r"""
A distributed version of :class:`~.ExponentialMovingAverageObserver`.
r"""A distributed version of :class:`~.ExponentialMovingAverageObserver`.

:param momentum: momentum ratio for min/max updating.
:param mode: set quantization mode.
:param eps: a initial maximum value to avoid division by zero problem.
:param dtype: a string indicating which dtype to collect scale and zero_point of.
Args:
momentum: momentum ratio for min/max updating.
mode: set quantization mode.
eps: an initial maximum value to avoid the division by zero problem.
dtype: a string indicating which dtype to collect scale and zero_point of.
"""

def forward(self, x_orig):
@@ -227,15 +227,15 @@ class SyncExponentialMovingAverageObserver(ExponentialMovingAverageObserver):


class HistogramObserver(MinMaxObserver):
r"""
A :class:`~.MinMaxObserver` using running histogram of tensor values
r"""A :class:`~.MinMaxObserver` using running histogram of tensor values
for min/max updating. Usually used for calibration quantization.

:param bins: number of bins to use for the histogram.
:param upsample_rate: which ratio to interpolate histograms in.
:param mode: set quantization mode.
:param eps: a initial maximum value to avoid division by zero problem.
:param dtype: a string indicating which dtype to collect scale and zero_point of.
Args:
bins: number of bins to use for the histogram.
upsample_rate: the ratio used when interpolating the histogram.
mode: set quantization mode.
eps: an initial maximum value to avoid the division by zero problem.
dtype: a string indicating which dtype to collect scale and zero_point of.
"""

def __init__(
@@ -256,8 +256,7 @@ class HistogramObserver(MinMaxObserver):
self.histogram = Tensor([-1] + [0.0] * (bins - 1), dtype="float32")

def _non_linear_param_search(self):
r"""
Non-linear parameter search.
r"""Non-linear parameter search.
An approximation for L2 error minimization for selecting min/max.
By selecting new min/max, we filter out outliers in input distribution.
"""
@@ -269,8 +268,7 @@ class HistogramObserver(MinMaxObserver):
bin_width = (np_max_val - np_min_val) / self.bins

def _get_norm(delta_begin, delta_end, density, norm_type):
r"""
Compute the norm of the values uniformaly distributed between
r"""Compute the norm of the values uniformaly distributed between
delta_begin and delta_end.
norm = density * (integral_{begin, end} x^2)
= density * (end^3 - begin^3) / 3
@@ -285,8 +283,7 @@ class HistogramObserver(MinMaxObserver):
return density * norm

def _compute_quantization_error(next_start_bin, next_end_bin, norm_type):
r"""
Compute the quantization error if we use start_bin to end_bin as the
r"""Compute the quantization error if we use start_bin to end_bin as the
min and max to do the quantization.
"""

@@ -488,9 +485,7 @@ class HistogramObserver(MinMaxObserver):


class PassiveObserver(Observer):
r"""
An Observer that supports setting :attr:`scale` directly.
"""
r"""An Observer that supports setting :attr:`scale` directly."""

def __init__(self, dtype: Union[str, QuantDtypeMeta], **kwargs):
super().__init__(dtype, **kwargs)
@@ -510,8 +505,10 @@ class PassiveObserver(Observer):
return self.qparams

def set_qparams(self, qparams: QParams):
"""
:param qparams: used to set initial scale.
r"""set the ``qparams``.

Args:
qparams: used to set initial scale.
"""
self.qparams = deepcopy(qparams)
if qparams.scale is None:
@@ -527,7 +524,5 @@ class PassiveObserver(Observer):
self.orig_scale = qparams.scale.numpy()

def forward(self, x):
r"""
Just return input because :attr:`qparams` is set by :func:`~.apply_easy_quant`.
"""
r"""Just return input because :attr:`qparams` is set by :func:`~.apply_easy_quant`."""
return x
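
A minimal observer sketch (illustrative only, not part of this commit); it assumes :class:`~.MinMaxObserver` can be constructed with only ``dtype`` (default ``mode`` and ``eps``) and that :meth:`get_qparams` comes from ``QParamsModuleMixin`` as described above.

.. code-block:: python

    from megengine import tensor
    from megengine.quantization.observer import MinMaxObserver

    # Observe a tensor and derive quantization parameters from its value range.
    obs = MinMaxObserver(dtype="qint8")
    x = tensor([[-1.0, 0.5], [2.0, -3.0]])
    obs(x)                       # the forward pass records running min/max
    qparams = obs.get_qparams()  # QParams holding the derived scale (and zero_point)
    print(qparams.scale)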

+ 21
- 21
imperative/python/megengine/quantization/qconfig.py View File

@@ -27,33 +27,33 @@ class QConfig(
["weight_observer", "act_observer", "weight_fake_quant", "act_fake_quant"],
)
):
r"""
A config class indicating how to do quantize toward :class:`~.QATModule` 's
r"""A config class indicating how to do quantize toward :class:`~.QATModule` 's
``activation`` and ``weight``. See :meth:`~.QATModule.set_qconfig` for detail usage.

:param weight_observer: interface to instantiate an :class:`~.Observer` indicating
how to collect scales and zero_point of wegiht.
:param act_observer: similar to ``weight_observer`` but toward activation.
:param weight_fake_quant: interface to instantiate a :class:`~.FakeQuantize` indicating
how to do fake_quant calculation.
:param act_observer: similar to ``weight_fake_quant`` but toward activation.

Args:
weight_observer: interface to instantiate an :class:`~.Observer` indicating
how to collect scales and zero_point of weight.
act_observer: similar to ``weight_observer`` but toward activation.
weight_fake_quant: interface to instantiate a :class:`~.FakeQuantize` indicating
how to do fake_quant calculation.
act_fake_quant: similar to ``weight_fake_quant`` but toward activation.
Examples:
.. code-block::
# Default EMA QConfig for QAT.
ema_fakequant_qconfig = QConfig(
weight_observer=partial(MinMaxObserver, dtype="qint8_narrow"),
act_observer=partial(ExponentialMovingAverageObserver, dtype="qint8"),
weight_fake_quant=partial(FakeQuantize, dtype="qint8_narrow"),
act_fake_quant=partial(FakeQuantize, dtype="qint8"),
)
Each parameter is a ``class`` rather than an instance, and we recommend using ``functools.partial``
to add initialization parameters of the ``class``, so that there is no need to provide parameters in
:meth:`~.QATModule.set_qconfig`.
Usually we choose a narrow-version dtype (like ``qint8_narrow``) for weight-related
parameters and the normal version for activation-related ones. For the result of
multiplication and addition as ``a * b + c * d``, if four variables are all -128 of


+ 44
- 45
imperative/python/megengine/quantization/quantize.py View File

@@ -57,14 +57,14 @@ qat_modules = tuple(_qat2quantized_dict.keys())


def quantize(module: Module, inplace: bool = True, mapping: dict = None):
r"""
Recursively convert :class:`~.QATModule` to :class:`~.QuantizedModule`
r"""Recursively convert :class:`~.QATModule` to :class:`~.QuantizedModule`
through :meth:`~.Module.apply`.

:param module: root module to do convert recursively.
:param inplace: whether to convert submodules in-place.
:param mapping: a dict indicating how to convert custom modules from QATModule to
QuantizedModule. Will be combined with internal default convert mapping dict.
Args:
module: root module to do convert recursively.
inplace: whether to convert submodules in-place.
mapping: a dict indicating how to convert custom modules from QATModule to
QuantizedModule. Will be combined with internal default convert mapping dict.
"""

if not inplace:
@@ -94,16 +94,16 @@ def quantize_qat(
qconfig: QConfig = ema_fakequant_qconfig,
mapping: dict = None,
):
r"""
Recursively convert float :class:`~.Module` to :class:`~.QATModule`
r"""Recursively convert float :class:`~.Module` to :class:`~.QATModule`
through :meth:`~.Module.apply` and set qconfig relatively.

:param module: root module to do convert recursively.
:param inplace: whether to convert submodules in-place.
:param qconfig: an instance of :class:`~.QConfig` to be set as submodules' qconfig.
default is ``ema_fakequant_qconfig``.
:param mapping: a dict indicating how to convert custom modules from Module to QATModule.
Will be combined with internal default convert mapping dict.
Args:
module: root module to do convert recursively.
inplace: whether to convert submodules in-place.
qconfig: an instance of :class:`~.QConfig` to be set as submodules' qconfig.
default is ``ema_fakequant_qconfig``.
mapping: a dict indicating how to convert custom modules from Module to QATModule.
Will be combined with internal default convert mapping dict.
"""

if not inplace:
@@ -133,12 +133,12 @@ def quantize_qat(


def reset_qconfig(module: Module, qconfig: QConfig, inplace: bool = True):
r"""
Reset :class:`~._FakeQuantize` and :class:`~.Observer` according to ``qconfig``
r"""Reset :class:`~._FakeQuantize` and :class:`~.Observer` according to ``qconfig``

:param module: root module to reset recursively.
:param qconfig: an instance of :class:`~.QConfig` to be set as submodules' qconfig.
:param inplace: whether to reset submodules in-place.
Args:
module: root module to reset recursively.
qconfig: an instance of :class:`~.QConfig` to be set as submodules' qconfig.
inplace: whether to reset submodules in-place.
"""

if not inplace:
@@ -175,19 +175,17 @@ def _propagate(module: Module, func_str: str, *args, **kargs):


def propagate_qconfig(module: QATModule, qconfig: QConfig):
r"""
Recursively set ``module``'s qconfig through :meth:`~.Module.apply`.
r"""Recursively set ``module``'s qconfig through :meth:`~.Module.apply`.

:param module: root module to traverse recursively.
:param qconfig: a instance of :class:`~.QConfig` to be set as submodules' qconfig.
Args:
module: root module to traverse recursively.
qconfig: an instance of :class:`~.QConfig` to be set as submodules' qconfig.
"""
_propagate(module, "set_qconfig", qconfig)


def hook_qat_module(module: Module, func: Callable):
r"""
Add hooks for all :class:`~.QATModule` submodule
"""
r"""Add hooks for all :class:`~.QATModule` submodule"""

def is_qat(mod: Module):
return isinstance(mod, QATModule)
@@ -202,15 +200,16 @@ def hook_qat_module(module: Module, func: Callable):
def apply_easy_quant(
module: Module, data: Tensor, start: float = 0.8, stop: float = 1.2, num: int = 40
):
r"""
Implementation of ``EasyQuant``: https://arxiv.org/pdf/2006.16669.
r"""Implementation of ``EasyQuant``: https://arxiv.org/pdf/2006.16669.
Searches for optimal scales.

:param module: root module.
:param data: input tensor used to search optimal scale.
:param start: lower bound of the search interval.
:param stop: upper bound of the search interval.
:param num: number of samples to search.
Args:
module: root module.
data: input tensor used to search optimal scale.
start: lower bound of the search interval.
stop: upper bound of the search interval.
num: number of samples to search.
"""

batch_size = data.shape[0]
@@ -267,40 +266,40 @@ def apply_easy_quant(


def disable_fake_quant(module: Module):
r"""
Recursively disable ``module`` fake quantization in QATModule through :meth:`~.Module.apply`
r"""Recursively disable ``module`` fake quantization in QATModule through :meth:`~.Module.apply`

:param module: root module to do disable fake quantization recursively.
Args:
module: root module to do disable fake quantization recursively.
"""

_propagate(module, "set_fake_quant", False)


def disable_observer(module: Module):
r"""
Recursively disable ``module`` observer in QATModule through :meth:`~.Module.apply`
r"""Recursively disable ``module`` observer in QATModule through :meth:`~.Module.apply`

:param module: root module to do disable observer recursively.
Args:
module: root module to do disable observer recursively.
"""

_propagate(module, "set_observer", False)


def enable_fake_quant(module: Module):
r"""
Recursively enable ``module`` fake quantization in QATModule through :meth:`~.Module.apply`
r"""Recursively enable ``module`` fake quantization in QATModule through :meth:`~.Module.apply`

:param module: root module to do enable fake quantization recursively.
Args:
module: root module to do enable fake quantization recursively.
"""

_propagate(module, "set_fake_quant", True)


def enable_observer(module: Module):
r"""
Recursively enable ``module`` observer in QATModule through :meth:`~.Module.apply`
r"""Recursively enable ``module`` observer in QATModule through :meth:`~.Module.apply`

:param module: root module to do enable observer recursively.
Args:
module: root module to do enable observer recursively.
"""

_propagate(module, "set_observer", True)

+ 22
- 23
imperative/python/megengine/quantization/utils.py View File

@@ -25,8 +25,7 @@ from ..tensor import Tensor


class Round(Function):
"""
The functional round have no grad and can not use for quantization-aware-training.
r"""The functional round have no grad and can not use for quantization-aware-training.
We use Function and STE(Straight-Through Estimator) to implement backward propagation.
"""

@@ -68,17 +67,14 @@ def register_method_to_class(cls):


class QuantMode(Enum):
"""
Quantization mode enumerate class.
"""
r"""Quantization mode enumerate class."""

SYMMERTIC = 1
ASYMMERTIC = 2


class QParams:
"""
To standardize FakeQuant, Observer and Tensor's qparams format. If custom
r"""To standardize FakeQuant, Observer and Tensor's qparams format. If custom
qparams is needed, inherit this class and add custom ``__slots__``.
"""

@@ -116,8 +112,7 @@ class QParams:


class LSQParams:
"""
To standardize LSQ's qparams format. If custom
r"""To standardize LSQ's qparams format. If custom
qparams is needed, inherit this class and add custom ``__slots__``.
"""

@@ -183,8 +178,14 @@ def create_qparams(
scale: Tensor = None,
zero_point: Tensor = None,
):
"""
Return :class:`~.QParams` according to the mode.
r"""

Args:
mode: QuantMode:
dtype_meta: Union[str:
QuantDtypeMeta]:
scale: Tensor:
zero_point: Tensor:
"""
if isinstance(dtype_meta, str):
dtype_meta = _builtin_quant_dtypes[dtype_meta]
@@ -197,12 +198,11 @@ def create_qparams(


def fake_quant_tensor(inp: Tensor, qparams: QParams) -> Tensor:
"""
Apply fake quantization to the inp tensor.

:param inp: the input tensor which need to be faked.
:param qparams: to get mode, qmin, qmax, scale and zero_point from.
"""Apply fake quantization to the inp tensor.

Args:
inp: the input tensor which needs to be fake-quantized.
qparams: to get mode, qmin, qmax, scale and zero_point from.
"""
scale = qparams.scale
if qparams.mode == QuantMode.ASYMMERTIC:
@@ -217,17 +217,16 @@ def fake_quant_tensor(inp: Tensor, qparams: QParams) -> Tensor:


def fake_quant_bias(bias: Tensor, inp: Tensor, w_qat: Tensor) -> Tensor:
"""
Apply fake quantization to bias, with the special scale from input tensor
"""Apply fake quantization to bias, with the special scale from input tensor
and weight tensor, the quantized type set to qint32 also.

:param bias: the bias tensor which need to be faked.
:param inp: the input tensor which contain the quantization parameters.
:param w_qat: the weight tensor which contain the quantization parameters.
Args:
bias: the bias tensor which needs to be fake-quantized.
inp: the input tensor which contains the quantization parameters.
w_qat: the weight tensor which contains the quantization parameters.

.. warning::
Warning:
Only works for the symmetric quantization method now.

"""
b_qat = bias
if (


+ 191
- 181
imperative/python/megengine/random/rng.py View File

@@ -220,29 +220,29 @@ def _permutation(n: int, seed: int, device: str, handle: int, dtype: str) -> Ten

class RNG:

r"""
:class:`RNG` exposes a number of methods for generating random numbers.
r""":class:`RNG` exposes a number of methods for generating random numbers.

Args:
seed: random seed used to initialize the pseudo-random number generator. Default: None
device: the device of generated tensor. Default: None

:param seed: random seed used to initialize the pseudo-random number generator.
Default: None
:param device: the device of generated tensor. Default: None

Examples:

.. testcode::
.. testcode::

import megengine.random as rand
rng = rand.RNG(seed=100)
x = rng.uniform(size=(2, 2))
print(x.numpy())
import megengine.random as rand
rng = rand.RNG(seed=100)
x = rng.uniform(size=(2, 2))
print(x.numpy())

Outputs:
Outputs:

.. testoutput::
:options: +SKIP
.. testoutput::
:options: +SKIP

[[0.84811664 0.6147553 ]
[0.59429836 0.64727545]]
[[0.84811664 0.6147553 ]
[0.59429836 0.64727545]]

"""

@@ -259,32 +259,33 @@ class RNG:
def uniform(
self, low: float = 0, high: float = 1, size: Optional[Iterable[int]] = None
):
r"""
Random variable with uniform distribution $U(0, 1)$.
r"""Random variable with uniform distribution $U(0, 1)$.

Args:
low: lower range. Default: 0
high: upper range. Default: 1
size: the size of output tensor. Default: None

:param low: lower range. Default: 0
:param high: upper range. Default: 1
:param size: the size of output tensor. Default: None
:return: the output tensor.
Returns:
the output tensor.

Examples:

.. testcode::
.. testcode::

import megengine as mge
import megengine.random as rand
import megengine as mge
import megengine.random as rand

x = rand.uniform(size=(2, 2))
print(x.numpy())
Outputs:
.. testoutput::
:options: +SKIP
x = rand.uniform(size=(2, 2))
print(x.numpy())

Outputs:

[[0.91600335 0.6680226 ]
[0.2046729 0.2769141 ]]
.. testoutput::
:options: +SKIP

[[0.91600335 0.6680226 ]
[0.2046729 0.2769141 ]]
"""
_seed = self._seed() if callable(self._seed) else self._seed
return _uniform(
@@ -299,33 +300,34 @@ class RNG:
def normal(
self, mean: float = 0, std: float = 1, size: Optional[Iterable[int]] = None
):
r"""
Random variable with Gaussian distribution :math:`N(\mu, \sigma)`.
r"""Random variable with Gaussian distribution :math:`N(\mu, \sigma)`.

:param mean: the mean or expectation of the distribution. Default: 0
:param std: the standard deviation of the distribution (variance = :math:`\sigma ^ 2`).
Default: 1
:param size: the size of output tensor. Default: None
:return: the output tensor.
Args:
mean: the mean or expectation of the distribution. Default: 0
std: the standard deviation of the distribution (variance = :math:`\sigma ^ 2`).
Default: 1
size: the size of output tensor. Default: None

Returns:
the output tensor.

Examples:

.. testcode::
.. testcode::

import megengine as mge
import megengine.random as rand
import megengine as mge
import megengine.random as rand

x = rand.normal(mean=0, std=1, size=(2, 2))
print(x.numpy())
Outputs:
.. testoutput::
:options: +SKIP
x = rand.normal(mean=0, std=1, size=(2, 2))
print(x.numpy())

[[-1.4010863 -0.9874344 ]
[ 0.56373274 0.79656655]]
Outputs:

.. testoutput::
:options: +SKIP

[[-1.4010863 -0.9874344 ]
[ 0.56373274 0.79656655]]
"""
_seed = self._seed() if callable(self._seed) else self._seed
return _normal(
@@ -343,12 +345,12 @@ class RNG:
scale: Union[Tensor, float] = 1,
size: Optional[Iterable[int]] = None,
):
r"""
Random variable with Gamma distribution :math:`\Gamma(k, \theta)`.
r"""Random variable with Gamma distribution :math:`\Gamma(k, \theta)`.

The corresponding probability density function is
.. math::

p(x)=x^{k-1} \frac{e^{-x / \theta}}{\theta^{k} \Gamma(k)}
\quad \text { for } x>0 \quad k, \theta>0,

@@ -357,52 +359,54 @@ class RNG:
.. math::
\Gamma(k)=(k-1) ! \quad \text { for } \quad k>0.

:param shape: the shape parameter (sometimes designated "k") of the distribution.
Must be non-negative.
:param scale: the scale parameter (sometimes designated "theta") of the distribution.
Must be non-negative. Default: 1
:param size: the size of output tensor. If shape and scale are scalars and given size is, e.g.,
`(m, n)`, then the output shape is `(m, n)`. If shape or scale is a Tensor and given size
is, e.g., `(m, n)`, then the output shape is `(m, n) + broadcast(shape, scale).shape`.
The broadcast rules are consistent with `numpy.broadcast`. Default: None
:return: the output tensor.
Args:
shape: the shape parameter (sometimes designated "k") of the distribution.
Must be non-negative.
scale: the scale parameter (sometimes designated "theta") of the distribution.
Must be non-negative. Default: 1
size: the size of output tensor. If shape and scale are scalars and given size is, e.g.,
`(m, n)`, then the output shape is `(m, n)`. If shape or scale is a Tensor and given size
is, e.g., `(m, n)`, then the output shape is `(m, n) + broadcast(shape, scale).shape`.
The broadcast rules are consistent with `numpy.broadcast`. Default: None

Returns:
the output tensor.

Examples:

.. testcode::

import megengine as mge
import megengine.random as rand

x = rand.gamma(shape=2, scale=1, size=(2, 2))
print(x.numpy())

shape = mge.Tensor([[ 1],
[10]], dtype="float32")
scale = mge.Tensor([1,5], dtype="float32")

x = rand.gamma(shape=shape, scale=scale)
print(x.numpy())

x = rand.gamma(shape=shape, scale=scale, size=2)
print(x.numpy())

Outputs:

.. testoutput::
:options: +SKIP

[[1.5064533 4.0689363 ]
[0.71639484 1.4551026 ]]

[[ 0.4352188 11.399335 ]
[ 9.1888 52.009277 ]]

[[[ 1.1726005 3.9654975 ]
[13.656933 36.559006 ]]
[[ 0.25848487 2.5540342 ]
[11.960409 21.031536 ]]]
"""
_seed = self._seed() if callable(self._seed) else self._seed
return _gamma(
@@ -415,155 +419,161 @@ class RNG:
beta: Union[Tensor, float],
size: Optional[Iterable[int]] = None,
):
r"""
Random variable with Beta distribution :math:`\operatorname{Beta}(\alpha, \beta)`.
r"""Random variable with Beta distribution :math:`\operatorname{Beta}(\alpha, \beta)`.

The corresponding probability density function is
.. math::
p(x)=\frac{1}{\mathrm{~B}(\alpha, \beta)} x^{\alpha-1}(1-x)^{\beta-1}
\quad \text { for } \alpha, \beta>0,

where :math:`\mathrm{~B}(\alpha, \beta)` is the beta function,

.. math::

\mathrm{~B}(\alpha, \beta)=\int_{0}^{1} t^{\alpha-1}(1-t)^{\beta-1} d t.

Args:
    alpha: the alpha parameter of the distribution. Must be non-negative.
    beta: the beta parameter of the distribution. Must be non-negative.
    size: the size of output tensor. If alpha and beta are scalars and given size is, e.g.,
        `(m, n)`, then the output shape is `(m, n)`. If alpha or beta is a Tensor and given size
        is, e.g., `(m, n)`, then the output shape is `(m, n) + broadcast(alpha, beta).shape`.
        The broadcast rules are consistent with `numpy.broadcast`. Default: None

Returns:
    the output tensor.

Examples:

    .. testcode::

        import megengine as mge
        import megengine.random as rand

        x = rand.beta(alpha=2, beta=1, size=(2, 2))
        print(x.numpy())

        alpha = mge.Tensor([[0.5],
                            [  3]], dtype="float32")
        beta = mge.Tensor([0.5,5], dtype="float32")

        x = rand.beta(alpha=alpha, beta=beta)
        print(x.numpy())

        x = rand.beta(alpha=alpha, beta=beta, size=2)
        print(x.numpy())

    Outputs:

    .. testoutput::
        :options: +SKIP

        [[0.582565   0.91763186]
         [0.86963767 0.6088103 ]]

        [[0.41503012 0.16438372]
         [0.90159506 0.47588003]]

        [[[0.55195075 0.01111084]
          [0.95298755 0.25048104]]

         [[0.11680304 0.13859665]
          [0.997879   0.43259275]]]
"""
_seed = self._seed() if callable(self._seed) else self._seed
return _beta(alpha=alpha, beta=beta, size=size, seed=_seed, handle=self._handle)
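Two properties of this sampler are easy to sanity-check: Beta samples always lie in the unit interval, and Tensor parameters follow the same size-plus-broadcast shape rule as `gamma`. A minimal sketch, not part of the diff, using the module-level `megengine.random` API shown in the docstring examples:

.. code-block:: python

    import megengine as mge
    import megengine.random as rand

    # Beta samples lie in [0, 1] regardless of the parameters.
    vals = rand.beta(alpha=2.0, beta=5.0, size=(1000,)).numpy()
    assert vals.min() >= 0.0 and vals.max() <= 1.0

    # alpha (2, 1) broadcasts against beta (2,) to (2, 2);
    # the requested size (3,) is prepended, giving (3, 2, 2).
    alpha = mge.Tensor([[0.5], [3.0]], dtype="float32")
    beta = mge.Tensor([0.5, 5.0], dtype="float32")
    y = rand.beta(alpha=alpha, beta=beta, size=(3,))
    assert y.numpy().shape == (3, 2, 2)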

def poisson(self, lam: Union[float, Tensor], size: Optional[Iterable[int]] = None):
r"""
Random variable with poisson distribution :math:`\operatorname{Poisson}(\lambda)`.
r"""Random variable with poisson distribution :math:`\operatorname{Poisson}(\lambda)`.

The corresponding probability mass function is

.. math::

    f(k ; \lambda)=\frac{\lambda^{k} e^{-\lambda}}{k !},

where k is the number of occurrences :math:`({\displaystyle k=0,1,2...})`.

Args:
    lam: the lambda parameter of the distribution. Must be non-negative.
    size: the size of output tensor. If lam is a scalar and given size is, e.g., `(m, n)`,
        then the output shape is `(m, n)`. If lam is a Tensor with shape `(k, v)` and given
        size is, e.g., `(m, n)`, then the output shape is `(m, n, k, v)`. Default: None.

Returns:
    the output tensor.

Examples:

    .. testcode::

        import megengine as mge
        import megengine.random as rand

        x = rand.poisson(lam=2., size=(1, 3))
        print(x.numpy())

        lam = mge.Tensor([[1.,1.],
                          [10,10]], dtype="float32")

        x = rand.poisson(lam=lam)
        print(x.numpy())

        x = rand.poisson(lam=lam, size=(1,3))
        print(x.numpy())

    Outputs:

    .. testoutput::
        :options: +SKIP

        [[3. 1. 3.]]

        [[ 2.  2.]
         [12. 11.]]

        [[[[ 1.  1.]
           [11.  4.]]

          [[ 0.  0.]
           [ 9. 13.]]

          [[ 0.  1.]
           [ 7. 12.]]]]
"""
_seed = self._seed() if callable(self._seed) else self._seed
return _poisson(lam=lam, size=size, seed=_seed, handle=self._handle)
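The shape rule for a Tensor `lam` differs from `gamma`/`beta`: the parameter shape `(k, v)` is appended to the requested size `(m, n)`, giving `(m, n, k, v)`. A minimal sketch, not part of the diff, again using the module-level `megengine.random` helpers from the examples:

.. code-block:: python

    import megengine as mge
    import megengine.random as rand

    # Tensor lam of shape (k, v) = (2, 2) with size (m, n) = (1, 3)
    # produces an output of shape (1, 3, 2, 2).
    lam = mge.Tensor([[1.0, 1.0], [10.0, 10.0]], dtype="float32")
    x = rand.poisson(lam=lam, size=(1, 3))
    assert x.numpy().shape == (1, 3, 2, 2)

    # For a scalar lam, the sample mean approaches lam as the size grows.
    y = rand.poisson(lam=4.0, size=(10000,))
    print(y.numpy().mean())  # expected to be roughly 4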

def permutation(self, n: int, *, dtype: str = "int32"):
r"""
Generates a random permutation of integers from :math:`0` to :math:`n - 1`.
r"""Generates a random permutation of integers from :math:`0` to :math:`n - 1`.

Args:
    n: the upper bound. Must be larger than 0.
    dtype: the output data type. int32, int16 and float32 are supported. Default: int32

Returns:
    the output tensor.

Examples:

    .. testcode::

        import megengine as mge
        import megengine.random as rand

        x = rand.permutation(n=10, dtype="int32")
        print(x.numpy())

        x = rand.permutation(n=10, dtype="float32")
        print(x.numpy())

    Outputs:

    .. testoutput::
        :options: +SKIP

        [4 5 0 7 3 8 6 1 9 2]
        [3. 4. 9. 0. 6. 8. 7. 1. 5. 2.]
"""
_seed = self._seed() if callable(self._seed) else self._seed
return _permutation(
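A common use of `permutation` is to shuffle a dataset by indexing it with the sampled order; since the output contains every integer from 0 to n - 1 exactly once, sorting it recovers the identity sequence. A minimal sketch, not part of the diff; the `data` array below is only illustrative:

.. code-block:: python

    import numpy as np
    import megengine.random as rand

    # Every integer 0..9 appears exactly once in the permutation.
    perm = rand.permutation(n=10, dtype="int32").numpy()
    assert sorted(perm.tolist()) == list(range(10))

    # Shuffle rows of a (hypothetical) dataset with the random order.
    data = np.arange(50).reshape(10, 5)
    shuffled = data[perm]  # rows reordered according to the permutation
    print(shuffled[:2])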

